From f3411209c9306b00afbee99a4b99c3c35074a997 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 5 Nov 2017 15:27:55 -0800 Subject: [PATCH 001/245] fix bug when iterations are odd --- RUST/transpose.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/RUST/transpose.rs b/RUST/transpose.rs index a315a84ef..5d1ba1e87 100644 --- a/RUST/transpose.rs +++ b/RUST/transpose.rs @@ -154,19 +154,19 @@ fn main() let t1 = timer.elapsed(); let dt = (t1.checked_sub(t0)).unwrap(); let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; - let transpose_time : f64 = dtt as f64 / 1.0e9_f64 as f64; + let transpose_time : f64 = dtt as f64 * 1.0e-9; ////////////////////////////////////////////////////////////////////// /// Analyze and output results ////////////////////////////////////////////////////////////////////// - let addit : usize = (iterations as usize + 1) * (iterations as usize / 2); + let addit : usize = ((iterations as usize + 1) * (iterations as usize)) / 2; let mut abserr : f64 = 0.0; for i in 0..order { for j in 0..order { let ij = i*order+j; let ji = j*order+i; - let reference : f64 = (ij*(1+iterations as usize)+addit) as f64; + let reference : f64 = (ij*(iterations as usize + 1)+addit) as f64; abserr += (b[ji] - reference).abs(); } } From 4c8025dc0ff757630621b4a6d34843bcf69f0fce Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 3 Jan 2018 12:23:30 -0800 Subject: [PATCH 002/245] initial attempt at OCCA port --- Cxx11/nstream-occa.cc | 178 ++++++++++++++++++++++++++++++++++++++++++ Cxx11/nstream.okl | 7 ++ 2 files changed, 185 insertions(+) create mode 100644 Cxx11/nstream-occa.cc create mode 100644 Cxx11/nstream.okl diff --git a/Cxx11/nstream-occa.cc b/Cxx11/nstream-occa.cc new file mode 100644 index 000000000..fb85c7b91 --- /dev/null +++ b/Cxx11/nstream-occa.cc @@ -0,0 +1,178 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, 
are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. 
+/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OCCA STREAM triad: A = B + scalar * C" << std::endl; + + occa::device device("mode: 'Serial'"); + //occa::device device("mode: 'OpenMP'"); + //occa::device device("mode: 'OpenCL'"); + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> []"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto nstream_time = 0.0; + + std::vector h_A; + std::vector h_B; + std::vector h_C; + h_A.resize(length,0.0); + h_B.resize(length,2.0); + h_C.resize(length,2.0); + + // hard-coded in nstream.okl + const double scalar(3); + + occa::memory d_A = device.malloc(length * sizeof(float), h_A); + occa::memory d_B = device.malloc(length * sizeof(float), h_B); + occa::memory d_C = device.malloc(length * sizeof(float), h_C); + + occa::kernel nstream = device.buildKernel("nstream.okl", "nstream"); + { + for (auto iter = 0; iter<=iterations; iter++) { + if (iter==1) nstream_time = prk::wtime(); + nstream(length, d_A, d_B, d_C); + device.finish(); + } + nstream_time = prk::wtime() - nstream_time; + } + occa::memcpy(h_C, d_C); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + double ref(0); + for (auto i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (auto i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double 
nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/nstream.okl b/Cxx11/nstream.okl new file mode 100644 index 000000000..1539b7a5c --- /dev/null +++ b/Cxx11/nstream.okl @@ -0,0 +1,7 @@ +@kernel void nstream(int N, double * A, const double * B, const double * C) { + for (int group = 0; group < N; group += 64; outer) { + for (int i = group; i < (group + 64); ++i; inner) { + A[i] += B[i] + 3 * C[i]; + } + } +} From 61882b9965694952628947326ef4884bde90ea1f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 3 Jan 2018 16:25:31 -0800 Subject: [PATCH 003/245] OCCA nstream working --- Cxx11/Makefile | 13 +++++++++- Cxx11/nstream-occa.cc | 55 ++++++++++++++++++++++++++++--------------- Cxx11/nstream.okl | 6 +++-- Cxx11/prk_util.h | 4 ++++ 4 files changed, 56 insertions(+), 22 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 86ccdfb6c..e4345d87b 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -39,7 +39,7 @@ OPENCLFLAGS = $(OPENCLFLAG) # We do not yet handle all possible exceptions... 
#OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS ORNLACCFLAGS = $(ORNLACCFLAG) -TBBFLAGS = $(TBBFLAG) -DUSE_TBB -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER) +TBBFLAGS = $(TBBFLAG) -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER) CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG) BOOSTFLAGS = $(BOOSTFLAG) STLFLAGS = $(STLFLAG) $(BOOSTFLAGS) @@ -47,6 +47,11 @@ PSTLFLAGS = $(PSTLFLAG) $(BOOSTFLAGS) RAJAFLAGS = $(RAJAFLAG) KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(BOOSTFLAGS) +ifdef OCCADIR + include ${OCCADIR}/scripts/makefile +endif +OCCAFLAGS = -DUSE_OCCA -I${OCCADIR}/include -Wl,-rpath=${OCCADIR}/lib -L${OCCADIR}/lib -locca + .PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl rangefor kokkos raja cuda EXTRA= @@ -104,6 +109,8 @@ raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-r cuda: transpose-cuda transpose-cublas nstream-vector-cuda +occa: nstream-occa + p2p-innerloop-vector: p2p-innerloop-vector-openmp.cc prk_util.h $(CXX) $(CXXFLAGS) $< -o $@ @@ -152,6 +159,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h %-cblas: %-cblas.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(CBLASFLAGS) -o $@ +%-occa: %-occa.cc prk_util.h + $(CXX) $(CXXFLAGS) $< $(OCCAFLAGS) -o $@ + %: %.cc prk_util.h $(CXX) $(CXXFLAGS) $< -o $@ @@ -183,6 +193,7 @@ clean: -rm -f *-cuda -rm -f *-cublas -rm -f *-cblas + -rm -f *-occa -rm -f transpose-vector-async transpose-vector-thread cleancl: diff --git a/Cxx11/nstream-occa.cc b/Cxx11/nstream-occa.cc index fb85c7b91..4e09578fd 100644 --- a/Cxx11/nstream-occa.cc +++ b/Cxx11/nstream-occa.cc @@ -68,16 +68,16 @@ int main(int argc, char * argv[]) std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; std::cout << "C++11/OCCA STREAM triad: A = B + scalar * C" << std::endl; - occa::device device("mode: 'Serial'"); - //occa::device device("mode: 'OpenMP'"); - //occa::device device("mode: 'OpenCL'"); + occa::device device("mode = Serial"); + //occa::device device("mode = 
OpenMP"); + //occa::device device("mode = OpenCL, platformID = 0, deviceID = 0"); ////////////////////////////////////////////////////////////////////// /// Read and test input parameters ////////////////////////////////////////////////////////////////////// int iterations, offset; - size_t length; + int length; try { if (argc < 3) { throw "Usage: <# iterations> []"; @@ -88,7 +88,7 @@ int main(int argc, char * argv[]) throw "ERROR: iterations must be >= 1"; } - length = std::atol(argv[2]); + length = std::atoi(argv[2]); if (length <= 0) { throw "ERROR: vector length must be positive"; } @@ -113,30 +113,43 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector h_A; - std::vector h_B; - std::vector h_C; - h_A.resize(length,0.0); - h_B.resize(length,2.0); - h_C.resize(length,2.0); + double * h_A = new double[length]; + double * h_B = new double[length]; + double * h_C = new double[length]; + for (size_t i=0; i epsilon) { std::cout << "Failed Validation on output array\n" diff --git a/Cxx11/nstream.okl b/Cxx11/nstream.okl index 1539b7a5c..fa561facf 100644 --- a/Cxx11/nstream.okl +++ b/Cxx11/nstream.okl @@ -1,7 +1,9 @@ -@kernel void nstream(int N, double * A, const double * B, const double * C) { +@kernel void nstream(int N, double scalar, double * A, const double * B, const double * C) { for (int group = 0; group < N; group += 64; outer) { for (int i = group; i < (group + 64); ++i; inner) { - A[i] += B[i] + 3 * C[i]; + if (i Date: Wed, 3 Jan 2018 16:27:12 -0800 Subject: [PATCH 004/245] add OCCA to make.defs examples --- common/make.defs.gcc | 6 +++++- common/make.defs.intel | 6 +++++- common/make.defs.llvm | 6 +++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 174da2362..e4ccf911f 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -35,6 +35,10 @@ OPENCLFLAG=-framework OpenCL #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} 
-L${OPENCLDIR}/lib64 -lOpenCL # +# OCCA +# +OCCADIR=${HOME}/prk-repo/Cxx11/occa +# # Cilk # CILKFLAG=-fcilkplus @@ -42,7 +46,7 @@ CILKFLAG=-fcilkplus # TBB # TBBDIR=/usr/local/Cellar/tbb/2018_U1 -TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +TBBFLAG=-USE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. # diff --git a/common/make.defs.intel b/common/make.defs.intel index 3157acead..d919113b5 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -35,13 +35,17 @@ OFFLOADFLAG=-qopenmp-offload=host OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL # +# OCCA +# +OCCADIR=${HOME}/prk-repo/Cxx11/occa +# # Cilk # CILKFLAG=-intel-extensions # default # # TBB # -TBBFLAG=-tbb +TBBFLAG=-USE_TBB -tbb # # Parallel STL, Boost, etc. # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index d8357cd6b..133967dc7 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -54,10 +54,14 @@ OPENCLFLAG=-framework OpenCL #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL # +# +# OCCA +# +OCCADIR=${HOME}/prk-repo/Cxx11/occa # TBB # TBBDIR=/usr/local/Cellar/tbb/2018_U1 -TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +TBBFLAG=-USE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. 
# From cb4a3e005c887c9694c64f02ce508dee38aa67c2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 3 Jan 2018 21:10:56 -0800 Subject: [PATCH 005/245] make OCCA nicer --- Cxx11/Makefile | 8 ++++---- Cxx11/nstream-occa.cc | 15 ++++++++++++--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index e4345d87b..f37c830ce 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -20,15 +20,12 @@ ifeq ($(USE_PRK_TBB_PARTITIONER),simple) PRK_TBB_PARTITIONER=3 endif ifndef PRK_TBB_PARTITIONER - $(info PRK help: Consider setting USE_PRK_TBB_PARTITIONER={static,affinity,simple} when invoking make) PRK_TBB_PARTITIONER=0 endif # Valid choices are OpenMP, Threads, Serial, Cuda ifdef USE_PRK_KOKKOS_BACKEND KOKKOS_BACKEND_FLAG = -DPRK_KOKKOS_BACKEND=$(USE_PRK_KOKKOS_BACKEND) -else - $(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use OpenMP) endif ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm @@ -63,7 +60,7 @@ else EXTRA += target endif -all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl $(EXTRA) +all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl occa $(EXTRA) p2p: p2p-vector p2p-doacross-vector-openmp p2p-innerloop-vector-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl @@ -133,6 +130,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(OMPFLAGS) -o $@ %-tbb: %-tbb.cc prk_util.h + $(info PRK help: Consider setting USE_PRK_TBB_PARTITIONER={static,affinity,simple} when invoking make) $(CXX) $(CXXFLAGS) $< $(TBBFLAGS) -o $@ %-stl: %-pstl.cc prk_util.h @@ -148,6 +146,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ %-kokkos: %-kokkos.cc prk_util.h + $(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use 
OpenMP) $(CXX) $(CXXFLAGS) $< $(KOKKOSFLAGS) -o $@ %-cuda: %-cuda.cu prk_util.h prk_cuda.h @@ -160,6 +159,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(CBLASFLAGS) -o $@ %-occa: %-occa.cc prk_util.h + $(info PRK help: Set OCCA_CXX=$(firstword $(CXX)) to use that compiler for OKL files.) $(CXX) $(CXXFLAGS) $< $(OCCAFLAGS) -o $@ %: %.cc prk_util.h diff --git a/Cxx11/nstream-occa.cc b/Cxx11/nstream-occa.cc index 4e09578fd..10b0b47fd 100644 --- a/Cxx11/nstream-occa.cc +++ b/Cxx11/nstream-occa.cc @@ -68,9 +68,17 @@ int main(int argc, char * argv[]) std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; std::cout << "C++11/OCCA STREAM triad: A = B + scalar * C" << std::endl; - occa::device device("mode = Serial"); - //occa::device device("mode = OpenMP"); - //occa::device device("mode = OpenCL, platformID = 0, deviceID = 0"); + char* dc = std::getenv("OCCA_DEVICE"); + if (dc==NULL) { + std::cout << "By default, OCCA executes in serial.\n"; + std::cout << "Set OCCA_DEVICE as follows for parallel execution\n"; + std::cout << " OCCA_DEVICE=\"mode = OpenMP\"\n"; + std::cout << " OCCA_DEVICE=\"mode = OpenCL, platformID = 0, deviceID = 0\" (CPU)\n"; + std::cout << " OCCA_DEVICE=\"mode = OpenCL, platformID = 1, deviceID = 0\" (GPU)\n"; + std::cout << " OCCA_DEVICE=\"mode = CUDA', deviceID = 0\"\n"; + } + std::string ds = (dc==NULL) ? 
"mode = Serial" : dc; + occa::device device(ds); ////////////////////////////////////////////////////////////////////// /// Read and test input parameters @@ -106,6 +114,7 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Vector length = " << length << std::endl; std::cout << "Offset = " << offset << std::endl; + std::cout << "OCCA mode = " << "\"" << ds << "\"" << std::endl; ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation From 04307d75b7822c5baaca6939a2213d253b5eca27 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 3 Jan 2018 21:26:41 -0800 Subject: [PATCH 006/245] s/pragma simd/pragma vector/ for ICC [ci skip] --- C1z/prk_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/C1z/prk_util.h b/C1z/prk_util.h index 0777d4863..24b94f9c7 100644 --- a/C1z/prk_util.h +++ b/C1z/prk_util.h @@ -113,7 +113,7 @@ int __cilkrts_get_nworkers(void); #endif #if defined(__INTEL_COMPILER) -# define PRAGMA_SIMD PRAGMA(simd) +# define PRAGMA_SIMD PRAGMA(vector) #elif defined(__GNUC__) && defined(__GNUC_MINOR__) && ( ( (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) ) || (__GNUC__ >= 5) ) # define PRAGMA_SIMD PRAGMA(GCC ivdep) #elif defined(__clang__) From 26c4b40d7a39174a2a95b85cf962935067bd681c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 4 Jan 2018 15:07:56 -0800 Subject: [PATCH 007/245] cleanup source --- Cxx11/transpose-openmp-target.cc | 8 +++----- Cxx11/transpose-vector.cc | 2 -- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/Cxx11/transpose-openmp-target.cc b/Cxx11/transpose-openmp-target.cc index c5b707ce5..a611997f5 100644 --- a/Cxx11/transpose-openmp-target.cc +++ b/Cxx11/transpose-openmp-target.cc @@ -71,14 +71,12 @@ int main(int argc, char * argv[]) throw "Usage: <# iterations> [tile size]"; } - // number of times to do the transpose iterations = std::atoi(argv[1]); if (iterations < 1) { throw 
"ERROR: iterations must be >= 1"; } - // order of a the matrix - order = std::atol(argv[2]); + order = std::atoi(argv[2]); if (order <= 0) { throw "ERROR: Matrix Order must be greater than 0"; } else if (order > std::floor(std::sqrt(INT_MAX))) { @@ -86,7 +84,7 @@ int main(int argc, char * argv[]) } // default tile size for tiling of local transpose - tile_size = (argc>3) ? std::atol(argv[3]) : 32; + tile_size = (argc>3) ? std::atoi(argv[3]) : 32; // a negative tile size means no tiling of the local transpose if (tile_size <= 0) tile_size = order; } @@ -101,7 +99,7 @@ int main(int argc, char * argv[]) std::cout << "Tile size = " << tile_size << std::endl; ////////////////////////////////////////////////////////////////////// - /// Allocate space for the input and transpose matrix + // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// auto trans_time = 0.0; diff --git a/Cxx11/transpose-vector.cc b/Cxx11/transpose-vector.cc index 6dcf5dbe8..c6199ff40 100644 --- a/Cxx11/transpose-vector.cc +++ b/Cxx11/transpose-vector.cc @@ -71,13 +71,11 @@ int main(int argc, char * argv[]) throw "Usage: <# iterations> [tile size]"; } - // number of times to do the transpose iterations = std::atoi(argv[1]); if (iterations < 1) { throw "ERROR: iterations must be >= 1"; } - // order of a the matrix order = std::atoi(argv[2]); if (order <= 0) { throw "ERROR: Matrix Order must be greater than 0"; From d82afcace9d33e080f1d12837851121462e4ca95 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 4 Jan 2018 15:08:18 -0800 Subject: [PATCH 008/245] OCCA transpose --- Cxx11/Makefile | 2 +- Cxx11/transpose-occa.cc | 189 ++++++++++++++++++++++++++++++++++++++++ Cxx11/transpose.okl | 11 +++ 3 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 Cxx11/transpose-occa.cc create mode 100644 Cxx11/transpose.okl diff --git a/Cxx11/Makefile b/Cxx11/Makefile index f37c830ce..f91ade18b 100644 --- a/Cxx11/Makefile +++ 
b/Cxx11/Makefile @@ -106,7 +106,7 @@ raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-r cuda: transpose-cuda transpose-cublas nstream-vector-cuda -occa: nstream-occa +occa: transpose-occa nstream-occa p2p-innerloop-vector: p2p-innerloop-vector-openmp.cc prk_util.h $(CXX) $(CXXFLAGS) $< -o $@ diff --git a/Cxx11/transpose-occa.cc b/Cxx11/transpose-occa.cc new file mode 100644 index 000000000..5b05b73ce --- /dev/null +++ b/Cxx11/transpose-occa.cc @@ -0,0 +1,189 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// Converted to C++11 by Jeff Hammond, January 2018. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/OCCA Matrix transpose: B = A^T" << std::endl; + + char* dc = std::getenv("OCCA_DEVICE"); + if (dc==NULL) { + std::cout << "By default, OCCA executes in serial.\n"; + std::cout << "Set OCCA_DEVICE as follows for parallel execution\n"; + std::cout << " OCCA_DEVICE=\"mode = OpenMP\"\n"; + std::cout << " OCCA_DEVICE=\"mode = OpenCL, platformID = 0, deviceID = 0\" (CPU)\n"; + std::cout << " OCCA_DEVICE=\"mode = OpenCL, platformID = 1, deviceID = 0\" (GPU)\n"; + std::cout << " OCCA_DEVICE=\"mode = CUDA', deviceID = 0\"\n"; + } + std::string ds = (dc==NULL) ? "mode = Serial" : dc; + occa::device device(ds); + + ////////////////////////////////////////////////////////////////////// + // Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + int tile_size; + try { + if (argc < 3) { + throw "Usage: <# iterations> [tile size]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = (argc>3) ? 
std::atoi(argv[3]) : 32; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "OCCA mode = " << "\"" << ds << "\"" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto trans_time = 0.0; + + double * h_A = new double[order*order]; + double * h_B = new double[order*order]; + for (auto i=0;i(i*order+j); + h_B[i*order+j] = 0.0; + } + } + + occa::memory d_A = device.malloc(order * order * sizeof(double), h_A); + occa::memory d_B = device.malloc(order * order * sizeof(double), h_B); + + d_A.copyFrom(h_A); + d_B.copyFrom(h_B); + + occa::kernel transpose = device.buildKernel("transpose.okl", "transpose"); + + { + for (auto iter = 0; iter<=iterations; iter++) { + if (iter==1) trans_time = prk::wtime(); + transpose(order, d_A, d_B); + device.finish(); + } + trans_time = prk::wtime() - trans_time; + } + + d_B.copyTo(h_B); + + d_A.free(); + d_B.free(); + transpose.free(); + device.free(); + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const auto addit = (iterations+1.) 
* (iterations/2.); + auto abserr = 0.0; + for (auto j=0; j(ij)*(1.+iterations)+addit; + abserr += std::fabs(h_B[ji] - reference); + } + } + + delete[] h_A; + delete[] h_B; + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const auto epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + auto bytes = order * order * sizeof(double); + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + + return 0; +} + + diff --git a/Cxx11/transpose.okl b/Cxx11/transpose.okl new file mode 100644 index 000000000..1c8d651dc --- /dev/null +++ b/Cxx11/transpose.okl @@ -0,0 +1,11 @@ +@kernel void transpose(int N, double * A, double * B) +{ + for(int j = 0; j < N; ++j; outer) { + for(int i = 0; i < N; ++i; inner) { + if ((i Date: Thu, 4 Jan 2018 16:03:46 -0800 Subject: [PATCH 009/245] fix typo in -DUSE_TBB --- common/make.defs.gcc | 2 +- common/make.defs.intel | 2 +- common/make.defs.llvm | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index e4ccf911f..5e7d18986 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -46,7 +46,7 @@ CILKFLAG=-fcilkplus # TBB # TBBDIR=/usr/local/Cellar/tbb/2018_U1 -TBBFLAG=-USE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. # diff --git a/common/make.defs.intel b/common/make.defs.intel index d919113b5..0c21efc09 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -45,7 +45,7 @@ CILKFLAG=-intel-extensions # default # # TBB # -TBBFLAG=-USE_TBB -tbb +TBBFLAG=-DUSE_TBB -tbb # # Parallel STL, Boost, etc. 
# diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 133967dc7..b4837451d 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -61,7 +61,7 @@ OCCADIR=${HOME}/prk-repo/Cxx11/occa # TBB # TBBDIR=/usr/local/Cellar/tbb/2018_U1 -TBBFLAG=-USE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. # From e604a1384ba526a6fb08ae04b0b04eaf0add5e3e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 9 Jan 2018 15:52:07 -0800 Subject: [PATCH 010/245] fix Travis - USE_TBB flag --- travis/build-run-prk.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 74077e0a7..0a7be419c 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -453,12 +453,12 @@ case "$PRK_TARGET" in Linux) ${CC} --version export TBBFLAG="-I${TBBROOT}/include -L${TBBROOT}/lib/intel64/gcc4.7 -ltbb" - echo "TBBFLAG=${TBBFLAG}" >> common/make.defs + echo "TBBFLAG=-DUSE_TBB ${TBBFLAG}" >> common/make.defs export LD_LIBRARY_PATH=${TBBROOT}/lib/intel64/gcc4.7:${LD_LIBRARY_PATH} ;; Darwin) export TBBFLAG="-I${TBBROOT}/include -L${TBBROOT}/lib -ltbb" - echo "TBBFLAG=${TBBFLAG}" >> common/make.defs + echo "TBBFLAG=-DUSE_TBB ${TBBFLAG}" >> common/make.defs export LD_LIBRARY_PATH=${TBBROOT}/lib:${LD_LIBRARY_PATH} ;; esac From 51599c33e78f391bab4a99e448c081d44971a65a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 10 Jan 2018 09:04:30 -0800 Subject: [PATCH 011/245] add OCCA to Travis --- travis/build-run-prk.sh | 7 ++++++ travis/install-deps.sh | 1 + travis/install-occa.sh | 51 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) create mode 100644 travis/install-occa.sh diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 0a7be419c..b76414933 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -573,6 +573,13 @@ case "$PRK_TARGET" in 
$PRK_TARGET_PATH/stencil-kokkos 10 200 20 $s $r done done + + # C++ w/ OCCA + echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs + export OCCA_CXX=${PRK_CXX} + make -C $PRK_TARGET_PATH transpose-occa nstream-occa + $PRK_TARGET_PATH/transpose-occa 10 1024 32 + $PRK_TARGET_PATH/nstream-occa 10 16777216 32 ;; allfortran) echo "Fortran" diff --git a/travis/install-deps.sh b/travis/install-deps.sh index d844e1a8f..35e926c76 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -69,6 +69,7 @@ case "$PRK_TARGET" in sh ./travis/install-cmake.sh $TRAVIS_ROOT sh ./travis/install-raja.sh $TRAVIS_ROOT sh ./travis/install-kokkos.sh $TRAVIS_ROOT + sh ./travis/install-occa.sh $TRAVIS_ROOT ;; allfortran) echo "Fortran" diff --git a/travis/install-occa.sh b/travis/install-occa.sh new file mode 100644 index 000000000..3152577e3 --- /dev/null +++ b/travis/install-occa.sh @@ -0,0 +1,51 @@ +#!/bin/sh + +set -e +set -x + +TRAVIS_ROOT="$1" + +case $CXX in + g++) + for major in "-9" "-8" "-7" "-6" "-5" "" ; do + if [ -f "`which ${CXX}${major}`" ]; then + export PRK_CXX="${CXX}${major}" + export PRK_CC="${CC}${major}" + echo "Found C++: $PRK_CXX" + break + fi + done + if [ "x$PRK_CXX" = "x" ] ; then + export PRK_CXX="${CXX}" + export PRK_CC="${CC}" + fi + ;; + clang++) + for version in "-7" "-6" "-5" "-4" "-3.9" "-3.8" "-3.7" "-3.6" "" ; do + if [ -f "`which ${CXX}${version}`" ]; then + export PRK_CXX="${CXX}${version}" + export PRK_CC="${CC}${version}" + echo "Found C++: $PRK_CXX" + break + fi + done + if [ "x$PRK_CXX" = "x" ] ; then + export PRK_CXX="${CXX}" + export PRK_CC="${CC}" + fi + ;; +esac +${PRK_CXX} -v + +if [ ! -d "$TRAVIS_ROOT/occa" ]; then + pushd + cd $TRAVIS_ROOT + BRANCH=develop + git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git + cd occa + CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile + popd +else + echo "OCCA installed..." 
+ find $TRAVIS_ROOT/occa -name occa.hpp +fi From 4aba625eb8dac2445137409f4c2669ab45da4d39 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 10 Jan 2018 11:29:14 -0800 Subject: [PATCH 012/245] simplify OCCA build --- travis/install-occa.sh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/travis/install-occa.sh b/travis/install-occa.sh index 3152577e3..7462719e4 100644 --- a/travis/install-occa.sh +++ b/travis/install-occa.sh @@ -38,13 +38,9 @@ esac ${PRK_CXX} -v if [ ! -d "$TRAVIS_ROOT/occa" ]; then - pushd - cd $TRAVIS_ROOT BRANCH=develop - git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git - cd occa - CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile - popd + git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git $TRAVIS_ROOT/occa + CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile -C $TRAVIS_ROOT/occa else echo "OCCA installed..." find $TRAVIS_ROOT/occa -name occa.hpp From 6384587fa5a42f802f0387042fe43a5151d1c6c7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 12 Jan 2018 14:04:49 -0800 Subject: [PATCH 013/245] use size_t instead of int --- Cxx11/nstream-vector-pstl.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc index e1fb1ce05..ae67ea494 100644 --- a/Cxx11/nstream-vector-pstl.cc +++ b/Cxx11/nstream-vector-pstl.cc @@ -132,14 +132,15 @@ int main(int argc, char * argv[]) std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (int i) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) - __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) { + __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (size_t i) { #else - std::for_each( std::begin(range), std::end(range), [&] (int i) 
{ + std::for_each( std::begin(range), std::end(range), [&] (size_t i) { #endif - A[i] = 0.0; - B[i] = 2.0; - C[i] = 2.0; - }); + A[i] = 0; + B[i] = 2; + C[i] = 2; + }); + for (auto iter = 0; iter<=iterations; iter++) { if (iter==1) nstream_time = prk::wtime(); @@ -148,9 +149,9 @@ int main(int argc, char * argv[]) std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (int i) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) - __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) { + __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (size_t i) { #else - std::for_each( std::begin(range), std::end(range), [&] (int i) { + std::for_each( std::begin(range), std::end(range), [&] (size_t i) { #endif A[i] += B[i] + scalar * C[i]; }); From 3b4f5e6bcb034a9034825f1817e938a8bbb29ba6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 12 Jan 2018 14:09:15 -0800 Subject: [PATCH 014/245] add SYCL to examples --- common/make.defs.gcc | 2 ++ common/make.defs.intel | 2 ++ common/make.defs.llvm | 2 ++ 3 files changed, 6 insertions(+) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 4073b48a9..24da56216 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -35,6 +35,8 @@ OPENCLFLAG=-framework OpenCL # Linux #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
+SYCLDIR=./triSYCL # # Cilk # diff --git a/common/make.defs.intel b/common/make.defs.intel index 4fec2a33a..5e2eea3ea 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -35,6 +35,8 @@ OFFLOADFLAG=-qopenmp-offload=host # Linux OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... +SYCLDIR=./triSYCL # # Cilk # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index d8357cd6b..f029e1ceb 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -53,6 +53,8 @@ OPENCLFLAG=-framework OpenCL # Linux #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... +SYCLDIR=./triSYCL # # TBB # From 010e11feace5914debecf319f025bf0d99c89b1a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 12 Jan 2018 14:10:07 -0800 Subject: [PATCH 015/245] add SYCL nstream --- Cxx11/Makefile | 9 +- Cxx11/nstream-sycl.cc | 204 ++++++++++++++++++++++++++++++++++++++++++ Cxx11/prk_util.h | 4 + 3 files changed, 216 insertions(+), 1 deletion(-) create mode 100644 Cxx11/nstream-sycl.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 093702a48..04e7bae91 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -38,6 +38,7 @@ TARGETFLAGS = $(OFFLOADFLAG) OPENCLFLAGS = $(OPENCLFLAG) # We do not yet handle all possible exceptions... 
#OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS +SYCLFLAGS = -I$(SYCLDIR)/include -DUSE_SYCL $(BOOSTFLAG) ORNLACCFLAGS = $(ORNLACCFLAG) TBBFLAGS = $(TBBFLAG) -DUSE_TBB -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER) CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG) @@ -58,7 +59,7 @@ else EXTRA += target endif -all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl $(EXTRA) +all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl $(EXTRA) p2p: p2p-vector p2p-doacross-vector-openmp p2p-innerloop-vector-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl @@ -90,6 +91,8 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl +sycl: nstream-sycl + tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb stl: stencil-vector-stl transpose-vector-stl nstream-vector-stl @@ -118,6 +121,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h %-opencl: %-opencl.cc prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ +%-sycl: %-sycl.cc prk_util.h + $(CXX) $(CXXFLAGS) $< $(SYCLFLAGS) -o $@ + %-target: %-target.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(OMPFLAGS) $(TARGETFLAGS) -o $@ @@ -176,6 +182,7 @@ clean: -rm -f *-target -rm -f *-taskloop -rm -f *-opencl + -rm -f *-sycl -rm -f *-tbb -rm -f *-stl -rm -f *-pstl diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc new file mode 100644 index 000000000..cf9f49373 --- /dev/null +++ b/Cxx11/nstream-sycl.cc @@ -0,0 +1,204 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the 
following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. 
+/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +// See ParallelSTL.md for important information. + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + // SYCL device queue + cl::sycl::queue q; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto nstream_time = 0.0; + + std::vector h_A; + std::vector h_B; + std::vector h_C; + h_A.resize(length); + h_B.resize(length); + h_C.resize(length); + + auto range = boost::irange(static_cast(0), length); + + const double scalar(3); + + std::for_each( std::begin(range), std::end(range), [&] (size_t i) { + h_A[i] = 0; + h_B[i] = 2; + h_C[i] = 2; + }); + + { + // initialize device buffers from host buffers + cl::sycl::buffer d_A { std::begin(h_A), std::end(h_A) }; + cl::sycl::buffer d_B { std::begin(h_B), std::end(h_B) }; + cl::sycl::buffer d_C { std::begin(h_C), std::end(h_C) }; + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) nstream_time = prk::wtime(); + + q.submit([&](cl::sycl::handler& h) { + + // accessor methods + auto A = d_A.get_access(h); + auto B = d_B.get_access(h); + auto C = d_C.get_access(h); + + h.parallel_for(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { + A[i] += B[i] + scalar * C[i]; + }); + }); + q.wait(); + } + + d_A.mark_as_written(); + // Stop timer before buffer+accessor destructors fire, + // since that will move data, and we do not time that + // for other device-oriented programming models. 
+ nstream_time = prk::wtime() - nstream_time; + + d_A.set_final_data( h_A.begin() ); + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (auto i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 9d42f3c1d..5804a1bf6 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -212,6 +212,10 @@ const T prk_reduce(I first, I last, T init) { # include "RAJA/RAJA.hpp" #endif +#ifdef USE_SYCL +# include "CL/sycl.hpp" +#endif + #define RESTRICT __restrict__ namespace prk { From 2e4dc458713e4e98598e7d3c5cf9b619daa7e1e5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 12 Jan 2018 15:00:27 -0800 Subject: [PATCH 016/245] fix a bunch of issues in SYCL nstream - tested against ComputeCpp and triSYCL now. - eliminate mark_as_written and set_final_data, which were not present in ComputeCpp (only triSYCL). still figuring out what the right API is... - use better device buffer constructor, i.e. 
one that is correct all of the time :-) --- Cxx11/Makefile | 4 ++-- Cxx11/nstream-sycl.cc | 9 +++------ 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 04e7bae91..5846d2693 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -38,7 +38,7 @@ TARGETFLAGS = $(OFFLOADFLAG) OPENCLFLAGS = $(OPENCLFLAG) # We do not yet handle all possible exceptions... #OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS -SYCLFLAGS = -I$(SYCLDIR)/include -DUSE_SYCL $(BOOSTFLAG) +SYCLFLAGS = $(SYCLFLAG) $(BOOSTFLAG) ORNLACCFLAGS = $(ORNLACCFLAG) TBBFLAGS = $(TBBFLAG) -DUSE_TBB -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER) CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG) @@ -122,7 +122,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ %-sycl: %-sycl.cc prk_util.h - $(CXX) $(CXXFLAGS) $< $(SYCLFLAGS) -o $@ + $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@ %-target: %-target.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(OMPFLAGS) $(TARGETFLAGS) -o $@ diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index cf9f49373..c2684a023 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -134,9 +134,9 @@ int main(int argc, char * argv[]) { // initialize device buffers from host buffers - cl::sycl::buffer d_A { std::begin(h_A), std::end(h_A) }; - cl::sycl::buffer d_B { std::begin(h_B), std::end(h_B) }; - cl::sycl::buffer d_C { std::begin(h_C), std::end(h_C) }; + cl::sycl::buffer d_A { h_A.data(), h_A.size() }; + cl::sycl::buffer d_B { h_B.data(), h_B.size() }; + cl::sycl::buffer d_C { h_C.data(), h_C.size() }; for (auto iter = 0; iter<=iterations; iter++) { @@ -156,13 +156,10 @@ int main(int argc, char * argv[]) q.wait(); } - d_A.mark_as_written(); // Stop timer before buffer+accessor destructors fire, // since that will move data, and we do not time that // for other device-oriented programming models. 
nstream_time = prk::wtime() - nstream_time; - - d_A.set_final_data( h_A.begin() ); } ////////////////////////////////////////////////////////////////////// From c8ee43331164c376e2e7ccc0ad9c47d8a36dc529 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 12 Jan 2018 16:34:14 -0800 Subject: [PATCH 017/245] name kernel; cleanup --- Cxx11/nstream-sycl.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index c2684a023..306dc7038 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -64,8 +64,6 @@ #include "prk_util.h" -// See ParallelSTL.md for important information. - int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; @@ -149,7 +147,7 @@ int main(int argc, char * argv[]) auto B = d_B.get_access(h); auto C = d_C.get_access(h); - h.parallel_for(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { + h.parallel_for(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { A[i] += B[i] + scalar * C[i]; }); }); From 954d504fda805a28576c390cbe0175076b881b0f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 12 Jan 2018 16:36:09 -0800 Subject: [PATCH 018/245] cleanup SYCL in make.defs.* examples --- common/make.defs.gcc | 6 ++++++ common/make.defs.intel | 6 ++++++ common/make.defs.llvm | 15 ++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 24da56216..f355695fe 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -35,8 +35,14 @@ OPENCLFLAG=-framework OpenCL # Linux #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +# +# SYCL flags +# +# triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
SYCLDIR=./triSYCL +SYCLCXX=${CXX} +SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include # # Cilk # diff --git a/common/make.defs.intel b/common/make.defs.intel index 5e2eea3ea..a31e8a6ae 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -35,8 +35,14 @@ OFFLOADFLAG=-qopenmp-offload=host # Linux OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +# +# SYCL flags +# +# triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... SYCLDIR=./triSYCL +SYCLCXX=${CXX} +SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include # # Cilk # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index f029e1ceb..ada326c58 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -53,8 +53,21 @@ OPENCLFLAG=-framework OpenCL # Linux #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +# +# SYCL flags +# +# CodePlay ComputeCpp +SYCLDIR=/opt/sycl/latest +SYCLCXX=${SYCLDIR}/bin/compute++ +SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp +# This makes a huge difference in e.g. nstream... +SYCLFLAG+=-no-serial-memop +# +# triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-SYCLDIR=./triSYCL +#SYCLDIR=./triSYCL +#SYCLCXX=${CXX} +#SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include # # TBB # From 91eb8f4ffe40eb43cd5a2db5457ff03fdd21a89e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 12 Jan 2018 16:36:25 -0800 Subject: [PATCH 019/245] add SYCL transpose --- Cxx11/Makefile | 2 +- Cxx11/transpose-sycl.cc | 186 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 Cxx11/transpose-sycl.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 5846d2693..ce8af6110 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -91,7 +91,7 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl -sycl: nstream-sycl +sycl: nstream-sycl transpose-sycl tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc new file mode 100644 index 000000000..80ebb12b0 --- /dev/null +++ b/Cxx11/transpose-sycl.cc @@ -0,0 +1,186 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t order; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + // number of times to do the transpose + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // order of a the matrix + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + // SYCL device queue + cl::sycl::queue q; + + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + std::vector h_A; + std::vector h_B; + h_A.resize(order*order); + h_B.resize(order*order,0.0); + + // fill A with the sequence 0 to order^2-1 as doubles + std::iota(h_A.begin(), h_A.end(), 0.0); + + auto range = boost::irange(static_cast(0),order); + + auto trans_time = 0.0; + + { + // initialize device buffers from host buffers +#if USE_2D_INDEXING + cl::sycl::buffer d_A( cl::sycl::range<2>{order,order} ); // FIXME: does not initialize with host array + cl::sycl::buffer d_B( cl::sycl::range<2>{order,order} ); // FIXME: does not initialize with host array 
+#else + cl::sycl::buffer d_A { h_A.data(), h_A.size() }; + cl::sycl::buffer d_B { h_B.data(), h_B.size() }; +#endif + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) trans_time = prk::wtime(); + + q.submit([&](cl::sycl::handler& h) { + + // accessor methods + auto A = d_A.get_access(h); + auto B = d_B.get_access(h); + + // transpose + h.parallel_for(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { +#if USE_2D_INDEXING +#error 2D indexing is not implemented yet. Fix this! +#else + B[it[0] * order + it[1]] += A[it[1] * order + it[0]]; + A[it[1] * order + it[0]] += 1.0; +#endif + }); + }); + q.wait(); + } + + // Stop timer before buffer+accessor destructors fire, + // since that will move data, and we do not time that + // for other device-oriented programming models. + trans_time = prk::wtime() - trans_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // TODO: replace with std::generate, std::accumulate, or similar + const auto addit = (iterations+1.) 
* (iterations/2.); + auto abserr = 0.0; + for (auto i : range) { + for (auto j : range) { + const int ij = i*order+j; + const int ji = j*order+i; + const double reference = static_cast(ij)*(1.+iterations)+addit; + abserr += std::fabs(h_B[ji] - reference); + } + } + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const auto epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + auto bytes = (size_t)order * (size_t)order * sizeof(double); + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + + return 0; +} + + From 20126c109d1270b8583bc671e3509de3a0e587fa Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 12 Jan 2018 17:50:36 -0800 Subject: [PATCH 020/245] fix occa install hopefully --- travis/install-occa.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/travis/install-occa.sh b/travis/install-occa.sh index 7462719e4..8adb47346 100644 --- a/travis/install-occa.sh +++ b/travis/install-occa.sh @@ -39,7 +39,8 @@ ${PRK_CXX} -v if [ ! -d "$TRAVIS_ROOT/occa" ]; then BRANCH=develop - git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git $TRAVIS_ROOT/occa + git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git + mv occa $TRAVIS_ROOT/occa CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile -C $TRAVIS_ROOT/occa else echo "OCCA installed..." 
From 6804a8ae610f02a4692eb5a5879f6bbd2a66f679 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 13 Jan 2018 14:24:13 -0800 Subject: [PATCH 021/245] fix Travis OCCA --- travis/install-occa.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/travis/install-occa.sh b/travis/install-occa.sh index 8adb47346..887366219 100644 --- a/travis/install-occa.sh +++ b/travis/install-occa.sh @@ -38,9 +38,8 @@ esac ${PRK_CXX} -v if [ ! -d "$TRAVIS_ROOT/occa" ]; then - BRANCH=develop - git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git - mv occa $TRAVIS_ROOT/occa + BRANCH="1.0" + git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git $TRAVIS_ROOT/occa CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile -C $TRAVIS_ROOT/occa else echo "OCCA installed..." From e1c0822f2c5e83aa9bdf73734c9165fdf3617655 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 13 Jan 2018 16:13:59 -0800 Subject: [PATCH 022/245] add stencil-sycl despite bugs stencil kernel is segfaulting due to out-of-bounds error (presumably) --- Cxx11/Makefile | 2 +- Cxx11/stencil-sycl.cc | 263 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 Cxx11/stencil-sycl.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index ce8af6110..e91dae03f 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -91,7 +91,7 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl -sycl: nstream-sycl transpose-sycl +sycl: stencil-sycl transpose-sycl nstream-sycl tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc new file mode 100644 index 000000000..b9708e0df --- /dev/null +++ b/Cxx11/stencil-sycl.cc @@ -0,0 +1,263 @@ + +/// +/// Copyright (c) 2013, Intel Corporation +/// 
+/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. 
+/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t n; + int radius = 2; + bool star = true; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: grid dimension too large - overflow risk"; + } + +#if 0 + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = 
std::string("grid"); + star = (stencil == grid) ? false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } +#endif + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + +#if 0 + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } +#endif + + // SYCL device queue + cl::sycl::queue q; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto stencil_time = 0.0; + + std::vector h_in; + std::vector h_out; + h_in.resize(n*n); + h_out.resize(n*n); + + for (auto i=0; i(i+j); + h_out[i*n+j] = 0.0; + } + } + + { + // initialize device buffers from host buffers + //cl::sycl::buffer d_in { h_in.data(), h_in.size() }; + //cl::sycl::buffer d_out { h_out.data(), h_out.size() }; + cl::sycl::buffer d_in { h_in.data() , cl::sycl::range<2> {n, n} }; + cl::sycl::buffer d_out { h_out.data(), cl::sycl::range<2> {n, n} }; + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + + q.submit([&](cl::sycl::handler& h) { + + // accessor methods + auto in = d_in.get_access(h); + auto out = 
d_out.get_access(h); + +#if 0 + // Apply the stencil operator + h.parallel_for(cl::sycl::range<2> {n-2, n-2}, cl::sycl::id<2> {2, 2}, + [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> xy = it.get_id(); + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); + cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); + out[xy] += +in[xy-dx2] * -0.125 + +in[xy-dx1] * -0.25 + +in[xy-dy2] * -0.125 + +in[xy-dx1] * -0.25 + +in[xy+dx1] * 0.25 + +in[xy+dx2] * 0.125 + +in[xy+dy1] * 0.25 + +in[xy+dx2] * 0.125; + }); +#endif + + // Add constant to solution to force refresh of neighbor data, if any + h.parallel_for(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {1, 1}, + [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> xy = it.get_id(); + out[xy] += 1.0; + }); + + }); + q.wait(); + } + stencil_time = prk::wtime() - stencil_time; + } + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. + ////////////////////////////////////////////////////////////////////// + + // interior of grid with respect to stencil + size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + + // compute L1 norm in parallel + double norm = 0.0; + for (auto i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 
4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} From d5a577ab14b0dbb15bda04857eb82e153565be37 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 13 Jan 2018 16:33:40 -0800 Subject: [PATCH 023/245] use unconstrained transpose blocked transpose by NVIDIA does not work for all dimensions so we will use a slower version that actually works for all dimensions. --- Cxx11/transpose-cuda.cu | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu index 5c710942a..ddb22ca92 100644 --- a/Cxx11/transpose-cuda.cu +++ b/Cxx11/transpose-cuda.cu @@ -56,6 +56,7 @@ #include "prk_util.h" #include "prk_cuda.h" +#if TILED // The kernel was derived from https://github.com/parallel-forall/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu, // which is the reason for the additional copyright noted above. 
@@ -73,6 +74,18 @@ __global__ void transpose(int order, prk_float * A, prk_float * B) A[(y+j)*width + x] += (prk_float)1; } } +#else +__global__ void transpose(unsigned order, prk_float * A, prk_float * B) +{ + auto i = blockIdx.x * blockDim.x + threadIdx.x; + auto j = blockIdx.y * blockDim.y + threadIdx.y; + + if ((i 1234) { + std::cout << "The results are probably going to be wrong, because order>1234.\n"; + } +#endif } catch (const char * e) { std::cout << e << std::endl; @@ -118,8 +139,13 @@ int main(int argc, char * argv[]) std::cout << "Matrix order = " << order << std::endl; std::cout << "Number of iterations = " << iterations << std::endl; +#if TILED dim3 dimGrid(order/tile_dim, order/tile_dim, 1); dim3 dimBlock(tile_dim, block_rows, 1); +#else + dim3 dimGrid(order, order, 1); + dim3 dimBlock(1, 1, 1); +#endif info.checkDims(dimBlock, dimGrid); From c25dc7281e64744423bb9bd64b8bfe1125c7a18a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 13 Jan 2018 17:00:46 -0800 Subject: [PATCH 024/245] debugged SYCL stencil --- Cxx11/stencil-sycl.cc | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index b9708e0df..4d3c83c32 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -190,33 +190,44 @@ int main(int argc, char* argv[]) auto in = d_in.get_access(h); auto out = d_out.get_access(h); -#if 0 // Apply the stencil operator - h.parallel_for(cl::sycl::range<2> {n-2, n-2}, cl::sycl::id<2> {2, 2}, + h.parallel_for(cl::sycl::range<2> {n-4, n-4}, cl::sycl::id<2> {2, 2}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); +#if 1 cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); - out[xy] += +in[xy-dx2] * -0.125 - +in[xy-dx1] * -0.25 - +in[xy-dy2] * -0.125 +#endif + //printf("%zu,%zu\n",xy[0],xy[1]); + out[xy] += 
0.0 +#if 1 +in[xy-dx1] * -0.25 +in[xy+dx1] * 0.25 - +in[xy+dx2] * 0.125 + +in[xy-dy1] * -0.25 +in[xy+dy1] * 0.25 - +in[xy+dx2] * 0.125; - }); + +in[xy-dx2] * -0.125 + +in[xy+dx2] * 0.125 + +in[xy-dy2] * -0.125 + +in[xy+dy2] * 0.125 #endif + ; + }); + }); + + q.submit([&](cl::sycl::handler& h) { + // accessor methods + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + // Add constant to solution to force refresh of neighbor data, if any - h.parallel_for(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {1, 1}, + h.parallel_for(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); - out[xy] += 1.0; + in[xy] += 1.0; }); - }); q.wait(); } From e0898dda89255081c8a8ec258a001875e5bb2a2b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 13 Jan 2018 20:25:48 -0800 Subject: [PATCH 025/245] refactor stencil-sycl --- Cxx11/stencil-sycl.cc | 93 +++++++++++++++++++++++-------------------- 1 file changed, 50 insertions(+), 43 deletions(-) diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 4d3c83c32..552e11082 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -1,6 +1,6 @@ /// -/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2017, Intel Corporation /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -61,6 +61,45 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "stencil_sycl.hpp" + +void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer d_in, cl::sycl::buffer d_out) +{ + std::cout << "You are trying to use a stencil that does not exist.\n"; + std::cout << "Please generate the new stencil using the code generator\n"; + std::cout << "and add it to the case-switch in the driver." 
<< std::endl; + std::abort(); +} + +void star2(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer d_in, + cl::sycl::buffer d_out) +{ + q.submit([&](cl::sycl::handler& h) { + + // accessor methods + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + + // Apply the stencil operator + h.parallel_for(cl::sycl::range<2> {n-4, n-4}, cl::sycl::id<2> {2, 2}, + [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> xy = it.get_id(); + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); + cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); + out[xy] += +in[xy-dx1] * -0.25 + +in[xy+dx1] * 0.25 + +in[xy-dy1] * -0.25 + +in[xy+dy1] * 0.25 + +in[xy-dx2] * -0.125 + +in[xy+dx2] * 0.125 + +in[xy-dy2] * -0.125 + +in[xy+dy2] * 0.125; + }); + }); +} int main(int argc, char* argv[]) { @@ -131,26 +170,24 @@ int main(int argc, char* argv[]) std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; std::cout << "Radius of stencil = " << radius << std::endl; -#if 0 auto stencil = nothing; if (star) { switch (radius) { - case 1: stencil = star1; break; + //case 1: stencil = star1; break; case 2: stencil = star2; break; - case 3: stencil = star3; break; - case 4: stencil = star4; break; - case 5: stencil = star5; break; + //case 3: stencil = star3; break; + //case 4: stencil = star4; break; + //case 5: stencil = star5; break; } } else { switch (radius) { - case 1: stencil = grid1; break; - case 2: stencil = grid2; break; - case 3: stencil = grid3; break; - case 4: stencil = grid4; break; - case 5: stencil = grid5; break; + //case 1: stencil = grid1; break; + //case 2: stencil = grid2; break; + //case 3: stencil = grid3; break; + //case 4: stencil = grid4; break; + //case 5: stencil = grid5; break; } } -#endif // SYCL device queue cl::sycl::queue q; @@ -184,37 +221,7 @@ int main(int argc, char* argv[]) if (iter==1) stencil_time = prk::wtime(); - q.submit([&](cl::sycl::handler& 
h) { - - // accessor methods - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); - - // Apply the stencil operator - h.parallel_for(cl::sycl::range<2> {n-4, n-4}, cl::sycl::id<2> {2, 2}, - [=] (cl::sycl::item<2> it) { - cl::sycl::id<2> xy = it.get_id(); -#if 1 - cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); - cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); -#endif - //printf("%zu,%zu\n",xy[0],xy[1]); - out[xy] += 0.0 -#if 1 - +in[xy-dx1] * -0.25 - +in[xy+dx1] * 0.25 - +in[xy-dy1] * -0.25 - +in[xy+dy1] * 0.25 - +in[xy-dx2] * -0.125 - +in[xy+dx2] * 0.125 - +in[xy-dy2] * -0.125 - +in[xy+dy2] * 0.125 -#endif - ; - }); - }); + star2(q, n, d_in, d_out); q.submit([&](cl::sycl::handler& h) { From 423edd4f3075e9f6434b17a2949121a70b324e85 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 13 Jan 2018 21:10:18 -0800 Subject: [PATCH 026/245] create and use SYCL code generator - standalone code generator that improves in some ways on the cxx one. - star2 behaves the same, other star stencils validate. - did not implement grid, because it's still buggy in other impls anyways. fix that bug (bookkeeping) first. 
--- Cxx11/generate-sycl-stencil.py | 78 +++++++++++++++++ Cxx11/stencil-sycl.cc | 16 ++-- Cxx11/stencil_sycl.hpp | 155 +++++++++++++++++++++++++++++++++ 3 files changed, 241 insertions(+), 8 deletions(-) create mode 100755 Cxx11/generate-sycl-stencil.py create mode 100644 Cxx11/stencil_sycl.hpp diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py new file mode 100755 index 000000000..8a8f44ddb --- /dev/null +++ b/Cxx11/generate-sycl-stencil.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 + +import sys +import fileinput +import string +import os + +def codegen(src,pattern,stencil_size,radius,W,model): + src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n,\n') + src.write(' cl::sycl::buffer d_in,\n') + src.write(' cl::sycl::buffer d_out) {\n') + src.write(' q.submit([&](cl::sycl::handler& h) {\n') + src.write(' auto in = d_in.get_access(h); \n') + src.write(' auto out = d_out.get_access(h);\n') + src.write(' h.parallel_for(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n') + src.write(' [=] (cl::sycl::item<2> it) {\n') + src.write(' cl::sycl::id<2> xy = it.get_id();\n') + for r in range(1,radius+1): + src.write(' cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n') + src.write(' cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n') + src.write(' out[xy] += ') + if pattern == 'star': + for i in range(1,radius+1): + if i > 1: + src.write('\n') + src.write(19*' ') + src.write('+in[xy+dx'+str(i)+'] * '+str(+1./(2.*i*radius))) + src.write('\n'+19*' ') + src.write('+in[xy+dy'+str(i)+'] * '+str(+1./(2.*i*radius))) + src.write('\n'+19*' ') + src.write('+in[xy-dx'+str(i)+'] * '+str(-1./(2.*i*radius))) + src.write('\n'+19*' ') + src.write('+in[xy-dy'+str(i)+'] * '+str(-1./(2.*i*radius))) + if i == radius: + src.write(';\n') + else: + print('grid not implemented\n') + src.write(' });\n') + src.write(' });\n') + src.write('}\n\n') + +def 
instance(src,model,pattern,r): + + W = [[0.0e0 for x in range(2*r+1)] for x in range(2*r+1)] + if pattern == 'star': + stencil_size = 4*r+1 + for i in range(1,r+1): + W[r][r+i] = +1./(2*i*r) + W[r+i][r] = +1./(2*i*r) + W[r][r-i] = -1./(2*i*r) + W[r-i][r] = -1./(2*i*r) + + else: + stencil_size = (2*r+1)**2 + for j in range(1,r+1): + for i in range(-j+1,j): + W[r+i][r+j] = +1./(4*j*(2*j-1)*r) + W[r+i][r-j] = -1./(4*j*(2*j-1)*r) + W[r+j][r+i] = +1./(4*j*(2*j-1)*r) + W[r-j][r+i] = -1./(4*j*(2*j-1)*r) + + W[r+j][r+j] = +1./(4*j*r) + W[r-j][r-j] = -1./(4*j*r) + + codegen(src,pattern,stencil_size,r,W,model) + +def main(): + for model in ['sycl']: + src = open('stencil_'+model+'.hpp','w') + #for pattern in ['star','grid']: + for pattern in ['star']: + for r in range(1,6): + instance(src,model,pattern,r) + src.close() + +if __name__ == '__main__': + main() + diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 552e11082..fcd193298 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -71,6 +71,7 @@ void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer d_ std::abort(); } +#if 0 void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer d_in, cl::sycl::buffer d_out) @@ -100,6 +101,7 @@ void star2(cl::sycl::queue & q, const size_t n, }); }); } +#endif int main(int argc, char* argv[]) { @@ -141,6 +143,7 @@ int main(int argc, char* argv[]) if (tile_size <= 0) tile_size = n; if (tile_size > n) tile_size = n; } +#endif // stencil pattern if (argc > 4) { @@ -158,7 +161,6 @@ int main(int argc, char* argv[]) if ( (radius < 1) || (2*radius+1 > n) ) { throw "ERROR: Stencil radius negative or too large"; } -#endif } catch (const char * e) { std::cout << e << std::endl; @@ -173,11 +175,11 @@ int main(int argc, char* argv[]) auto stencil = nothing; if (star) { switch (radius) { - //case 1: stencil = star1; break; + case 1: stencil = star1; break; case 2: stencil = star2; break; - //case 3: stencil = star3; break; - //case 4: stencil = star4; 
break; - //case 5: stencil = star5; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; } } else { switch (radius) { @@ -212,8 +214,6 @@ int main(int argc, char* argv[]) { // initialize device buffers from host buffers - //cl::sycl::buffer d_in { h_in.data(), h_in.size() }; - //cl::sycl::buffer d_out { h_out.data(), h_out.size() }; cl::sycl::buffer d_in { h_in.data() , cl::sycl::range<2> {n, n} }; cl::sycl::buffer d_out { h_out.data(), cl::sycl::range<2> {n, n} }; @@ -230,7 +230,7 @@ int main(int argc, char* argv[]) auto out = d_out.get_access(h); // Add constant to solution to force refresh of neighbor data, if any - h.parallel_for(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0}, + h.parallel_for(cl::sycl::range<2> {n, n}, //cl::sycl::id<2> {0, 0}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); in[xy] += 1.0; diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp new file mode 100644 index 000000000..845082e62 --- /dev/null +++ b/Cxx11/stencil_sycl.hpp @@ -0,0 +1,155 @@ +void star1(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer d_in, + cl::sycl::buffer d_out) { + q.submit([&](cl::sycl::handler& h) { + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + h.parallel_for(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1}, + [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> xy = it.get_id(); + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + out[xy] += +in[xy+dx1] * 0.5 + +in[xy+dy1] * 0.5 + +in[xy-dx1] * -0.5 + +in[xy-dy1] * -0.5; + }); + }); +} + +void star2(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer d_in, + cl::sycl::buffer d_out) { + q.submit([&](cl::sycl::handler& h) { + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + h.parallel_for(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2}, + [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> xy = it.get_id(); + cl::sycl::id<2> 
dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); + cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); + out[xy] += +in[xy+dx1] * 0.25 + +in[xy+dy1] * 0.25 + +in[xy-dx1] * -0.25 + +in[xy-dy1] * -0.25 + +in[xy+dx2] * 0.125 + +in[xy+dy2] * 0.125 + +in[xy-dx2] * -0.125 + +in[xy-dy2] * -0.125; + }); + }); +} + +void star3(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer d_in, + cl::sycl::buffer d_out) { + q.submit([&](cl::sycl::handler& h) { + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + h.parallel_for(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3}, + [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> xy = it.get_id(); + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); + cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); + cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); + cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); + out[xy] += +in[xy+dx1] * 0.16666666666666666 + +in[xy+dy1] * 0.16666666666666666 + +in[xy-dx1] * -0.16666666666666666 + +in[xy-dy1] * -0.16666666666666666 + +in[xy+dx2] * 0.08333333333333333 + +in[xy+dy2] * 0.08333333333333333 + +in[xy-dx2] * -0.08333333333333333 + +in[xy-dy2] * -0.08333333333333333 + +in[xy+dx3] * 0.05555555555555555 + +in[xy+dy3] * 0.05555555555555555 + +in[xy-dx3] * -0.05555555555555555 + +in[xy-dy3] * -0.05555555555555555; + }); + }); +} + +void star4(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer d_in, + cl::sycl::buffer d_out) { + q.submit([&](cl::sycl::handler& h) { + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + h.parallel_for(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4}, + [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> xy = it.get_id(); + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); + 
cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); + cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); + cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); + cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); + cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); + out[xy] += +in[xy+dx1] * 0.125 + +in[xy+dy1] * 0.125 + +in[xy-dx1] * -0.125 + +in[xy-dy1] * -0.125 + +in[xy+dx2] * 0.0625 + +in[xy+dy2] * 0.0625 + +in[xy-dx2] * -0.0625 + +in[xy-dy2] * -0.0625 + +in[xy+dx3] * 0.041666666666666664 + +in[xy+dy3] * 0.041666666666666664 + +in[xy-dx3] * -0.041666666666666664 + +in[xy-dy3] * -0.041666666666666664 + +in[xy+dx4] * 0.03125 + +in[xy+dy4] * 0.03125 + +in[xy-dx4] * -0.03125 + +in[xy-dy4] * -0.03125; + }); + }); +} + +void star5(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer d_in, + cl::sycl::buffer d_out) { + q.submit([&](cl::sycl::handler& h) { + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + h.parallel_for(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5}, + [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> xy = it.get_id(); + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); + cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); + cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); + cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); + cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); + cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); + cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0}); + cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5}); + out[xy] += +in[xy+dx1] * 0.1 + +in[xy+dy1] * 0.1 + +in[xy-dx1] * -0.1 + +in[xy-dy1] * -0.1 + +in[xy+dx2] * 0.05 + +in[xy+dy2] * 0.05 + +in[xy-dx2] * -0.05 + +in[xy-dy2] * -0.05 + +in[xy+dx3] * 0.03333333333333333 + +in[xy+dy3] * 0.03333333333333333 + +in[xy-dx3] * -0.03333333333333333 + +in[xy-dy3] * -0.03333333333333333 + +in[xy+dx4] * 0.025 + +in[xy+dy4] * 0.025 + +in[xy-dx4] * -0.025 + +in[xy-dy4] * -0.025 + +in[xy+dx5] * 0.02 + +in[xy+dy5] * 0.02 
+ +in[xy-dx5] * -0.02 + +in[xy-dy5] * -0.02; + }); + }); +} + From 8dc644e51f6c8e40c4aa2fdb2d6aca231c42a154 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 15 Jan 2018 21:10:36 -0800 Subject: [PATCH 027/245] no clue why OCCA is so hard to get runnig in Travis --- travis/install-occa.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/install-occa.sh b/travis/install-occa.sh index 887366219..810580cd0 100644 --- a/travis/install-occa.sh +++ b/travis/install-occa.sh @@ -40,7 +40,7 @@ ${PRK_CXX} -v if [ ! -d "$TRAVIS_ROOT/occa" ]; then BRANCH="1.0" git clone --recursive --depth 1 -b ${BRANCH} https://github.com/libocca/occa.git $TRAVIS_ROOT/occa - CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -f makefile -C $TRAVIS_ROOT/occa + CXX=${PRK_CXX} OCCA_CUDA_ENABLED=0 OCCA_FORTRAN_ENABLED=0 make -C $TRAVIS_ROOT/occa else echo "OCCA installed..." find $TRAVIS_ROOT/occa -name occa.hpp From 25b5c7c8d6dfeb628d404f191c3399423c267328 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 15 Jan 2018 21:12:41 -0800 Subject: [PATCH 028/245] workaround Mac ld --- travis/build-run-prk.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index b76414933..10a1841f6 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -575,11 +575,14 @@ case "$PRK_TARGET" in done # C++ w/ OCCA - echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs - export OCCA_CXX=${PRK_CXX} - make -C $PRK_TARGET_PATH transpose-occa nstream-occa - $PRK_TARGET_PATH/transpose-occa 10 1024 32 - $PRK_TARGET_PATH/nstream-occa 10 16777216 32 + # OCCA sets -Wl,-rpath=${OCCA_LIB}, which chokes Mac's ld. 
+ if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then + echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs + export OCCA_CXX=${PRK_CXX} + make -C $PRK_TARGET_PATH transpose-occa nstream-occa + $PRK_TARGET_PATH/transpose-occa 10 1024 32 + $PRK_TARGET_PATH/nstream-occa 10 16777216 32 + fi ;; allfortran) echo "Fortran" From 34e99ad5baffd409f13b35cc7b69c2a99ba17f96 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Jan 2018 09:37:27 -0800 Subject: [PATCH 029/245] improve SYCL stencil - use size_t instead of int so i can use nice initializers for ranges - fix bug in accessors where in was read instead of read_write - do initialization of on the device in SYCL instead of host. --- Cxx11/stencil-sycl.cc | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index fcd193298..74a8a2801 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -114,7 +114,7 @@ int main(int argc, char* argv[]) int iterations; size_t n; - int radius = 2; + size_t radius = 2; bool star = true; try { if (argc < 3) { @@ -200,23 +200,36 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector h_in; std::vector h_out; - h_in.resize(n*n); - h_out.resize(n*n); + h_out.resize(n*n,0.0); for (auto i=0; i(i+j); h_out[i*n+j] = 0.0; } } { // initialize device buffers from host buffers - cl::sycl::buffer d_in { h_in.data() , cl::sycl::range<2> {n, n} }; + cl::sycl::buffer d_in { cl::sycl::range<2> {n, n} }; cl::sycl::buffer d_out { h_out.data(), cl::sycl::range<2> {n, n} }; + q.submit([&](cl::sycl::handler& h) { + + // accessor methods + auto in = d_in.get_access(h); + + // Add constant to solution to force refresh of neighbor data, if any + h.parallel_for(cl::sycl::range<2> {n, n}, //cl::sycl::id<2> {0, 0}, + [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> xy = it.get_id(); + auto i = xy[0]; + auto j = xy[1]; + in[xy] = static_cast(i+j); + }); + }); + q.wait(); + for (auto iter = 0; 
iter<=iterations; iter++) { if (iter==1) stencil_time = prk::wtime(); @@ -226,7 +239,7 @@ int main(int argc, char* argv[]) q.submit([&](cl::sycl::handler& h) { // accessor methods - auto in = d_in.get_access(h); + auto in = d_in.get_access(h); auto out = d_out.get_access(h); // Add constant to solution to force refresh of neighbor data, if any @@ -246,7 +259,7 @@ int main(int argc, char* argv[]) ////////////////////////////////////////////////////////////////////// // interior of grid with respect to stencil - size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + auto active_points = (n-2L*radius)*(n-2L*radius); // compute L1 norm in parallel double norm = 0.0; @@ -270,8 +283,8 @@ int main(int argc, char* argv[]) std::cout << "L1 norm = " << norm << " Reference L1 norm = " << reference_norm << std::endl; #endif - const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); - size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + const size_t stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*stencil_size+1L) * active_points; auto avgtime = stencil_time/iterations; std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime << " Avg time (s): " << avgtime << std::endl; From ac9bbed2dffeae4e8f505b2edee1dbcd9e828bad Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 17 Jan 2018 21:28:13 -0800 Subject: [PATCH 030/245] disable OCCA in Travis [ci skip] --- travis/build-run-prk.sh | 14 +++++++------- travis/install-deps.sh | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 619d5a00d..41a34612f 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -578,13 +578,13 @@ case "$PRK_TARGET" in # C++ w/ OCCA # OCCA sets -Wl,-rpath=${OCCA_LIB}, which chokes Mac's ld. 
- if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then - echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs - export OCCA_CXX=${PRK_CXX} - make -C $PRK_TARGET_PATH transpose-occa nstream-occa - $PRK_TARGET_PATH/transpose-occa 10 1024 32 - $PRK_TARGET_PATH/nstream-occa 10 16777216 32 - fi + #if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then + # echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs + # export OCCA_CXX=${PRK_CXX} + # make -C $PRK_TARGET_PATH transpose-occa nstream-occa + # $PRK_TARGET_PATH/transpose-occa 10 1024 32 + # $PRK_TARGET_PATH/nstream-occa 10 16777216 32 + #fi ;; allfortran) echo "Fortran" diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 35e926c76..42b620858 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -69,7 +69,7 @@ case "$PRK_TARGET" in sh ./travis/install-cmake.sh $TRAVIS_ROOT sh ./travis/install-raja.sh $TRAVIS_ROOT sh ./travis/install-kokkos.sh $TRAVIS_ROOT - sh ./travis/install-occa.sh $TRAVIS_ROOT + #sh ./travis/install-occa.sh $TRAVIS_ROOT ;; allfortran) echo "Fortran" From 2de3b7774e1700f9aeaedf4f0d77b6a932f8beb7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Jan 2018 12:18:38 -0800 Subject: [PATCH 031/245] fix banner error [ci skip] --- Cxx11/transpose-kokkos.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc index 7a7543b04..268d9e19a 100644 --- a/Cxx11/transpose-kokkos.cc +++ b/Cxx11/transpose-kokkos.cc @@ -54,7 +54,7 @@ int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11/Kokkos Stencil execution on 2D grid" << std::endl; + std::cout << "C++11 Matrix transpose: B = A^T" << std::endl; Kokkos::initialize(argc, argv); From 4bd6a3c4900d44c070db91a4688c78aeb8f40d62 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Jan 2018 12:18:38 -0800 Subject: [PATCH 032/245] fix banner error [ci skip] --- Cxx11/transpose-kokkos.cc | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc index 7a7543b04..268d9e19a 100644 --- a/Cxx11/transpose-kokkos.cc +++ b/Cxx11/transpose-kokkos.cc @@ -54,7 +54,7 @@ int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11/Kokkos Stencil execution on 2D grid" << std::endl; + std::cout << "C++11 Matrix transpose: B = A^T" << std::endl; Kokkos::initialize(argc, argv); From 9633215c36b1c58ea24bc76e74fff866a37178a0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Jan 2018 13:26:15 -0800 Subject: [PATCH 033/245] detect bad input settings [ci skip] --- Cxx11/transpose-vector-async.cc | 8 +++++++- Cxx11/transpose-vector-thread.cc | 10 ++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Cxx11/transpose-vector-async.cc b/Cxx11/transpose-vector-async.cc index 43040c389..74a349cd2 100644 --- a/Cxx11/transpose-vector-async.cc +++ b/Cxx11/transpose-vector-async.cc @@ -109,8 +109,14 @@ int main(int argc, char * argv[]) std::cout << "Block size = " << block_size << std::endl; std::cout << "Tile size = " << tile_size << std::endl; + if (num_futures > 300) { + std::cout << "These settings may lead to resource exhaustion.\n" + << "Please use a larger block size.\n"; + return 1; + } + ////////////////////////////////////////////////////////////////////// - /// Allocate space for the input and transpose matrix + // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// std::vector A; diff --git a/Cxx11/transpose-vector-thread.cc b/Cxx11/transpose-vector-thread.cc index c6c60f8b2..8ce3d79c7 100644 --- a/Cxx11/transpose-vector-thread.cc +++ b/Cxx11/transpose-vector-thread.cc @@ -89,6 +89,10 @@ int main(int argc, char * argv[]) throw "ERROR: block size must be greater than 0"; } + if (order / block_size > 16) { + throw "ERROR: this will create more than 256 threads"; 
+ } + // default tile size for tiling of local transpose tile_size = (argc>4) ? std::atoi(argv[4]) : 32; // a negative tile size means no tiling of the local transpose @@ -109,6 +113,12 @@ int main(int argc, char * argv[]) std::cout << "Block size = " << block_size << std::endl; std::cout << "Tile size = " << tile_size << std::endl; + if (num_threads > 300) { + std::cout << "These settings may lead to resource exhaustion.\n" + << "Please use a larger block size.\n"; + return 1; + } + ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// From 843aab9ec2fd02bcbe181f284565be416bf6f7cb Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Jan 2018 13:30:39 -0800 Subject: [PATCH 034/245] add SYCL to docs --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 478a93e34..39c5cd89e 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,7 @@ i = in-progress, incomplete, or incorrect | OpenMP tasks | y | y | y | y | | | | OpenMP target | y | y | y | y | | | | OpenCL 1.x | i | y | y | y | | | +| SYCL | | y | y | y | | | | Parallel STL | y | y | y | y | | | | TBB | i | y | y | y | | | | Kokkos | y | y | y | y | | | From f713445124e0613a3a88a4795514e46bef86c250 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Jan 2018 13:31:36 -0800 Subject: [PATCH 035/245] add CUDA nstream to docs [ci skip] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 478a93e34..c7256cb7a 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,8 @@ i = in-progress, incomplete, or incorrect | TBB | i | y | y | y | | | | Kokkos | y | y | y | y | | | | RAJA | y | y | y | y | | | -| CUDA | | | y | | | | -| CUBLAS | | | y | | | | +| CUDA | | | y | y | | | +| CUBLAS | | | y | y | | | | CBLAS | | | | | | y | * [TBB](https://www.threadingbuildingblocks.org/) From 
7de1f05fec65503bd2e39365590eae322630c8d9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Jan 2018 16:25:58 -0800 Subject: [PATCH 036/245] fix banner [ci skip] --- Cxx11/nstream-vector-pstl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc index e1fb1ce05..c243f0ff1 100644 --- a/Cxx11/nstream-vector-pstl.cc +++ b/Cxx11/nstream-vector-pstl.cc @@ -70,7 +70,7 @@ int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; #if defined(USE_PSTL) - std::cout << "C++17 STREAM triad: A = B + scalar * C" << std::endl; + std::cout << "C++17 Parallel STL STREAM triad: A = B + scalar * C" << std::endl; #else std::cout << "C++11 STL STREAM triad: A = B + scalar * C" << std::endl; #endif From a27f728b8665d3048446e7cf713bc140495e6f39 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Jan 2018 16:50:05 -0800 Subject: [PATCH 037/245] add missing block size to C++11 thread+async transpose --- travis/build-run-prk.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 41a34612f..97a91b2bb 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -349,8 +349,8 @@ case "$PRK_TARGET" in # C++11 native parallelism make -C $PRK_TARGET_PATH transpose-vector-thread transpose-vector-async - $PRK_TARGET_PATH/transpose-vector-thread 10 1024 32 - $PRK_TARGET_PATH/transpose-vector-async 10 1024 32 + $PRK_TARGET_PATH/transpose-vector-thread 10 1024 512 32 + $PRK_TARGET_PATH/transpose-vector-async 10 1024 512 32 # C++11 with rangefor echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs From a318b234fa775d75cb53c8b21250cd1081e9252f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 21 Jan 2018 12:12:32 -0800 Subject: [PATCH 038/245] no need to resize stl::vector so stop it --- Cxx11/dgemm-cblas.cc | 9 +++------ Cxx11/dgemm-vector.cc | 9 +++------ 
Cxx11/nstream-opencl.cc | 9 +++------ Cxx11/nstream-sycl.cc | 13 +++++-------- Cxx11/nstream-vector-openmp.cc | 9 +++------ Cxx11/nstream-vector-pstl.cc | 9 +++------ Cxx11/nstream-vector-raja.cc | 9 +++------ Cxx11/nstream-vector-rangefor.cc | 9 +++------ Cxx11/nstream-vector-taskloop.cc | 9 +++------ Cxx11/nstream-vector-tbb.cc | 9 +++------ Cxx11/nstream-vector.cc | 9 +++------ 11 files changed, 35 insertions(+), 68 deletions(-) diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc index a239102a2..5fe2c5ab9 100644 --- a/Cxx11/dgemm-cblas.cc +++ b/Cxx11/dgemm-cblas.cc @@ -143,12 +143,9 @@ int main(int argc, char * argv[]) double dgemm_time(0); - std::vector A; - std::vector B; - std::vector C; - A.resize(order*order); - B.resize(order*order); - C.resize(order*order,0.0); + std::vector A(order*order); + std::vector B(order*order); + std::vector C(order*order,0.0); #ifdef PRK_DEBUG const unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); std::default_random_engine generator(seed); diff --git a/Cxx11/dgemm-vector.cc b/Cxx11/dgemm-vector.cc index 7cb102194..973c0df97 100644 --- a/Cxx11/dgemm-vector.cc +++ b/Cxx11/dgemm-vector.cc @@ -156,12 +156,9 @@ int main(int argc, char * argv[]) double dgemm_time(0); - std::vector A; - std::vector B; - std::vector C; - A.resize(order*order); - B.resize(order*order); - C.resize(order*order,0.0); + std::vector A(order*order); + std::vector B(order*order); + std::vector C(order*order,0.0); for (auto i=0; i h_a; - std::vector h_b; - std::vector h_c; - h_a.resize(length, (T)0); - h_b.resize(length, (T)2); - h_c.resize(length, (T)2); + std::vector h_a(length, T(0)); + std::vector h_b(length, T(2)); + std::vector h_c(length, T(2)); // copy input from host to device cl::Buffer d_a = cl::Buffer(context, begin(h_a), end(h_a), true); diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 306dc7038..b21c73593 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -113,12 +113,9 @@ int 
main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector h_A; - std::vector h_B; - std::vector h_C; - h_A.resize(length); - h_B.resize(length); - h_C.resize(length); + std::vector h_A(length); + std::vector h_B(length); + std::vector h_C(length); auto range = boost::irange(static_cast(0), length); @@ -137,9 +134,9 @@ int main(int argc, char * argv[]) cl::sycl::buffer d_C { h_C.data(), h_C.size() }; for (auto iter = 0; iter<=iterations; iter++) { - + if (iter==1) nstream_time = prk::wtime(); - + q.submit([&](cl::sycl::handler& h) { // accessor methods diff --git a/Cxx11/nstream-vector-openmp.cc b/Cxx11/nstream-vector-openmp.cc index d62c9000b..d48015df6 100644 --- a/Cxx11/nstream-vector-openmp.cc +++ b/Cxx11/nstream-vector-openmp.cc @@ -117,12 +117,9 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector A; - std::vector B; - std::vector C; - A.resize(length); - B.resize(length); - C.resize(length); + std::vector A(length); + std::vector B(length); + std::vector C(length); double scalar = 3.0; diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc index ae67ea494..6ba74fe57 100644 --- a/Cxx11/nstream-vector-pstl.cc +++ b/Cxx11/nstream-vector-pstl.cc @@ -116,12 +116,9 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector A; - std::vector B; - std::vector C; - A.resize(length); - B.resize(length); - C.resize(length); + std::vector A(length); + std::vector B(length); + std::vector C(length); auto range = boost::irange(static_cast(0), length); diff --git a/Cxx11/nstream-vector-raja.cc b/Cxx11/nstream-vector-raja.cc index 91f87b93c..31c6434e7 100644 --- a/Cxx11/nstream-vector-raja.cc +++ b/Cxx11/nstream-vector-raja.cc @@ -119,12 +119,9 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector A; - std::vector B; - std::vector C; - A.resize(length); - B.resize(length); - C.resize(length); + std::vector A(length); + std::vector B(length); + std::vector C(length); double scalar = 3.0; 
diff --git a/Cxx11/nstream-vector-rangefor.cc b/Cxx11/nstream-vector-rangefor.cc index 92feef10e..54bad9274 100644 --- a/Cxx11/nstream-vector-rangefor.cc +++ b/Cxx11/nstream-vector-rangefor.cc @@ -112,12 +112,9 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector A; - std::vector B; - std::vector C; - A.resize(length,0.0); - B.resize(length,2.0); - C.resize(length,2.0); + std::vector A(length,0.0); + std::vector B(length,2.0); + std::vector C(length,2.0); auto range = boost::irange(static_cast(0), length); diff --git a/Cxx11/nstream-vector-taskloop.cc b/Cxx11/nstream-vector-taskloop.cc index 272047a54..dede73b16 100644 --- a/Cxx11/nstream-vector-taskloop.cc +++ b/Cxx11/nstream-vector-taskloop.cc @@ -124,12 +124,9 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector A; - std::vector B; - std::vector C; - A.resize(length); - B.resize(length); - C.resize(length); + std::vector A(length); + std::vector B(length); + std::vector C(length); double scalar = 3.0; diff --git a/Cxx11/nstream-vector-tbb.cc b/Cxx11/nstream-vector-tbb.cc index da78be1df..cb73d3eda 100644 --- a/Cxx11/nstream-vector-tbb.cc +++ b/Cxx11/nstream-vector-tbb.cc @@ -116,12 +116,9 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector A; - std::vector B; - std::vector C; - A.resize(length); - B.resize(length); - C.resize(length); + std::vector A(length); + std::vector B(length); + std::vector C(length); double scalar(3); diff --git a/Cxx11/nstream-vector.cc b/Cxx11/nstream-vector.cc index bf5879ca2..c19938da3 100644 --- a/Cxx11/nstream-vector.cc +++ b/Cxx11/nstream-vector.cc @@ -109,12 +109,9 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector A; - std::vector B; - std::vector C; - A.resize(length,0.0); - B.resize(length,2.0); - C.resize(length,2.0); + std::vector A(length,0.0); + std::vector B(length,2.0); + std::vector C(length,2.0); double scalar = 3.0; From 75ff0496031d11ea685ca1b7c3c1b3814812af27 Mon Sep 17 
00:00:00 2001 From: Jeff Hammond Date: Sun, 21 Jan 2018 12:22:10 -0800 Subject: [PATCH 039/245] no need to resize stl::vector so stop it --- Cxx11/sparse-vector.cc | 13 ++++--------- Cxx11/stencil-opencl.cc | 6 ++---- Cxx11/stencil-sycl.cc | 9 +-------- Cxx11/transpose-opencl.cc | 7 +++---- Cxx11/transpose-sycl.cc | 6 ++---- Cxx11/transpose-vector-async.cc | 7 +++---- Cxx11/transpose-vector-openmp.cc | 6 ++---- Cxx11/transpose-vector-pstl.cc | 7 +++---- Cxx11/transpose-vector-raja.cc | 6 ++---- Cxx11/transpose-vector-rangefor.cc | 6 ++---- Cxx11/transpose-vector-taskloop.cc | 6 ++---- Cxx11/transpose-vector-tbb.cc | 6 ++---- Cxx11/transpose-vector-thread.cc | 7 +++---- Cxx11/transpose-vector.cc | 7 +++---- 14 files changed, 34 insertions(+), 65 deletions(-) diff --git a/Cxx11/sparse-vector.cc b/Cxx11/sparse-vector.cc index b4c7445b6..c521528e8 100644 --- a/Cxx11/sparse-vector.cc +++ b/Cxx11/sparse-vector.cc @@ -158,15 +158,10 @@ int main(int argc, char* argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - std::vector matrix; - std::vector colIndex; - std::vector vector; - std::vector result; - - matrix.resize(nent,0.0); - colIndex.resize(nent,0); - vector.resize(size2,0.0); - result.resize(size2,0.0); + std::vector matrix(nent,0.0); + std::vector colIndex(nent,0); + std::vector vector(size2,0.0); + std::vector result(size2,0.0); double sparse_time(0); diff --git a/Cxx11/stencil-opencl.cc b/Cxx11/stencil-opencl.cc index e75c416de..89a261cc9 100644 --- a/Cxx11/stencil-opencl.cc +++ b/Cxx11/stencil-opencl.cc @@ -107,10 +107,8 @@ void run(cl::Context context, int iterations, int n, int radius, bool star) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - std::vector h_in; - std::vector h_out; - h_in.resize(n*n, (T)0); - h_out.resize(n*n, (T)0); + std::vector h_in(n*n, T(0)); + std::vector h_out(n*n, T(0)); auto stencil_time = 
0.0; diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 74a8a2801..f1ecb8abe 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -200,14 +200,7 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector h_out; - h_out.resize(n*n,0.0); - - for (auto i=0; i h_out(n*n,0.0); { // initialize device buffers from host buffers diff --git a/Cxx11/transpose-opencl.cc b/Cxx11/transpose-opencl.cc index 7f632f297..4e22114d5 100644 --- a/Cxx11/transpose-opencl.cc +++ b/Cxx11/transpose-opencl.cc @@ -78,10 +78,9 @@ void run(cl::Context context, int iterations, int order) ////////////////////////////////////////////////////////////////////// const size_t nelems = (size_t)order * (size_t)order; - std::vector h_a; - std::vector h_b; - h_a.resize(nelems); - h_b.resize(nelems, (T)0); + std::vector h_a(nelems); + std::vector h_b(nelems, T(0)); + // fill A with the sequence 0 to order^2-1 as doubles std::iota(h_a.begin(), h_a.end(), (T)0); diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index 80ebb12b0..cbbc1a2a1 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -96,10 +96,8 @@ int main(int argc, char * argv[]) /// Allocate space for the input and transpose matrix ////////////////////////////////////////////////////////////////////// - std::vector h_A; - std::vector h_B; - h_A.resize(order*order); - h_B.resize(order*order,0.0); + std::vector h_A(order*order); + std::vector h_B(order*order,0.0); // fill A with the sequence 0 to order^2-1 as doubles std::iota(h_A.begin(), h_A.end(), 0.0); diff --git a/Cxx11/transpose-vector-async.cc b/Cxx11/transpose-vector-async.cc index 74a349cd2..8f285b1ad 100644 --- a/Cxx11/transpose-vector-async.cc +++ b/Cxx11/transpose-vector-async.cc @@ -119,10 +119,9 @@ int main(int argc, char * argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - std::vector A; - std::vector B; - B.resize(order*order,0.0); - 
A.resize(order*order); + std::vector A(order*order); + std::vector B(order*order,0.0); + // fill A with the sequence 0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); diff --git a/Cxx11/transpose-vector-openmp.cc b/Cxx11/transpose-vector-openmp.cc index aaaf4e5ab..4e02d09bb 100644 --- a/Cxx11/transpose-vector-openmp.cc +++ b/Cxx11/transpose-vector-openmp.cc @@ -112,10 +112,8 @@ int main(int argc, char * argv[]) auto trans_time = 0.0; - std::vector A; - std::vector B; - A.resize(order*order); - B.resize(order*order); + std::vector A(order*order); + std::vector B(order*order); OMP_PARALLEL() { diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc index 616c94d17..8b9734200 100644 --- a/Cxx11/transpose-vector-pstl.cc +++ b/Cxx11/transpose-vector-pstl.cc @@ -99,10 +99,9 @@ int main(int argc, char * argv[]) /// Allocate space for the input and transpose matrix ////////////////////////////////////////////////////////////////////// - std::vector A; - std::vector B; - B.resize(order*order,0.0); - A.resize(order*order); + std::vector A(order*order); + std::vector B(order*order,0.0); + // fill A with the sequence 0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); diff --git a/Cxx11/transpose-vector-raja.cc b/Cxx11/transpose-vector-raja.cc index 7b183913a..84738694d 100644 --- a/Cxx11/transpose-vector-raja.cc +++ b/Cxx11/transpose-vector-raja.cc @@ -272,10 +272,8 @@ int main(int argc, char * argv[]) /// Allocate space for the input and transpose matrix ////////////////////////////////////////////////////////////////////// - std::vector A; - std::vector B; - A.resize(order*order); - B.resize(order*order); + std::vector A(order*order); + std::vector B(order*order); if (use_for=="seq") { if (use_nested) { diff --git a/Cxx11/transpose-vector-rangefor.cc b/Cxx11/transpose-vector-rangefor.cc index ef18d76a0..ee0097026 100644 --- a/Cxx11/transpose-vector-rangefor.cc +++ b/Cxx11/transpose-vector-rangefor.cc @@ -93,10 +93,8 @@ int main(int 
argc, char * argv[]) /// Allocate space for the input and transpose matrix ////////////////////////////////////////////////////////////////////// - std::vector A; - std::vector B; - B.resize(order*order,0.0); - A.resize(order*order); + std::vector A(order*order); + std::vector B(order*order,0.0); // fill A with the sequence 0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); diff --git a/Cxx11/transpose-vector-taskloop.cc b/Cxx11/transpose-vector-taskloop.cc index e54d87913..17dbad525 100644 --- a/Cxx11/transpose-vector-taskloop.cc +++ b/Cxx11/transpose-vector-taskloop.cc @@ -113,10 +113,8 @@ int main(int argc, char * argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - std::vector A; - std::vector B; - A.resize(order*order); - B.resize(order*order); + std::vector A(order*order); + std::vector B(order*order); auto trans_time = 0.0; diff --git a/Cxx11/transpose-vector-tbb.cc b/Cxx11/transpose-vector-tbb.cc index 98d6922b9..45ea4bc5b 100644 --- a/Cxx11/transpose-vector-tbb.cc +++ b/Cxx11/transpose-vector-tbb.cc @@ -111,10 +111,8 @@ int main(int argc, char * argv[]) auto trans_time = 0.0; - std::vector A; - std::vector B; - A.resize(order*order); - B.resize(order*order); + std::vector A(order*order); + std::vector B(order*order); tbb::blocked_range2d range(0, order, tile_size, 0, order, tile_size); tbb::parallel_for( range, [&](decltype(range)& r) { diff --git a/Cxx11/transpose-vector-thread.cc b/Cxx11/transpose-vector-thread.cc index 8ce3d79c7..57fbf11ea 100644 --- a/Cxx11/transpose-vector-thread.cc +++ b/Cxx11/transpose-vector-thread.cc @@ -123,10 +123,9 @@ int main(int argc, char * argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - std::vector A; - std::vector B; - B.resize(order*order,0.0); - A.resize(order*order); + std::vector A(order*order); + std::vector B(order*order,0.0); + // fill A with the sequence 
0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); diff --git a/Cxx11/transpose-vector.cc b/Cxx11/transpose-vector.cc index c6199ff40..943a6e380 100644 --- a/Cxx11/transpose-vector.cc +++ b/Cxx11/transpose-vector.cc @@ -103,10 +103,9 @@ int main(int argc, char * argv[]) auto trans_time = 0.0; - std::vector A; - std::vector B; - A.resize(order*order); - B.resize(order*order,0.0); + std::vector A(order*order); + std::vector B(order*order,0.0); + // fill A with the sequence 0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); From cafd273145ba21b81c2b893e75c60bd17bda6c85 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 21 Jan 2018 12:27:39 -0800 Subject: [PATCH 040/245] no need to resize stl::vector so stop it --- Cxx11/dgemm-cblas.cc | 3 +-- Cxx11/p2p-doacross-vector-openmp.cc | 4 +--- Cxx11/p2p-innerloop-opencl.cc | 4 +--- Cxx11/p2p-innerloop-vector-tbb.cc | 4 +--- Cxx11/p2p-vector-raja.cc | 4 +--- Cxx11/p2p-vector-tbb.cc | 4 +--- Cxx11/p2p-vector.cc | 4 +--- Cxx11/stencil-vector-openmp.cc | 6 ++---- Cxx11/stencil-vector-pstl.cc | 6 ++---- Cxx11/stencil-vector-raja.cc | 6 ++---- Cxx11/stencil-vector-rangefor.cc | 6 ++---- Cxx11/stencil-vector-taskloop.cc | 6 ++---- Cxx11/stencil-vector-tbb.cc | 6 ++---- Cxx11/stencil-vector.cc | 6 ++---- 14 files changed, 21 insertions(+), 48 deletions(-) diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc index 5fe2c5ab9..fc5709812 100644 --- a/Cxx11/dgemm-cblas.cc +++ b/Cxx11/dgemm-cblas.cc @@ -182,8 +182,7 @@ int main(int argc, char * argv[]) const auto epsilon = 1.0e-8; const auto forder = static_cast(order); #ifdef PRK_DEBUG - std::vector D; - D.resize(order*order,0.0); + std::vector D(order*order,0.0);; for (auto iter = 0; iter<=iterations; iter++) { prk_dgemm_loops(order, A, B, D); } diff --git a/Cxx11/p2p-doacross-vector-openmp.cc b/Cxx11/p2p-doacross-vector-openmp.cc index 2650c0a81..2d271c92b 100644 --- a/Cxx11/p2p-doacross-vector-openmp.cc +++ b/Cxx11/p2p-doacross-vector-openmp.cc @@ -108,9 
+108,7 @@ int main(int argc, char* argv[]) auto pipeline_time = 0.0; // silence compiler warning - // working set - std::vector grid; - grid.resize(m*n); + std::vector grid(m*n);; OMP_PARALLEL() { diff --git a/Cxx11/p2p-innerloop-opencl.cc b/Cxx11/p2p-innerloop-opencl.cc index 620f415d3..2552fe787 100644 --- a/Cxx11/p2p-innerloop-opencl.cc +++ b/Cxx11/p2p-innerloop-opencl.cc @@ -84,9 +84,7 @@ void run(cl::Context context, int iterations, int n) /// Allocate space for the input and transpose matrix ////////////////////////////////////////////////////////////////////// - const int nelems = n*n; - std::vector h_grid; - h_grid.resize(nelems, (T)0); + std::vector h_grid(n*n, T(0)); for (auto j=0; j(j); } diff --git a/Cxx11/p2p-innerloop-vector-tbb.cc b/Cxx11/p2p-innerloop-vector-tbb.cc index eb17ca3e5..788226f71 100644 --- a/Cxx11/p2p-innerloop-vector-tbb.cc +++ b/Cxx11/p2p-innerloop-vector-tbb.cc @@ -112,9 +112,7 @@ int main(int argc, char* argv[]) auto pipeline_time = 0.0; // silence compiler warning - // working set - std::vector grid; - grid.resize(n*n,0.0); + std::vector grid(n*n,0.0); // set boundary values (bottom and left side of grid) for (auto j=0; j grid; - grid.resize(m*n,0.0); + std::vector grid(m*n,0.0); // set boundary values (bottom and left side of grid) for (auto j=0; j grid; - grid.resize(m*n,0.0); + std::vector grid(m*n,0.0); // set boundary values (bottom and left side of grid) for (auto j=0; j grid; - grid.resize(m*n,0.0); + std::vector grid(m*n,0.0);; { // set boundary values (bottom and left side of grid) diff --git a/Cxx11/stencil-vector-openmp.cc b/Cxx11/stencil-vector-openmp.cc index 8ff184d98..98343a798 100644 --- a/Cxx11/stencil-vector-openmp.cc +++ b/Cxx11/stencil-vector-openmp.cc @@ -175,10 +175,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in; - std::vector out; - in.resize(n*n); - out.resize(n*n); + std::vector in(n*n); + std::vector out(n*n); OMP_PARALLEL() { diff --git a/Cxx11/stencil-vector-pstl.cc 
b/Cxx11/stencil-vector-pstl.cc index a661ff736..863a50df5 100644 --- a/Cxx11/stencil-vector-pstl.cc +++ b/Cxx11/stencil-vector-pstl.cc @@ -176,10 +176,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in; - std::vector out; - in.resize(n*n); - out.resize(n*n); + std::vector in(n*n); + std::vector out(n*n); // initialize the input and output arrays auto range = boost::irange(0,n); diff --git a/Cxx11/stencil-vector-raja.cc b/Cxx11/stencil-vector-raja.cc index cf2243bf6..3bcecb4ec 100644 --- a/Cxx11/stencil-vector-raja.cc +++ b/Cxx11/stencil-vector-raja.cc @@ -173,10 +173,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in; - std::vector out; - in.resize(n*n); - out.resize(n*n); + std::vector in(n*n); + std::vector out(n*n); #if 0 RAJA::forallN>> diff --git a/Cxx11/stencil-vector-rangefor.cc b/Cxx11/stencil-vector-rangefor.cc index 2f5cf7ce1..aef3a3880 100644 --- a/Cxx11/stencil-vector-rangefor.cc +++ b/Cxx11/stencil-vector-rangefor.cc @@ -164,10 +164,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in; - std::vector out; - in.resize(n*n); - out.resize(n*n); + std::vector in(n*n); + std::vector out(n*n); // initialize the input and output arrays auto range = boost::irange(0,n); diff --git a/Cxx11/stencil-vector-taskloop.cc b/Cxx11/stencil-vector-taskloop.cc index 52106b9cc..971d71db1 100644 --- a/Cxx11/stencil-vector-taskloop.cc +++ b/Cxx11/stencil-vector-taskloop.cc @@ -174,10 +174,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in; - std::vector out; - in.resize(n*n); - out.resize(n*n); + std::vector in(n*n);; + std::vector out(n*n);; OMP_PARALLEL() OMP_MASTER diff --git a/Cxx11/stencil-vector-tbb.cc b/Cxx11/stencil-vector-tbb.cc index 76d8be67c..2f5c27488 100644 --- a/Cxx11/stencil-vector-tbb.cc +++ b/Cxx11/stencil-vector-tbb.cc @@ -169,10 +169,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in; - std::vector out; - 
in.resize(n*n); - out.resize(n*n); + std::vector in(n*n); + std::vector out(n*n); tbb::blocked_range2d range(0, n, tile_size, 0, n, tile_size); tbb::parallel_for( range, [&](decltype(range)& r) { diff --git a/Cxx11/stencil-vector.cc b/Cxx11/stencil-vector.cc index 57fcf86f6..26931780d 100644 --- a/Cxx11/stencil-vector.cc +++ b/Cxx11/stencil-vector.cc @@ -164,10 +164,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in; - std::vector out; - in.resize(n*n); - out.resize(n*n); + std::vector in(n*n); + std::vector out(n*n); { for (auto it=0; it Date: Tue, 23 Jan 2018 10:08:47 -0800 Subject: [PATCH 041/245] fix rpath usage [ci skip] --- Cxx11/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index a73b87eb7..ca3144497 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -48,7 +48,7 @@ KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(BOOSTFLAGS) ifdef OCCADIR include ${OCCADIR}/scripts/makefile endif -OCCAFLAGS = -DUSE_OCCA -I${OCCADIR}/include -Wl,-rpath=${OCCADIR}/lib -L${OCCADIR}/lib -locca +OCCAFLAGS = -DUSE_OCCA -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca .PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl rangefor kokkos raja cuda From 82bea6cf58b7297cefa0f5716b003e1eb0cea334 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 2 Feb 2018 14:38:27 -0800 Subject: [PATCH 042/245] try SOS 1.4.0 --- travis/install-sandia-openshmem.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/travis/install-sandia-openshmem.sh b/travis/install-sandia-openshmem.sh index 8dcf9fe25..308c32d3d 100755 --- a/travis/install-sandia-openshmem.sh +++ b/travis/install-sandia-openshmem.sh @@ -13,15 +13,14 @@ TRAVIS_ROOT="$1" SHMEM_ROOT=$TRAVIS_ROOT/sandia-openshmem if [ ! 
-d "$SHMEM_ROOT" ]; then - # master - #git clone --depth 1 https://github.com/regrant/sandia-shmem.git sandia-shmem + # HEAD #git clone --depth 1 https://github.com/Sandia-OpenSHMEM/SOS.git sandia-shmem - #git clone -b v1.3.2 --depth 1 https://github.com/Sandia-OpenSHMEM/SOS.git sandia-shmem #cd sandia-shmem - # 1.3 release - wget https://github.com/Sandia-OpenSHMEM/SOS/archive/v1.3.2.tar.gz - tar -xzf v1.3.2.tar.gz - cd SOS-1.3.2 + VERSION=1.4.0 + #git clone -b v$VERSION --depth 1 https://github.com/Sandia-OpenSHMEM/SOS.git SOS-$VERSION + wget https://github.com/Sandia-OpenSHMEM/SOS/archive/v$VERSION.tar.gz + tar -xzf v$VERSION.tar.gz + cd SOS-$VERSION ./autogen.sh mkdir build cd build From 40034c58c04f8df5e943896cd8a08e518d6b85e3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 7 Feb 2018 06:26:21 -0800 Subject: [PATCH 043/245] PGI 17.4 Mac workaround [ci skip] --- C1z/prk_util.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/C1z/prk_util.h b/C1z/prk_util.h index 9df3a81fa..5d0831d34 100644 --- a/C1z/prk_util.h +++ b/C1z/prk_util.h @@ -46,7 +46,13 @@ #include // atoi #include // getenv #include +#if defined(__PGIC__) +typedef _Bool bool; +const bool true=1; +const bool false=0; +#else #include // bool +#endif #include #include #include // fabs @@ -81,6 +87,7 @@ # define OMP_ORDERED(x) PRAGMA(omp ordered x) # define OMP_TARGET(x) PRAGMA(omp target x) # else +# warning No OpenMP 4+ features! 
# define OMP_SIMD # define OMP_FOR_SIMD() PRAGMA(omp for x) # define OMP_TASK(x) From e02238199be2411d3e3019919d2e8131e8340550 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 8 Feb 2018 15:56:54 -0800 Subject: [PATCH 044/245] whitespace fix [ci skip] --- Cxx11/transpose-vector-openmp.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cxx11/transpose-vector-openmp.cc b/Cxx11/transpose-vector-openmp.cc index 4e02d09bb..ba3a26321 100644 --- a/Cxx11/transpose-vector-openmp.cc +++ b/Cxx11/transpose-vector-openmp.cc @@ -100,11 +100,11 @@ int main(int argc, char * argv[]) } #ifdef _OPENMP - std::cout << "Number of threads = " << omp_get_max_threads() << std::endl; + std::cout << "Number of threads = " << omp_get_max_threads() << std::endl; #endif - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Matrix order = " << order << std::endl; - std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; ////////////////////////////////////////////////////////////////////// /// Allocate space for the input and transpose matrix From e8c3d35361e508ce560c8719f12d4da543fd467d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 8 Feb 2018 16:17:41 -0800 Subject: [PATCH 045/245] fix argv location of offset in taskloop [ci skip] --- Cxx11/nstream-vector-taskloop.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/nstream-vector-taskloop.cc b/Cxx11/nstream-vector-taskloop.cc index dede73b16..d82e37d14 100644 --- a/Cxx11/nstream-vector-taskloop.cc +++ b/Cxx11/nstream-vector-taskloop.cc @@ -68,7 +68,7 @@ int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; #ifdef _OPENMP - std::cout << "C++11/OpenMP STREAM triad: A = B + scalar * C" << std::endl; + std::cout << 
"C++11/OpenMP TASKLOOP STREAM triad: A = B + scalar * C" << std::endl; #else std::cout << "C++11 STREAM triad: A = B + scalar * C" << std::endl; #endif @@ -100,7 +100,7 @@ int main(int argc, char * argv[]) throw "ERROR: grainsize"; } - offset = (argc>3) ? std::atoi(argv[3]) : 0; + offset = (argc>4) ? std::atoi(argv[4]) : 0; if (length <= 0) { throw "ERROR: offset must be nonnegative"; } From 7c6578d43e82b91d2fa668c5df43ea878a92452a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 8 Feb 2018 16:27:43 -0800 Subject: [PATCH 046/245] fix banner [ci skip] --- Cxx11/nstream-vector-taskloop.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Cxx11/nstream-vector-taskloop.cc b/Cxx11/nstream-vector-taskloop.cc index d82e37d14..95bd5c925 100644 --- a/Cxx11/nstream-vector-taskloop.cc +++ b/Cxx11/nstream-vector-taskloop.cc @@ -67,11 +67,7 @@ int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; -#ifdef _OPENMP std::cout << "C++11/OpenMP TASKLOOP STREAM triad: A = B + scalar * C" << std::endl; -#else - std::cout << "C++11 STREAM triad: A = B + scalar * C" << std::endl; -#endif ////////////////////////////////////////////////////////////////////// /// Read and test input parameters From d323a4b5069c7ff66562c8e10f7616e81efceb2a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 3 Mar 2018 12:05:38 -0800 Subject: [PATCH 047/245] cleanup flags [ci skip] --- common/make.defs.gcc | 10 +++++++--- common/make.defs.intel | 3 +++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index aaae093e1..3dcbdb030 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -6,22 +6,26 @@ # VERSION=-7 # C99 is required in some implementations. -CC=gcc${VERSION} -std=c11 -pthread -Wall +CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt # All of the Fortran code is written for the 2008 standard and requires preprocessing. 
-FC=gfortran${VERSION} -std=f2008 -cpp -Wall +FC=gfortran${VERSION} -std=f2008 -cpp # C++11 may not be required but does no harm here. -CXX=g++${VERSION} -std=gnu++17 -pthread -Wall +CXX=g++${VERSION} -std=gnu++17 -pthread # # Compiler flags # # -mtune=native is appropriate for most cases. # -march=native is appropriate if you want portable binaries. DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math +# # If you are compiling for KNL on a Xeon login node, use the following: # DEFAULT_OPT_FLAGS=-g -O3 -march=knl # See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details. # +DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed +DEFAULT_OPT_FLAGS+=-Wall +# # OpenMP flags # OPENMPFLAG=-fopenmp diff --git a/common/make.defs.intel b/common/make.defs.intel index 2f111cd58..087964e2c 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -20,9 +20,12 @@ CXX=icpc -std=c++14 -pthread # # -xHOST is appropriate for most cases. DEFAULT_OPT_FLAGS=-g -O3 -xHOST +# # If you are compiling for KNL on a Xeon login node, use the following: # DEFAULT_OPT_FLAGS=-g -O3 -xMIC-AVX512 # +DEFAULT_OPT_FLAGS+=-qopt-report=5 +# # OpenMP flags # OPENMPFLAG=-qopenmp From 0131ab2c0451d3fdd4237ed74efc49b85f6ab475 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 9 Mar 2018 20:17:27 -0800 Subject: [PATCH 048/245] ignore Fortran dgemm binary [ci skip] --- FORTRAN/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 7fbf78876..ee57e8255 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -94,7 +94,7 @@ clean: -rm -f *.optrpt -rm -f *.dwarf -rm -rf *.dSYM # Mac - -rm -f p2p stencil transpose nstream + -rm -f p2p stencil transpose nstream dgemm -rm -f p2p-innerloop -rm -f *-pretty -rm -f *-coarray From 77e1f9f1c8de93a76c9e7b337f33656070df609a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 14 Mar 2018 08:10:22 -0700 Subject: [PATCH 049/245] OpenMP blocked wavefront (#317) blocked hyperplane (aka innerloop) 
working w/ OpenMP - rename p2p innerloop openmp to hyperplane, add to travis - add skeleton for two-level hyperplane (todo later) - performance looks pretty good: ``` ./p2p-innerloop-vector-openmp 40 4096 512 Parallel Research Kernels version 2.16 C++11/OpenMP HYPERPLANE pipeline execution on 2D grid Number of threads (max) = 4 Number of iterations = 40 Grid sizes = 4096, 4096 Grid chunk sizes = 512 Solution validates Rate (MFlops/s): 1362.17 Avg time (s): 0.0246211 ``` --- .gitignore | 2 +- Cxx11/Makefile | 6 +- ...nmp.cc => p2p-hyperplane-vector-openmp.cc} | 74 ++++++++++++++++--- common/make.defs.gcc | 6 +- travis/build-run-prk.sh | 5 +- 5 files changed, 73 insertions(+), 20 deletions(-) rename Cxx11/{p2p-innerloop-vector-openmp.cc => p2p-hyperplane-vector-openmp.cc} (73%) diff --git a/.gitignore b/.gitignore index 8ac27dc88..6e6fe1688 100644 --- a/.gitignore +++ b/.gitignore @@ -127,7 +127,7 @@ Cxx11/p2p-innerloop-openmp Cxx11/p2p-doacross-vector-openmp Cxx11/p2p-innerloop-opencl Cxx11/p2p-innerloop-vector -Cxx11/p2p-innerloop-vector-openmp +Cxx11/p2p-hyperplane-vector-openmp Cxx11/p2p-innerloop-vector-tbb Cxx11/nstream-kokkos Cxx11/nstream-opencl diff --git a/Cxx11/Makefile b/Cxx11/Makefile index ca3144497..d3cacecd2 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -63,7 +63,7 @@ endif all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa $(EXTRA) -p2p: p2p-vector p2p-doacross-vector-openmp p2p-innerloop-vector-openmp p2p-tasks-openmp p2p-openmp-target \ +p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \ @@ -85,7 +85,7 @@ vector: p2p-vector p2p-innerloop-vector stencil-vector transpose-vector nstream- valarray: transpose-valarray nstream-valarray -openmp: p2p-innerloop-vector-openmp 
p2p-tasks-openmp stencil-vector-openmp transpose-vector-openmp nstream-vector-openmp +openmp: p2p-hyperplane-vector-openmp p2p-tasks-openmp stencil-vector-openmp transpose-vector-openmp nstream-vector-openmp target: stencil-openmp-target transpose-openmp-target nstream-openmp-target @@ -113,7 +113,7 @@ cublas: transpose-cublas nstream-cublas occa: transpose-occa nstream-occa -p2p-innerloop-vector: p2p-innerloop-vector-openmp.cc prk_util.h +p2p-innerloop-vector: p2p-hyperplane-vector-openmp.cc prk_util.h $(CXX) $(CXXFLAGS) $< -o $@ transpose-opencl: transpose-opencl.cc transpose.cl prk_util.h prk_opencl.h diff --git a/Cxx11/p2p-innerloop-vector-openmp.cc b/Cxx11/p2p-hyperplane-vector-openmp.cc similarity index 73% rename from Cxx11/p2p-innerloop-vector-openmp.cc rename to Cxx11/p2p-hyperplane-vector-openmp.cc index 2d398cdb9..4a3f317ae 100644 --- a/Cxx11/p2p-innerloop-vector-openmp.cc +++ b/Cxx11/p2p-hyperplane-vector-openmp.cc @@ -61,13 +61,39 @@ #include "prk_util.h" +inline void sweep_tile_sequential(int startm, int endm, + int startn, int endn, + int n, double grid[]) +{ + for (auto i=startm; i "; + throw " <# iterations> []"; } // number of times to run the pipeline algorithm iterations = std::atoi(argv[1]); - if (iterations < 1) { + if (iterations < 0) { throw "ERROR: iterations must be >= 1"; } @@ -94,6 +120,18 @@ int main(int argc, char* argv[]) } else if ( static_cast(n)*static_cast(n) > INT_MAX) { throw "ERROR: grid dimension too large - overflow risk"; } + + // grid chunk dimensions + nc = (argc > 3) ? 
std::atoi(argv[3]) : 1; + nc = std::max(1,nc); + nc = std::min(n,nc); + + // number of grid blocks + nb = (n-1)/nc; + if ((n-1)%nc) nb++; + //std::cerr << "n=" << n << std::endl; + //std::cerr << "nb=" << nb << std::endl; + //std::cerr << "nc=" << nc << std::endl; } catch (const char * e) { std::cout << e << std::endl; @@ -105,6 +143,7 @@ int main(int argc, char* argv[]) #endif std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Grid sizes = " << n << ", " << n << std::endl; + std::cout << "Grid chunk sizes = " << nc << std::endl; ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation @@ -112,11 +151,11 @@ int main(int argc, char* argv[]) auto pipeline_time = 0.0; // silence compiler warning - // working set double * grid = new double[n*n]; OMP_PARALLEL() { + // TODO block this OMP_FOR_SIMD for (auto i=0; i> common/make.defs - make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-vector-openmp stencil-vector-openmp \ + make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-vector-openmp stencil-vector-openmp \ transpose-vector-openmp nstream-vector-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 - $PRK_TARGET_PATH/p2p-innerloop-vector-openmp 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp 10 1024 64 $PRK_TARGET_PATH/stencil-vector-openmp 10 1000 $PRK_TARGET_PATH/transpose-vector-openmp 10 1024 32 $PRK_TARGET_PATH/nstream-vector-openmp 10 16777216 32 From a79f505e025babefd8e6973c39a39599a347f7fe Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 16 Mar 2018 15:27:35 -0700 Subject: [PATCH 050/245] SYCL with Travis support (#311) * SYCL Travis support Documentation/Examples - add ProGTX impl of SYCL to examples - add OpenMP flag to SYCL example (both triSYCL and ProGTX need it) Bug fix: - support all star stencils correctly - remove unnecessary whitespace in generate-sycl-stencil.py Unrelated: 
- git clone with depth=1 in install-pstl.sh * add missing block size to C++11 thread+async transpose * use correct binary names * add C++11 flags for SYCL * wrong scope for SYCLDIR * triSYCL requires C++14 * limit SYCL to Mac in Travis --- Cxx11/generate-sycl-stencil.py | 2 +- Cxx11/stencil-sycl.cc | 2 +- common/make.defs.gcc | 11 ++++++++--- common/make.defs.intel | 7 ++++++- common/make.defs.llvm | 7 ++++++- travis/build-run-prk.sh | 24 ++++++++++++++++++++++++ travis/install-deps.sh | 1 + travis/install-pstl.sh | 2 +- travis/install-sycl.sh | 8 ++++++++ 9 files changed, 56 insertions(+), 8 deletions(-) create mode 100644 travis/install-sycl.sh diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index 8a8f44ddb..bc049c892 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -10,7 +10,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' cl::sycl::buffer d_in,\n') src.write(' cl::sycl::buffer d_out) {\n') src.write(' q.submit([&](cl::sycl::handler& h) {\n') - src.write(' auto in = d_in.get_access(h); \n') + src.write(' auto in = d_in.get_access(h);\n') src.write(' auto out = d_out.get_access(h);\n') src.write(' h.parallel_for(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n') src.write(' [=] (cl::sycl::item<2> it) {\n') diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index f1ecb8abe..dba4a44af 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -227,7 +227,7 @@ int main(int argc, char* argv[]) if (iter==1) stencil_time = prk::wtime(); - star2(q, n, d_in, d_out); + stencil(q, n, d_in, d_out); q.submit([&](cl::sycl::handler& h) { diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 3e7b55235..074a1b696 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -46,9 +46,14 @@ OPENCLFLAG=-framework OpenCL # # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 
directory... -SYCLDIR=./triSYCL -SYCLCXX=${CXX} -SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include +#SYCLDIR=./triSYCL +#SYCLCXX=${CXX} ${OPENMPFLAG} +#SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include +# ProGTX +# https://github.com/ProGTX/sycl-gtx +SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx +SYCLCXX=${CXX} ${OPENMPFLAG} +SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} # # OCCA # diff --git a/common/make.defs.intel b/common/make.defs.intel index 087964e2c..49beeb6fa 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -44,8 +44,13 @@ OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... SYCLDIR=./triSYCL -SYCLCXX=${CXX} +SYCLCXX=${CXX} ${OPENMPFLAG} SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include +# ProGTX +# https://github.com/ProGTX/sycl-gtx +SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx +SYCLCXX=${CXX} ${OPENMPFLAG} +SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} # # OCCA # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 40af0143b..c67e8b7db 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -66,8 +66,13 @@ SYCLFLAG+=-no-serial-memop # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
#SYCLDIR=./triSYCL -#SYCLCXX=${CXX} +#SYCLCXX=${CXX} ${OPENMPFLAG} #SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include +# ProGTX +# https://github.com/ProGTX/sycl-gtx +SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx +SYCLCXX=${CXX} ${OPENMPFLAG} +SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} # # OCCA OCCADIR=${HOME}/prk-repo/Cxx11/occa diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 55898719f..3dbe3cfe1 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -586,6 +586,30 @@ case "$PRK_TARGET" in # $PRK_TARGET_PATH/transpose-occa 10 1024 32 # $PRK_TARGET_PATH/nstream-occa 10 16777216 32 #fi + + # C++ w/ SYCL + # triSYCL requires Boost. We are having Boost issues with Travis Linux builds. + if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + SYCLDIR=${TRAVIS_ROOT}/triSYCL + if [ "${CC}" = "clang" ] ; then + # SYCL will compile without OpenMP + echo "SYCLCXX=${PRK_CXX} -pthread -std=c++14" >> common/make.defs + else + echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++14" >> common/make.defs + fi + echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs + make -C $PRK_TARGET_PATH stencil-sycl transpose-sycl nstream-sycl + $PRK_TARGET_PATH/stencil-sycl 10 1000 + $PRK_TARGET_PATH/transpose-sycl 10 1024 32 + $PRK_TARGET_PATH/nstream-sycl 10 16777216 32 + #echo "Test stencil code generator" + for s in star ; do # grid ; do # grid not supported yet + for r in 1 2 3 4 5 ; do + $PRK_TARGET_PATH/stencil-sycl 10 200 20 $s $r + done + done + fi + ;; allfortran) echo "Fortran" diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 42b620858..3917e2fec 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -70,6 +70,7 @@ case "$PRK_TARGET" in sh ./travis/install-raja.sh $TRAVIS_ROOT sh ./travis/install-kokkos.sh $TRAVIS_ROOT #sh ./travis/install-occa.sh $TRAVIS_ROOT + sh ./travis/install-sycl.sh $TRAVIS_ROOT ;; allfortran) echo "Fortran" diff --git a/travis/install-pstl.sh 
b/travis/install-pstl.sh index 5f68368f8..ed5aba69b 100644 --- a/travis/install-pstl.sh +++ b/travis/install-pstl.sh @@ -5,4 +5,4 @@ set -x TRAVIS_ROOT="$1" -git clone https://github.com/intel/parallelstl.git $TRAVIS_ROOT/pstl +git clone --depth 1 https://github.com/intel/parallelstl.git $TRAVIS_ROOT/pstl diff --git a/travis/install-sycl.sh b/travis/install-sycl.sh new file mode 100644 index 000000000..3ac157a3f --- /dev/null +++ b/travis/install-sycl.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +set -e +set -x + +TRAVIS_ROOT="$1" + +git clone --depth 1 https://github.com/triSYCL/triSYCL.git $TRAVIS_ROOT/triSYCL From 18b6c95b981ffb94818f11ed8169c2d209a18fa1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 16 Mar 2018 21:43:11 -0700 Subject: [PATCH 051/245] clean FORTRAN ignoring [ci skip] --- .gitignore | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 6e6fe1688..950d52c2f 100644 --- a/.gitignore +++ b/.gitignore @@ -43,17 +43,6 @@ func.c # PRK C89 stencil generated code *.output # ALCF Cobalt scheduler *.error # ALCF Cobalt scheduler -FORTRAN/Stencil/stencil -FORTRAN/Stencil/stencil-coarray -FORTRAN/Stencil/stencil-omp -FORTRAN/Stencil/stencil-pretty -FORTRAN/Synch_p2p/p2p -FORTRAN/Synch_p2p/p2p-coarray -FORTRAN/Synch_p2p/p2p-omp -FORTRAN/Transpose/transpose -FORTRAN/Transpose/transpose-coarray -FORTRAN/Transpose/transpose-omp -FORTRAN/Transpose/transpose-pretty MPI1/AMR/amr MPI1/Branch/branch MPI1/DGEMM/dgemm @@ -186,6 +175,14 @@ Cxx11/star6.cl Cxx11/star7.cl Cxx11/star8.cl Cxx11/star9.cl +FORTRAN/dgemm-taskloop-openmp +FORTRAN/dgemm-pretty +FORTRAN/dgemm-openmp +FORTRAN/dgemm +FORTRAN/nstream +FORTRAN/nstream-openmp +FORTRAN/nstream-pretty +FORTRAN/nstream-taskloop-openmp FORTRAN/p2p FORTRAN/p2p-innerloop FORTRAN/p2p-coarray From edd2d6792c0d681735aec3ca96307f5b70ff92ae Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 17 Mar 2018 07:03:18 -0700 Subject: [PATCH 052/245] CUDA stencil (#321) * make 
no tiling transpose explicit * add stencil-cuda to make * add CUDA stencil * fix CUDA stencil code gen for Coriander * fix issues with grid/block sizes --- Cxx11/Makefile | 5 +- Cxx11/generate-cxx-stencil.py | 11 +- Cxx11/stencil-cuda.cu | 275 ++++++++++++++++++++++++ Cxx11/stencil_cuda.hpp | 385 ++++++++++++++++++++++++++++++++++ Cxx11/transpose-cuda.cu | 23 +- 5 files changed, 691 insertions(+), 8 deletions(-) create mode 100644 Cxx11/stencil-cuda.cu create mode 100644 Cxx11/stencil_cuda.hpp diff --git a/Cxx11/Makefile b/Cxx11/Makefile index d3cacecd2..f6fed48ca 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -68,7 +68,8 @@ p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-task stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \ stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \ - stencil-vector-rangefor stencil-vector-tbb stencil-vector-thread stencil-kokkos stencil-opencl + stencil-vector-rangefor stencil-vector-tbb stencil-vector-thread stencil-kokkos stencil-opencl \ + stencil-cuda transpose: transpose-valarray transpose-vector transpose-vector-async transpose-vector-openmp transpose-openmp-target \ transpose-vector-taskloop transpose-vector-stl transpose-vector-pstl transpose-vector-raja \ @@ -107,7 +108,7 @@ kokkos: stencil-kokkos transpose-kokkos nstream-kokkos raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja -cuda: transpose-cuda nstream-cuda +cuda: stencil-cuda transpose-cuda nstream-cuda cublas: transpose-cublas nstream-cublas diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index 37e6077d3..134cd0e89 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import sys import fileinput @@ -69,6 +69,11 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' Kokkos::parallel_for ( 
Kokkos::RangePolicy('+str(radius)+',n-'+str(radius)+'), KOKKOS_LAMBDA(const int i) {\n') src.write(' PRAGMA_SIMD\n') src.write(' for (auto j='+str(radius)+'; j & in, std::vector & out) {\n') src.write(' for (auto it='+str(radius)+'; it +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_cuda.h" +#include "stencil_cuda.hpp" + +__global__ void nothing(const int n, const prk_float * in, prk_float * out) +{ + //printf("You are trying to use a stencil that does not exist.\n"); + //printf("Please generate the new stencil using the code generator.\n"); + // n will never be zero - this is to silence compiler warnings. 
+ //if (n==0) printf("in=%p out=%p\n", in, out); + //abort(); +} + +__global__ void add(const int n, prk_float * in) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + if ((i [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Type of stencil = " << (star ? 
"star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + dim3 dimGrid(prk::divceil(n,tile_size),prk::divceil(n,tile_size),1); + dim3 dimBlock(tile_size, tile_size, 1); + info.checkDims(dimBlock, dimGrid); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto stencil_time = 0.0; + + const size_t nelems = (size_t)n * (size_t)n; + const size_t bytes = nelems * sizeof(prk_float); + prk_float * h_in; + prk_float * h_out; +#ifndef __CORIANDERCC__ + prk::CUDA::check( cudaMallocHost((void**)&h_in, bytes) ); + prk::CUDA::check( cudaMallocHost((void**)&h_out, bytes) ); +#else + h_in = new prk_float[nelems]; + h_out = new prk_float[nelems]; +#endif + + for (auto i=0; i(i+j); + h_out[i*n+j] = static_cast(0); + } + } + + // copy input from host to device + prk_float * d_in; + prk_float * d_out; + prk::CUDA::check( cudaMalloc((void**)&d_in, bytes) ); + prk::CUDA::check( cudaMalloc((void**)&d_out, bytes) ); + prk::CUDA::check( cudaMemcpy(d_in, &(h_in[0]), bytes, cudaMemcpyHostToDevice) ); + prk::CUDA::check( cudaMemcpy(d_out, &(h_out[0]), bytes, cudaMemcpyHostToDevice) ); + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + + // Apply the stencil operator + stencil<<>>(n, d_in, d_out); + + // Add constant to solution to force refresh of neighbor data, if any + add<<>>(n, d_in); + +#ifndef __CORIANDERCC__ + // 
silence "ignoring cudaDeviceSynchronize for now" warning + prk::CUDA::check( cudaDeviceSynchronize() ); +#endif + } + stencil_time = prk::wtime() - stencil_time; + + // copy output back to host + prk::CUDA::check( cudaMemcpy(&(h_out[0]), d_out, bytes, cudaMemcpyDeviceToHost) ); + +#ifdef VERBOSE + // copy input back to host - debug only + prk::CUDA::check( cudaMemcpy(&(h_in[0]), d_in, bytes, cudaMemcpyDeviceToHost) ); +#endif + + prk::CUDA::check( cudaFree(d_out) ); + prk::CUDA::check( cudaFree(d_in) ); + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. + ////////////////////////////////////////////////////////////////////// + + // interior of grid with respect to stencil + size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + // compute L1 norm + double norm = 0.0; + for (auto i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 
4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} diff --git a/Cxx11/stencil_cuda.hpp b/Cxx11/stencil_cuda.hpp new file mode 100644 index 000000000..1783327fa --- /dev/null +++ b/Cxx11/stencil_cuda.hpp @@ -0,0 +1,385 @@ +__global__ void star1(const int n, const prk_float * in, prk_float * out) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if ( (1 <= i) && (i < n-1) && (1 <= j) && (j < n-1) ) { + out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5 + +in[(i+0)*n+(j+-1)] * -0.5 + +in[(i+0)*n+(j+1)] * 0.5 + +in[(i+1)*n+(j+0)] * 0.5; + } +} + +__global__ void star2(const int n, const prk_float * in, prk_float * out) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if ( (2 <= i) && (i < n-2) && (2 <= j) && (j < n-2) ) { + out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125 + +in[(i+-1)*n+(j+0)] * -0.25 + +in[(i+0)*n+(j+-2)] * -0.125 + +in[(i+0)*n+(j+-1)] * -0.25 + +in[(i+0)*n+(j+1)] * 0.25 + +in[(i+0)*n+(j+2)] * 0.125 + +in[(i+1)*n+(j+0)] * 0.25 + +in[(i+2)*n+(j+0)] * 0.125; + } +} + +__global__ void star3(const int n, const prk_float * in, prk_float * out) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if ( (3 <= i) && (i < n-3) && (3 <= j) && (j < n-3) ) { + out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556 + +in[(i+-2)*n+(j+0)] * -0.0833333333333 + +in[(i+-1)*n+(j+0)] * -0.166666666667 + +in[(i+0)*n+(j+-3)] * -0.0555555555556 + +in[(i+0)*n+(j+-2)] * -0.0833333333333 + +in[(i+0)*n+(j+-1)] * -0.166666666667 + +in[(i+0)*n+(j+1)] * 0.166666666667 + +in[(i+0)*n+(j+2)] * 0.0833333333333 + +in[(i+0)*n+(j+3)] * 0.0555555555556 + +in[(i+1)*n+(j+0)] * 
0.166666666667 + +in[(i+2)*n+(j+0)] * 0.0833333333333 + +in[(i+3)*n+(j+0)] * 0.0555555555556; + } +} + +__global__ void star4(const int n, const prk_float * in, prk_float * out) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if ( (4 <= i) && (i < n-4) && (4 <= j) && (j < n-4) ) { + out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125 + +in[(i+-3)*n+(j+0)] * -0.0416666666667 + +in[(i+-2)*n+(j+0)] * -0.0625 + +in[(i+-1)*n+(j+0)] * -0.125 + +in[(i+0)*n+(j+-4)] * -0.03125 + +in[(i+0)*n+(j+-3)] * -0.0416666666667 + +in[(i+0)*n+(j+-2)] * -0.0625 + +in[(i+0)*n+(j+-1)] * -0.125 + +in[(i+0)*n+(j+1)] * 0.125 + +in[(i+0)*n+(j+2)] * 0.0625 + +in[(i+0)*n+(j+3)] * 0.0416666666667 + +in[(i+0)*n+(j+4)] * 0.03125 + +in[(i+1)*n+(j+0)] * 0.125 + +in[(i+2)*n+(j+0)] * 0.0625 + +in[(i+3)*n+(j+0)] * 0.0416666666667 + +in[(i+4)*n+(j+0)] * 0.03125; + } +} + +__global__ void star5(const int n, const prk_float * in, prk_float * out) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if ( (5 <= i) && (i < n-5) && (5 <= j) && (j < n-5) ) { + out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02 + +in[(i+-4)*n+(j+0)] * -0.025 + +in[(i+-3)*n+(j+0)] * -0.0333333333333 + +in[(i+-2)*n+(j+0)] * -0.05 + +in[(i+-1)*n+(j+0)] * -0.1 + +in[(i+0)*n+(j+-5)] * -0.02 + +in[(i+0)*n+(j+-4)] * -0.025 + +in[(i+0)*n+(j+-3)] * -0.0333333333333 + +in[(i+0)*n+(j+-2)] * -0.05 + +in[(i+0)*n+(j+-1)] * -0.1 + +in[(i+0)*n+(j+1)] * 0.1 + +in[(i+0)*n+(j+2)] * 0.05 + +in[(i+0)*n+(j+3)] * 0.0333333333333 + +in[(i+0)*n+(j+4)] * 0.025 + +in[(i+0)*n+(j+5)] * 0.02 + +in[(i+1)*n+(j+0)] * 0.1 + +in[(i+2)*n+(j+0)] * 0.05 + +in[(i+3)*n+(j+0)] * 0.0333333333333 + +in[(i+4)*n+(j+0)] * 0.025 + +in[(i+5)*n+(j+0)] * 0.02; + } +} + +__global__ void grid1(const int n, const prk_float * in, prk_float * out) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if ( (1 <= i) 
&& (i < n-1) && (1 <= j) && (j < n-1) ) { + out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25 + +in[(i+-1)*n+(j+0)] * -0.25 + +in[(i+0)*n+(j+-1)] * -0.25 + +in[(i+0)*n+(j+1)] * 0.25 + +in[(i+1)*n+(j+0)] * 0.25 + +in[(i+1)*n+(j+1)] * 0.25 + ; + } +} + +__global__ void grid2(const int n, const prk_float * in, prk_float * out) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if ( (2 <= i) && (i < n-2) && (2 <= j) && (j < n-2) ) { + out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625 + +in[(i+-2)*n+(j+-1)] * -0.0208333333333 + +in[(i+-2)*n+(j+0)] * -0.0208333333333 + +in[(i+-2)*n+(j+1)] * -0.0208333333333 + +in[(i+-1)*n+(j+-2)] * -0.0208333333333 + +in[(i+-1)*n+(j+-1)] * -0.125 + +in[(i+-1)*n+(j+0)] * -0.125 + +in[(i+-1)*n+(j+2)] * 0.0208333333333 + +in[(i+0)*n+(j+-2)] * -0.0208333333333 + +in[(i+0)*n+(j+-1)] * -0.125 + +in[(i+0)*n+(j+1)] * 0.125 + +in[(i+0)*n+(j+2)] * 0.0208333333333 + +in[(i+1)*n+(j+-2)] * -0.0208333333333 + +in[(i+1)*n+(j+0)] * 0.125 + +in[(i+1)*n+(j+1)] * 0.125 + +in[(i+1)*n+(j+2)] * 0.0208333333333 + +in[(i+2)*n+(j+-1)] * 0.0208333333333 + +in[(i+2)*n+(j+0)] * 0.0208333333333 + +in[(i+2)*n+(j+1)] * 0.0208333333333 + +in[(i+2)*n+(j+2)] * 0.0625 + ; + } +} + +__global__ void grid3(const int n, const prk_float * in, prk_float * out) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if ( (3 <= i) && (i < n-3) && (3 <= j) && (j < n-3) ) { + out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778 + +in[(i+-3)*n+(j+-2)] * -0.00555555555556 + +in[(i+-3)*n+(j+-1)] * -0.00555555555556 + +in[(i+-3)*n+(j+0)] * -0.00555555555556 + +in[(i+-3)*n+(j+1)] * -0.00555555555556 + +in[(i+-3)*n+(j+2)] * -0.00555555555556 + +in[(i+-2)*n+(j+-3)] * -0.00555555555556 + +in[(i+-2)*n+(j+-2)] * -0.0416666666667 + +in[(i+-2)*n+(j+-1)] * -0.0138888888889 + +in[(i+-2)*n+(j+0)] * -0.0138888888889 + +in[(i+-2)*n+(j+1)] * -0.0138888888889 + +in[(i+-2)*n+(j+3)] * 
0.00555555555556 + +in[(i+-1)*n+(j+-3)] * -0.00555555555556 + +in[(i+-1)*n+(j+-2)] * -0.0138888888889 + +in[(i+-1)*n+(j+-1)] * -0.0833333333333 + +in[(i+-1)*n+(j+0)] * -0.0833333333333 + +in[(i+-1)*n+(j+2)] * 0.0138888888889 + +in[(i+-1)*n+(j+3)] * 0.00555555555556 + +in[(i+0)*n+(j+-3)] * -0.00555555555556 + +in[(i+0)*n+(j+-2)] * -0.0138888888889 + +in[(i+0)*n+(j+-1)] * -0.0833333333333 + +in[(i+0)*n+(j+1)] * 0.0833333333333 + +in[(i+0)*n+(j+2)] * 0.0138888888889 + +in[(i+0)*n+(j+3)] * 0.00555555555556 + +in[(i+1)*n+(j+-3)] * -0.00555555555556 + +in[(i+1)*n+(j+-2)] * -0.0138888888889 + +in[(i+1)*n+(j+0)] * 0.0833333333333 + +in[(i+1)*n+(j+1)] * 0.0833333333333 + +in[(i+1)*n+(j+2)] * 0.0138888888889 + +in[(i+1)*n+(j+3)] * 0.00555555555556 + +in[(i+2)*n+(j+-3)] * -0.00555555555556 + +in[(i+2)*n+(j+-1)] * 0.0138888888889 + +in[(i+2)*n+(j+0)] * 0.0138888888889 + +in[(i+2)*n+(j+1)] * 0.0138888888889 + +in[(i+2)*n+(j+2)] * 0.0416666666667 + +in[(i+2)*n+(j+3)] * 0.00555555555556 + +in[(i+3)*n+(j+-2)] * 0.00555555555556 + +in[(i+3)*n+(j+-1)] * 0.00555555555556 + +in[(i+3)*n+(j+0)] * 0.00555555555556 + +in[(i+3)*n+(j+1)] * 0.00555555555556 + +in[(i+3)*n+(j+2)] * 0.00555555555556 + +in[(i+3)*n+(j+3)] * 0.0277777777778 + ; + } +} + +__global__ void grid4(const int n, const prk_float * in, prk_float * out) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if ( (4 <= i) && (i < n-4) && (4 <= j) && (j < n-4) ) { + out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625 + +in[(i+-4)*n+(j+-3)] * -0.00223214285714 + +in[(i+-4)*n+(j+-2)] * -0.00223214285714 + +in[(i+-4)*n+(j+-1)] * -0.00223214285714 + +in[(i+-4)*n+(j+0)] * -0.00223214285714 + +in[(i+-4)*n+(j+1)] * -0.00223214285714 + +in[(i+-4)*n+(j+2)] * -0.00223214285714 + +in[(i+-4)*n+(j+3)] * -0.00223214285714 + +in[(i+-3)*n+(j+-4)] * -0.00223214285714 + +in[(i+-3)*n+(j+-3)] * -0.0208333333333 + +in[(i+-3)*n+(j+-2)] * -0.00416666666667 + +in[(i+-3)*n+(j+-1)] * 
-0.00416666666667 + +in[(i+-3)*n+(j+0)] * -0.00416666666667 + +in[(i+-3)*n+(j+1)] * -0.00416666666667 + +in[(i+-3)*n+(j+2)] * -0.00416666666667 + +in[(i+-3)*n+(j+4)] * 0.00223214285714 + +in[(i+-2)*n+(j+-4)] * -0.00223214285714 + +in[(i+-2)*n+(j+-3)] * -0.00416666666667 + +in[(i+-2)*n+(j+-2)] * -0.03125 + +in[(i+-2)*n+(j+-1)] * -0.0104166666667 + +in[(i+-2)*n+(j+0)] * -0.0104166666667 + +in[(i+-2)*n+(j+1)] * -0.0104166666667 + +in[(i+-2)*n+(j+3)] * 0.00416666666667 + +in[(i+-2)*n+(j+4)] * 0.00223214285714 + +in[(i+-1)*n+(j+-4)] * -0.00223214285714 + +in[(i+-1)*n+(j+-3)] * -0.00416666666667 + +in[(i+-1)*n+(j+-2)] * -0.0104166666667 + +in[(i+-1)*n+(j+-1)] * -0.0625 + +in[(i+-1)*n+(j+0)] * -0.0625 + +in[(i+-1)*n+(j+2)] * 0.0104166666667 + +in[(i+-1)*n+(j+3)] * 0.00416666666667 + +in[(i+-1)*n+(j+4)] * 0.00223214285714 + +in[(i+0)*n+(j+-4)] * -0.00223214285714 + +in[(i+0)*n+(j+-3)] * -0.00416666666667 + +in[(i+0)*n+(j+-2)] * -0.0104166666667 + +in[(i+0)*n+(j+-1)] * -0.0625 + +in[(i+0)*n+(j+1)] * 0.0625 + +in[(i+0)*n+(j+2)] * 0.0104166666667 + +in[(i+0)*n+(j+3)] * 0.00416666666667 + +in[(i+0)*n+(j+4)] * 0.00223214285714 + +in[(i+1)*n+(j+-4)] * -0.00223214285714 + +in[(i+1)*n+(j+-3)] * -0.00416666666667 + +in[(i+1)*n+(j+-2)] * -0.0104166666667 + +in[(i+1)*n+(j+0)] * 0.0625 + +in[(i+1)*n+(j+1)] * 0.0625 + +in[(i+1)*n+(j+2)] * 0.0104166666667 + +in[(i+1)*n+(j+3)] * 0.00416666666667 + +in[(i+1)*n+(j+4)] * 0.00223214285714 + +in[(i+2)*n+(j+-4)] * -0.00223214285714 + +in[(i+2)*n+(j+-3)] * -0.00416666666667 + +in[(i+2)*n+(j+-1)] * 0.0104166666667 + +in[(i+2)*n+(j+0)] * 0.0104166666667 + +in[(i+2)*n+(j+1)] * 0.0104166666667 + +in[(i+2)*n+(j+2)] * 0.03125 + +in[(i+2)*n+(j+3)] * 0.00416666666667 + +in[(i+2)*n+(j+4)] * 0.00223214285714 + +in[(i+3)*n+(j+-4)] * -0.00223214285714 + +in[(i+3)*n+(j+-2)] * 0.00416666666667 + +in[(i+3)*n+(j+-1)] * 0.00416666666667 + +in[(i+3)*n+(j+0)] * 0.00416666666667 + +in[(i+3)*n+(j+1)] * 0.00416666666667 + +in[(i+3)*n+(j+2)] * 0.00416666666667 + 
+in[(i+3)*n+(j+3)] * 0.0208333333333 + +in[(i+3)*n+(j+4)] * 0.00223214285714 + +in[(i+4)*n+(j+-3)] * 0.00223214285714 + +in[(i+4)*n+(j+-2)] * 0.00223214285714 + +in[(i+4)*n+(j+-1)] * 0.00223214285714 + +in[(i+4)*n+(j+0)] * 0.00223214285714 + +in[(i+4)*n+(j+1)] * 0.00223214285714 + +in[(i+4)*n+(j+2)] * 0.00223214285714 + +in[(i+4)*n+(j+3)] * 0.00223214285714 + +in[(i+4)*n+(j+4)] * 0.015625 + ; + } +} + +__global__ void grid5(const int n, const prk_float * in, prk_float * out) { + const int i = blockIdx.x * blockDim.x + threadIdx.x; + const int j = blockIdx.y * blockDim.y + threadIdx.y; + if ( (5 <= i) && (i < n-5) && (5 <= j) && (j < n-5) ) { + out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01 + +in[(i+-5)*n+(j+-4)] * -0.00111111111111 + +in[(i+-5)*n+(j+-3)] * -0.00111111111111 + +in[(i+-5)*n+(j+-2)] * -0.00111111111111 + +in[(i+-5)*n+(j+-1)] * -0.00111111111111 + +in[(i+-5)*n+(j+0)] * -0.00111111111111 + +in[(i+-5)*n+(j+1)] * -0.00111111111111 + +in[(i+-5)*n+(j+2)] * -0.00111111111111 + +in[(i+-5)*n+(j+3)] * -0.00111111111111 + +in[(i+-5)*n+(j+4)] * -0.00111111111111 + +in[(i+-4)*n+(j+-5)] * -0.00111111111111 + +in[(i+-4)*n+(j+-4)] * -0.0125 + +in[(i+-4)*n+(j+-3)] * -0.00178571428571 + +in[(i+-4)*n+(j+-2)] * -0.00178571428571 + +in[(i+-4)*n+(j+-1)] * -0.00178571428571 + +in[(i+-4)*n+(j+0)] * -0.00178571428571 + +in[(i+-4)*n+(j+1)] * -0.00178571428571 + +in[(i+-4)*n+(j+2)] * -0.00178571428571 + +in[(i+-4)*n+(j+3)] * -0.00178571428571 + +in[(i+-4)*n+(j+5)] * 0.00111111111111 + +in[(i+-3)*n+(j+-5)] * -0.00111111111111 + +in[(i+-3)*n+(j+-4)] * -0.00178571428571 + +in[(i+-3)*n+(j+-3)] * -0.0166666666667 + +in[(i+-3)*n+(j+-2)] * -0.00333333333333 + +in[(i+-3)*n+(j+-1)] * -0.00333333333333 + +in[(i+-3)*n+(j+0)] * -0.00333333333333 + +in[(i+-3)*n+(j+1)] * -0.00333333333333 + +in[(i+-3)*n+(j+2)] * -0.00333333333333 + +in[(i+-3)*n+(j+4)] * 0.00178571428571 + +in[(i+-3)*n+(j+5)] * 0.00111111111111 + +in[(i+-2)*n+(j+-5)] * -0.00111111111111 + +in[(i+-2)*n+(j+-4)] * -0.00178571428571 
+ +in[(i+-2)*n+(j+-3)] * -0.00333333333333 + +in[(i+-2)*n+(j+-2)] * -0.025 + +in[(i+-2)*n+(j+-1)] * -0.00833333333333 + +in[(i+-2)*n+(j+0)] * -0.00833333333333 + +in[(i+-2)*n+(j+1)] * -0.00833333333333 + +in[(i+-2)*n+(j+3)] * 0.00333333333333 + +in[(i+-2)*n+(j+4)] * 0.00178571428571 + +in[(i+-2)*n+(j+5)] * 0.00111111111111 + +in[(i+-1)*n+(j+-5)] * -0.00111111111111 + +in[(i+-1)*n+(j+-4)] * -0.00178571428571 + +in[(i+-1)*n+(j+-3)] * -0.00333333333333 + +in[(i+-1)*n+(j+-2)] * -0.00833333333333 + +in[(i+-1)*n+(j+-1)] * -0.05 + +in[(i+-1)*n+(j+0)] * -0.05 + +in[(i+-1)*n+(j+2)] * 0.00833333333333 + +in[(i+-1)*n+(j+3)] * 0.00333333333333 + +in[(i+-1)*n+(j+4)] * 0.00178571428571 + +in[(i+-1)*n+(j+5)] * 0.00111111111111 + +in[(i+0)*n+(j+-5)] * -0.00111111111111 + +in[(i+0)*n+(j+-4)] * -0.00178571428571 + +in[(i+0)*n+(j+-3)] * -0.00333333333333 + +in[(i+0)*n+(j+-2)] * -0.00833333333333 + +in[(i+0)*n+(j+-1)] * -0.05 + +in[(i+0)*n+(j+1)] * 0.05 + +in[(i+0)*n+(j+2)] * 0.00833333333333 + +in[(i+0)*n+(j+3)] * 0.00333333333333 + +in[(i+0)*n+(j+4)] * 0.00178571428571 + +in[(i+0)*n+(j+5)] * 0.00111111111111 + +in[(i+1)*n+(j+-5)] * -0.00111111111111 + +in[(i+1)*n+(j+-4)] * -0.00178571428571 + +in[(i+1)*n+(j+-3)] * -0.00333333333333 + +in[(i+1)*n+(j+-2)] * -0.00833333333333 + +in[(i+1)*n+(j+0)] * 0.05 + +in[(i+1)*n+(j+1)] * 0.05 + +in[(i+1)*n+(j+2)] * 0.00833333333333 + +in[(i+1)*n+(j+3)] * 0.00333333333333 + +in[(i+1)*n+(j+4)] * 0.00178571428571 + +in[(i+1)*n+(j+5)] * 0.00111111111111 + +in[(i+2)*n+(j+-5)] * -0.00111111111111 + +in[(i+2)*n+(j+-4)] * -0.00178571428571 + +in[(i+2)*n+(j+-3)] * -0.00333333333333 + +in[(i+2)*n+(j+-1)] * 0.00833333333333 + +in[(i+2)*n+(j+0)] * 0.00833333333333 + +in[(i+2)*n+(j+1)] * 0.00833333333333 + +in[(i+2)*n+(j+2)] * 0.025 + +in[(i+2)*n+(j+3)] * 0.00333333333333 + +in[(i+2)*n+(j+4)] * 0.00178571428571 + +in[(i+2)*n+(j+5)] * 0.00111111111111 + +in[(i+3)*n+(j+-5)] * -0.00111111111111 + +in[(i+3)*n+(j+-4)] * -0.00178571428571 + +in[(i+3)*n+(j+-2)] * 
0.00333333333333 + +in[(i+3)*n+(j+-1)] * 0.00333333333333 + +in[(i+3)*n+(j+0)] * 0.00333333333333 + +in[(i+3)*n+(j+1)] * 0.00333333333333 + +in[(i+3)*n+(j+2)] * 0.00333333333333 + +in[(i+3)*n+(j+3)] * 0.0166666666667 + +in[(i+3)*n+(j+4)] * 0.00178571428571 + +in[(i+3)*n+(j+5)] * 0.00111111111111 + +in[(i+4)*n+(j+-5)] * -0.00111111111111 + +in[(i+4)*n+(j+-3)] * 0.00178571428571 + +in[(i+4)*n+(j+-2)] * 0.00178571428571 + +in[(i+4)*n+(j+-1)] * 0.00178571428571 + +in[(i+4)*n+(j+0)] * 0.00178571428571 + +in[(i+4)*n+(j+1)] * 0.00178571428571 + +in[(i+4)*n+(j+2)] * 0.00178571428571 + +in[(i+4)*n+(j+3)] * 0.00178571428571 + +in[(i+4)*n+(j+4)] * 0.0125 + +in[(i+4)*n+(j+5)] * 0.00111111111111 + +in[(i+5)*n+(j+-4)] * 0.00111111111111 + +in[(i+5)*n+(j+-3)] * 0.00111111111111 + +in[(i+5)*n+(j+-2)] * 0.00111111111111 + +in[(i+5)*n+(j+-1)] * 0.00111111111111 + +in[(i+5)*n+(j+0)] * 0.00111111111111 + +in[(i+5)*n+(j+1)] * 0.00111111111111 + +in[(i+5)*n+(j+2)] * 0.00111111111111 + +in[(i+5)*n+(j+3)] * 0.00111111111111 + +in[(i+5)*n+(j+4)] * 0.00111111111111 + +in[(i+5)*n+(j+5)] * 0.01 + ; + } +} + diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu index ddb22ca92..1efdab462 100644 --- a/Cxx11/transpose-cuda.cu +++ b/Cxx11/transpose-cuda.cu @@ -56,6 +56,8 @@ #include "prk_util.h" #include "prk_cuda.h" +#define TILED 0 + #if TILED // The kernel was derived from https://github.com/parallel-forall/code-samples/blob/master/series/cuda-cpp/transpose/transpose.cu, // which is the reason for the additional copyright noted above. 
@@ -100,7 +102,7 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// int iterations; - int order; + int order, tile_size; try { if (argc < 3) { throw "Usage: <# iterations> "; @@ -123,6 +125,14 @@ int main(int argc, char * argv[]) std::cout << "Sorry, but order (" << order << ") must be evenly divible by " << tile_dim << " or the results are going to be wrong.\n"; } +#else + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = order; + if (tile_size > order) tile_size = order; + } #endif #ifdef __CORIANDERCC__ // This has not been analyzed, but it is an empirical fact. @@ -136,15 +146,20 @@ int main(int argc, char * argv[]) return 1; } - std::cout << "Matrix order = " << order << std::endl; std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; +#if TILED + std::cout << "Tile size = " << tile_dim << std::endl; +#else + std::cout << "Tile size = " << tile_size << std::endl; +#endif #if TILED dim3 dimGrid(order/tile_dim, order/tile_dim, 1); dim3 dimBlock(tile_dim, block_rows, 1); #else - dim3 dimGrid(order, order, 1); - dim3 dimBlock(1, 1, 1); + dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); + dim3 dimBlock(tile_size, tile_size, 1); #endif info.checkDims(dimBlock, dimGrid); From eef54724574fdeca7737350fbdcdc3c6061a442d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 17 Mar 2018 07:30:03 -0700 Subject: [PATCH 053/245] minor fixes in stencil sycl (#319) --- Cxx11/stencil-sycl.cc | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index dba4a44af..24569821c 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -113,12 +113,12 @@ int main(int argc, char* argv[]) 
////////////////////////////////////////////////////////////////////// int iterations; - size_t n; - size_t radius = 2; + size_t n, tile_size; bool star = true; + size_t radius = 2; try { if (argc < 3) { - throw "Usage: <# iterations> "; + throw "Usage: <# iterations> [ ]"; } // number of times to run the algorithm @@ -135,7 +135,6 @@ int main(int argc, char* argv[]) throw "ERROR: grid dimension too large - overflow risk"; } -#if 0 // default tile size for tiling of local transpose tile_size = 32; if (argc > 3) { @@ -143,7 +142,6 @@ int main(int argc, char* argv[]) if (tile_size <= 0) tile_size = n; if (tile_size > n) tile_size = n; } -#endif // stencil pattern if (argc > 4) { @@ -181,18 +179,18 @@ int main(int argc, char* argv[]) case 4: stencil = star4; break; case 5: stencil = star5; break; } - } else { + } +#if 0 + else { switch (radius) { - //case 1: stencil = grid1; break; - //case 2: stencil = grid2; break; - //case 3: stencil = grid3; break; - //case 4: stencil = grid4; break; - //case 5: stencil = grid5; break; + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; } } - - // SYCL device queue - cl::sycl::queue q; +#endif ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation @@ -202,6 +200,8 @@ int main(int argc, char* argv[]) std::vector h_out(n*n,0.0); + // SYCL device queue + cl::sycl::queue q; { // initialize device buffers from host buffers cl::sycl::buffer d_in { cl::sycl::range<2> {n, n} }; @@ -224,7 +224,7 @@ int main(int argc, char* argv[]) q.wait(); for (auto iter = 0; iter<=iterations; iter++) { - + if (iter==1) stencil_time = prk::wtime(); stencil(q, n, d_in, d_out); @@ -234,7 +234,7 @@ int main(int argc, char* argv[]) // accessor methods auto in = d_in.get_access(h); auto out = d_out.get_access(h); - + // Add constant to solution to force refresh of neighbor data, if 
any h.parallel_for(cl::sycl::range<2> {n, n}, //cl::sycl::id<2> {0, 0}, [=] (cl::sycl::item<2> it) { From 892c85dd4f415563fe652f43c45d4fb9b1dd0a7f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 17 Mar 2018 17:20:23 -0700 Subject: [PATCH 054/245] Sycl stencil 1d (#320) support 1D and 2D indexing in SYCL codes Also fixed correctness issue in stencil-sycl that was only observed with triSYCL. Needed to synchronize between application of the stencil and the add() update. --- Cxx11/generate-sycl-stencil.py | 83 +++++++++---- Cxx11/stencil-sycl.cc | 85 +++++++------ Cxx11/stencil_sycl.hpp | 220 +++++++++++++++++++++++++-------- Cxx11/transpose-sycl.cc | 28 +++-- README.md | 2 + 5 files changed, 294 insertions(+), 124 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index bc049c892..e0c0cae1e 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -1,36 +1,74 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python import sys import fileinput import string import os -def codegen(src,pattern,stencil_size,radius,W,model): +def codegen(src,pattern,stencil_size,radius,W,model,dim): src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n,\n') - src.write(' cl::sycl::buffer d_in,\n') - src.write(' cl::sycl::buffer d_out) {\n') + if (dim==2): + src.write(' cl::sycl::buffer & d_in,\n') + src.write(' cl::sycl::buffer & d_out) {\n') + else: + src.write(' cl::sycl::buffer & d_in,\n') + src.write(' cl::sycl::buffer & d_out) {\n') src.write(' q.submit([&](cl::sycl::handler& h) {\n') src.write(' auto in = d_in.get_access(h);\n') src.write(' auto out = d_out.get_access(h);\n') - src.write(' h.parallel_for(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n') - src.write(' [=] (cl::sycl::item<2> it) {\n') - src.write(' cl::sycl::id<2> xy = it.get_id();\n') - for r in range(1,radius+1): - src.write(' cl::sycl::id<2> 
dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n') - src.write(' cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n') - src.write(' out[xy] += ') + if (dim==2): + src.write(' h.parallel_for(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n') + src.write(' [=] (cl::sycl::item<2> it) {\n') + src.write(' cl::sycl::id<2> xy = it.get_id();\n') + for r in range(1,radius+1): + src.write(' cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n') + src.write(' cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n') + src.write(' out[xy] += ') + else: + src.write(' h.parallel_for(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n') + src.write(' [=] (cl::sycl::item<2> it) {\n') + # 1D indexing the slow way + #src.write(' auto i = it[0];\n') + #src.write(' auto j = it[1];\n') + #src.write(' out[i*n+j] += ') + # 1D indexing the fast way + src.write(' out[it[0]*n+it[1]] += ') if pattern == 'star': for i in range(1,radius+1): - if i > 1: - src.write('\n') - src.write(19*' ') - src.write('+in[xy+dx'+str(i)+'] * '+str(+1./(2.*i*radius))) - src.write('\n'+19*' ') - src.write('+in[xy+dy'+str(i)+'] * '+str(+1./(2.*i*radius))) - src.write('\n'+19*' ') - src.write('+in[xy-dx'+str(i)+'] * '+str(-1./(2.*i*radius))) - src.write('\n'+19*' ') - src.write('+in[xy-dy'+str(i)+'] * '+str(-1./(2.*i*radius))) + if (dim==2): + if i > 1: + src.write('\n') + src.write(19*' ') + src.write('+in[xy+dx'+str(i)+'] * '+str(+1./(2.*i*radius))) + src.write('\n'+19*' ') + src.write('+in[xy-dx'+str(i)+'] * '+str(-1./(2.*i*radius))) + src.write('\n'+19*' ') + src.write('+in[xy+dy'+str(i)+'] * '+str(+1./(2.*i*radius))) + src.write('\n'+19*' ') + src.write('+in[xy-dy'+str(i)+'] * '+str(-1./(2.*i*radius))) + else: + # 1D indexing the slow way + #if i > 1: + # src.write('\n') + # src.write(22*' ') + #src.write('+in[i*n+(j+'+str(i)+')] * 
'+str(+1./(2.*i*radius))) + #src.write('\n'+22*' ') + #src.write('+in[i*n+(j-'+str(i)+')] * '+str(-1./(2.*i*radius))) + #src.write('\n'+22*' ') + #src.write('+in[(i+'+str(i)+')*n+j] * '+str(+1./(2.*i*radius))) + #src.write('\n'+22*' ') + #src.write('+in[(i-'+str(i)+')*n+j] * '+str(-1./(2.*i*radius))) + # 1D indexing the fast way + if i > 1: + src.write('\n') + src.write(30*' ') + src.write('+in[it[0]*n+(it[1]+'+str(i)+')] * '+str(+1./(2.*i*radius))) + src.write('\n'+30*' ') + src.write('+in[it[0]*n+(it[1]-'+str(i)+')] * '+str(-1./(2.*i*radius))) + src.write('\n'+30*' ') + src.write('+in[(it[0]+'+str(i)+')*n+it[1]] * '+str(+1./(2.*i*radius))) + src.write('\n'+30*' ') + src.write('+in[(it[0]-'+str(i)+')*n+it[1]] * '+str(-1./(2.*i*radius))) if i == radius: src.write(';\n') else: @@ -62,7 +100,8 @@ def instance(src,model,pattern,r): W[r+j][r+j] = +1./(4*j*r) W[r-j][r-j] = -1./(4*j*r) - codegen(src,pattern,stencil_size,r,W,model) + codegen(src,pattern,stencil_size,r,W,model,1) + codegen(src,pattern,stencil_size,r,W,model,2) def main(): for model in ['sycl']: diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 24569821c..c4bfa6ff8 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -60,10 +60,16 @@ /// ////////////////////////////////////////////////////////////////////// +#define USE_2D_INDEXING 0 + #include "prk_util.h" #include "stencil_sycl.hpp" -void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer d_in, cl::sycl::buffer d_out) +#if USE_2D_INDEXING +void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +#else +void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +#endif { std::cout << "You are trying to use a stencil that does not exist.\n"; std::cout << "Please generate the new stencil using the code generator\n"; @@ -71,38 +77,6 @@ void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer d_ std::abort(); } -#if 0 -void 
star2(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer d_in, - cl::sycl::buffer d_out) -{ - q.submit([&](cl::sycl::handler& h) { - - // accessor methods - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); - - // Apply the stencil operator - h.parallel_for(cl::sycl::range<2> {n-4, n-4}, cl::sycl::id<2> {2, 2}, - [=] (cl::sycl::item<2> it) { - cl::sycl::id<2> xy = it.get_id(); - cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); - cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); - out[xy] += +in[xy-dx1] * -0.25 - +in[xy+dx1] * 0.25 - +in[xy-dy1] * -0.25 - +in[xy+dy1] * 0.25 - +in[xy-dx2] * -0.125 - +in[xy+dx2] * 0.125 - +in[xy-dy2] * -0.125 - +in[xy+dy2] * 0.125; - }); - }); -} -#endif - int main(int argc, char* argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; @@ -198,27 +172,39 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; + std::vector h_in(n*n,0.0); std::vector h_out(n*n,0.0); // SYCL device queue cl::sycl::queue q; { // initialize device buffers from host buffers +#if USE_2D_INDEXING cl::sycl::buffer d_in { cl::sycl::range<2> {n, n} }; cl::sycl::buffer d_out { h_out.data(), cl::sycl::range<2> {n, n} }; +#else + // FIXME: if I don't initialize this buffer from host, the results are wrong. Why? 
+ //cl::sycl::buffer d_in { cl::sycl::range<1> {n*n} }; + cl::sycl::buffer d_in { h_in.data(), h_in.size() }; + cl::sycl::buffer d_out { h_out.data(), h_out.size() }; +#endif q.submit([&](cl::sycl::handler& h) { // accessor methods auto in = d_in.get_access(h); - // Add constant to solution to force refresh of neighbor data, if any - h.parallel_for(cl::sycl::range<2> {n, n}, //cl::sycl::id<2> {0, 0}, - [=] (cl::sycl::item<2> it) { + h.parallel_for(cl::sycl::range<2> {n, n}, [=] (cl::sycl::item<2> it) { +#if USE_2D_INDEXING cl::sycl::id<2> xy = it.get_id(); - auto i = xy[0]; - auto j = xy[1]; + auto i = it[0]; + auto j = it[1]; in[xy] = static_cast(i+j); +#else + auto i = it[0]; + auto j = it[1]; + in[i*n+j] = static_cast(i+j); +#endif }); }); q.wait(); @@ -228,18 +214,29 @@ int main(int argc, char* argv[]) if (iter==1) stencil_time = prk::wtime(); stencil(q, n, d_in, d_out); + // This is only necessary with triSYCL + q.wait(); q.submit([&](cl::sycl::handler& h) { // accessor methods auto in = d_in.get_access(h); - auto out = d_out.get_access(h); // Add constant to solution to force refresh of neighbor data, if any - h.parallel_for(cl::sycl::range<2> {n, n}, //cl::sycl::id<2> {0, 0}, + h.parallel_for(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0}, [=] (cl::sycl::item<2> it) { +#if USE_2D_INDEXING cl::sycl::id<2> xy = it.get_id(); in[xy] += 1.0; +#else +#if 0 // This is noticeably slower :-( + auto i = it[0]; + auto j = it[1]; + in[i*n+j] += 1.0; +#else + in[it[0]*n+it[1]] += 1.0; +#endif +#endif }); }); q.wait(); @@ -247,6 +244,14 @@ int main(int argc, char* argv[]) stencil_time = prk::wtime() - stencil_time; } +#if 0 + for (auto i=0; i d_in, - cl::sycl::buffer d_out) { + cl::sycl::buffer & d_in, + cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); + auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1}, + 
h.parallel_for(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1}, + [=] (cl::sycl::item<2> it) { + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.5 + +in[it[0]*n+(it[1]-1)] * -0.5 + +in[(it[0]+1)*n+it[1]] * 0.5 + +in[(it[0]-1)*n+it[1]] * -0.5; + }); + }); +} + +void star1(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer & d_in, + cl::sycl::buffer & d_out) { + q.submit([&](cl::sycl::handler& h) { + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + h.parallel_for(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); out[xy] += +in[xy+dx1] * 0.5 - +in[xy+dy1] * 0.5 +in[xy-dx1] * -0.5 + +in[xy+dy1] * 0.5 +in[xy-dy1] * -0.5; }); }); } void star2(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer d_in, - cl::sycl::buffer d_out) { + cl::sycl::buffer & d_in, + cl::sycl::buffer & d_out) { + q.submit([&](cl::sycl::handler& h) { + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + h.parallel_for(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2}, + [=] (cl::sycl::item<2> it) { + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.25 + +in[it[0]*n+(it[1]-1)] * -0.25 + +in[(it[0]+1)*n+it[1]] * 0.25 + +in[(it[0]-1)*n+it[1]] * -0.25 + +in[it[0]*n+(it[1]+2)] * 0.125 + +in[it[0]*n+(it[1]-2)] * -0.125 + +in[(it[0]+2)*n+it[1]] * 0.125 + +in[(it[0]-2)*n+it[1]] * -0.125; + }); + }); +} + +void star2(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer & d_in, + cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); + auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2}, + h.parallel_for(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); 
@@ -31,24 +67,48 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); out[xy] += +in[xy+dx1] * 0.25 - +in[xy+dy1] * 0.25 +in[xy-dx1] * -0.25 + +in[xy+dy1] * 0.25 +in[xy-dy1] * -0.25 +in[xy+dx2] * 0.125 - +in[xy+dy2] * 0.125 +in[xy-dx2] * -0.125 + +in[xy+dy2] * 0.125 +in[xy-dy2] * -0.125; }); }); } void star3(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer d_in, - cl::sycl::buffer d_out) { + cl::sycl::buffer & d_in, + cl::sycl::buffer & d_out) { + q.submit([&](cl::sycl::handler& h) { + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + h.parallel_for(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3}, + [=] (cl::sycl::item<2> it) { + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.166666666667 + +in[it[0]*n+(it[1]-1)] * -0.166666666667 + +in[(it[0]+1)*n+it[1]] * 0.166666666667 + +in[(it[0]-1)*n+it[1]] * -0.166666666667 + +in[it[0]*n+(it[1]+2)] * 0.0833333333333 + +in[it[0]*n+(it[1]-2)] * -0.0833333333333 + +in[(it[0]+2)*n+it[1]] * 0.0833333333333 + +in[(it[0]-2)*n+it[1]] * -0.0833333333333 + +in[it[0]*n+(it[1]+3)] * 0.0555555555556 + +in[it[0]*n+(it[1]-3)] * -0.0555555555556 + +in[(it[0]+3)*n+it[1]] * 0.0555555555556 + +in[(it[0]-3)*n+it[1]] * -0.0555555555556; + }); + }); +} + +void star3(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer & d_in, + cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); + auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3}, + h.parallel_for(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); @@ -57,29 +117,57 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); cl::sycl::id<2> 
dy3(cl::sycl::range<2> {0,3}); - out[xy] += +in[xy+dx1] * 0.16666666666666666 - +in[xy+dy1] * 0.16666666666666666 - +in[xy-dx1] * -0.16666666666666666 - +in[xy-dy1] * -0.16666666666666666 - +in[xy+dx2] * 0.08333333333333333 - +in[xy+dy2] * 0.08333333333333333 - +in[xy-dx2] * -0.08333333333333333 - +in[xy-dy2] * -0.08333333333333333 - +in[xy+dx3] * 0.05555555555555555 - +in[xy+dy3] * 0.05555555555555555 - +in[xy-dx3] * -0.05555555555555555 - +in[xy-dy3] * -0.05555555555555555; + out[xy] += +in[xy+dx1] * 0.166666666667 + +in[xy-dx1] * -0.166666666667 + +in[xy+dy1] * 0.166666666667 + +in[xy-dy1] * -0.166666666667 + +in[xy+dx2] * 0.0833333333333 + +in[xy-dx2] * -0.0833333333333 + +in[xy+dy2] * 0.0833333333333 + +in[xy-dy2] * -0.0833333333333 + +in[xy+dx3] * 0.0555555555556 + +in[xy-dx3] * -0.0555555555556 + +in[xy+dy3] * 0.0555555555556 + +in[xy-dy3] * -0.0555555555556; }); }); } void star4(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer d_in, - cl::sycl::buffer d_out) { + cl::sycl::buffer & d_in, + cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); + auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4}, + h.parallel_for(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4}, + [=] (cl::sycl::item<2> it) { + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.125 + +in[it[0]*n+(it[1]-1)] * -0.125 + +in[(it[0]+1)*n+it[1]] * 0.125 + +in[(it[0]-1)*n+it[1]] * -0.125 + +in[it[0]*n+(it[1]+2)] * 0.0625 + +in[it[0]*n+(it[1]-2)] * -0.0625 + +in[(it[0]+2)*n+it[1]] * 0.0625 + +in[(it[0]-2)*n+it[1]] * -0.0625 + +in[it[0]*n+(it[1]+3)] * 0.0416666666667 + +in[it[0]*n+(it[1]-3)] * -0.0416666666667 + +in[(it[0]+3)*n+it[1]] * 0.0416666666667 + +in[(it[0]-3)*n+it[1]] * -0.0416666666667 + +in[it[0]*n+(it[1]+4)] * 0.03125 + +in[it[0]*n+(it[1]-4)] * -0.03125 + +in[(it[0]+4)*n+it[1]] * 0.03125 + +in[(it[0]-4)*n+it[1]] * -0.03125; + }); + }); +} + +void 
star4(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer & d_in, + cl::sycl::buffer & d_out) { + q.submit([&](cl::sycl::handler& h) { + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + h.parallel_for(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); @@ -91,32 +179,64 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); out[xy] += +in[xy+dx1] * 0.125 - +in[xy+dy1] * 0.125 +in[xy-dx1] * -0.125 + +in[xy+dy1] * 0.125 +in[xy-dy1] * -0.125 +in[xy+dx2] * 0.0625 - +in[xy+dy2] * 0.0625 +in[xy-dx2] * -0.0625 + +in[xy+dy2] * 0.0625 +in[xy-dy2] * -0.0625 - +in[xy+dx3] * 0.041666666666666664 - +in[xy+dy3] * 0.041666666666666664 - +in[xy-dx3] * -0.041666666666666664 - +in[xy-dy3] * -0.041666666666666664 + +in[xy+dx3] * 0.0416666666667 + +in[xy-dx3] * -0.0416666666667 + +in[xy+dy3] * 0.0416666666667 + +in[xy-dy3] * -0.0416666666667 +in[xy+dx4] * 0.03125 - +in[xy+dy4] * 0.03125 +in[xy-dx4] * -0.03125 + +in[xy+dy4] * 0.03125 +in[xy-dy4] * -0.03125; }); }); } void star5(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer d_in, - cl::sycl::buffer d_out) { + cl::sycl::buffer & d_in, + cl::sycl::buffer & d_out) { + q.submit([&](cl::sycl::handler& h) { + auto in = d_in.get_access(h); + auto out = d_out.get_access(h); + h.parallel_for(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5}, + [=] (cl::sycl::item<2> it) { + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.1 + +in[it[0]*n+(it[1]-1)] * -0.1 + +in[(it[0]+1)*n+it[1]] * 0.1 + +in[(it[0]-1)*n+it[1]] * -0.1 + +in[it[0]*n+(it[1]+2)] * 0.05 + +in[it[0]*n+(it[1]-2)] * -0.05 + +in[(it[0]+2)*n+it[1]] * 0.05 + +in[(it[0]-2)*n+it[1]] * -0.05 + +in[it[0]*n+(it[1]+3)] * 0.0333333333333 + +in[it[0]*n+(it[1]-3)] * -0.0333333333333 + +in[(it[0]+3)*n+it[1]] * 0.0333333333333 + +in[(it[0]-3)*n+it[1]] * 
-0.0333333333333 + +in[it[0]*n+(it[1]+4)] * 0.025 + +in[it[0]*n+(it[1]-4)] * -0.025 + +in[(it[0]+4)*n+it[1]] * 0.025 + +in[(it[0]-4)*n+it[1]] * -0.025 + +in[it[0]*n+(it[1]+5)] * 0.02 + +in[it[0]*n+(it[1]-5)] * -0.02 + +in[(it[0]+5)*n+it[1]] * 0.02 + +in[(it[0]-5)*n+it[1]] * -0.02; + }); + }); +} + +void star5(cl::sycl::queue & q, const size_t n, + cl::sycl::buffer & d_in, + cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); + auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5}, + h.parallel_for(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); @@ -130,24 +250,24 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0}); cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5}); out[xy] += +in[xy+dx1] * 0.1 - +in[xy+dy1] * 0.1 +in[xy-dx1] * -0.1 + +in[xy+dy1] * 0.1 +in[xy-dy1] * -0.1 +in[xy+dx2] * 0.05 - +in[xy+dy2] * 0.05 +in[xy-dx2] * -0.05 + +in[xy+dy2] * 0.05 +in[xy-dy2] * -0.05 - +in[xy+dx3] * 0.03333333333333333 - +in[xy+dy3] * 0.03333333333333333 - +in[xy-dx3] * -0.03333333333333333 - +in[xy-dy3] * -0.03333333333333333 + +in[xy+dx3] * 0.0333333333333 + +in[xy-dx3] * -0.0333333333333 + +in[xy+dy3] * 0.0333333333333 + +in[xy-dy3] * -0.0333333333333 +in[xy+dx4] * 0.025 - +in[xy+dy4] * 0.025 +in[xy-dx4] * -0.025 + +in[xy+dy4] * 0.025 +in[xy-dy4] * -0.025 +in[xy+dx5] * 0.02 - +in[xy+dy5] * 0.02 +in[xy-dx5] * -0.02 + +in[xy+dy5] * 0.02 +in[xy-dy5] * -0.02; }); }); diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index cbbc1a2a1..e75897e77 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -51,6 +51,8 @@ #include "prk_util.h" +#define USE_2D_INDEXING 1 + int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; 
@@ -89,37 +91,34 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; - // SYCL device queue - cl::sycl::queue q; - ////////////////////////////////////////////////////////////////////// /// Allocate space for the input and transpose matrix ////////////////////////////////////////////////////////////////////// + auto trans_time = 0.0; + std::vector h_A(order*order); std::vector h_B(order*order,0.0); // fill A with the sequence 0 to order^2-1 as doubles std::iota(h_A.begin(), h_A.end(), 0.0); - auto range = boost::irange(static_cast(0),order); - - auto trans_time = 0.0; - + // SYCL device queue + cl::sycl::queue q; { // initialize device buffers from host buffers #if USE_2D_INDEXING - cl::sycl::buffer d_A( cl::sycl::range<2>{order,order} ); // FIXME: does not initialize with host array - cl::sycl::buffer d_B( cl::sycl::range<2>{order,order} ); // FIXME: does not initialize with host array + cl::sycl::buffer d_A( h_A.data(), cl::sycl::range<2>{order,order} ); + cl::sycl::buffer d_B( h_B.data(), cl::sycl::range<2>{order,order} ); #else cl::sycl::buffer d_A { h_A.data(), h_A.size() }; cl::sycl::buffer d_B { h_B.data(), h_B.size() }; #endif for (auto iter = 0; iter<=iterations; iter++) { - + if (iter==1) trans_time = prk::wtime(); - + q.submit([&](cl::sycl::handler& h) { // accessor methods @@ -129,7 +128,10 @@ int main(int argc, char * argv[]) // transpose h.parallel_for(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { #if USE_2D_INDEXING -#error 2D indexing is not implemented yet. Fix this! 
+ cl::sycl::id<2> ij{it[0],it[1]}; + cl::sycl::id<2> ji{it[1],it[0]}; + B[ij] += A[ji]; + A[ji] += 1.0; #else B[it[0] * order + it[1]] += A[it[1] * order + it[0]]; A[it[1] * order + it[0]] += 1.0; @@ -149,6 +151,8 @@ int main(int argc, char * argv[]) /// Analyze and output results ////////////////////////////////////////////////////////////////////// + auto range = boost::irange(static_cast(0),order); + // TODO: replace with std::generate, std::accumulate, or similar const auto addit = (iterations+1.) * (iterations/2.); auto abserr = 0.0; diff --git a/README.md b/README.md index fb756df1d..9360b1d35 100644 --- a/README.md +++ b/README.md @@ -79,6 +79,8 @@ y = yes i = in-progress, incomplete, or incorrect +f = see footnotes + | Parallelism | p2p | stencil | transpose | nstream | sparse | dgemm | |----------------------|-----|---------|-----------|---------|--------|-------| | None | y | y | y | y | y | y | From 4e3010ba88b673593a58230a13c63ef22f543664 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 18 Mar 2018 15:18:05 -0700 Subject: [PATCH 055/245] Boost compute (#316) add Boost.Compute (just nstream for now) * valarray does work because of https://github.com/boostorg/compute/issues/758. i may fix that in Boost.Compute some day. 
* add Boost.Compute to Travis (Mac-only) * split Boost headers since not available in Travis Linux --- .gitignore | 2 + Cxx11/Makefile | 17 +- Cxx11/nstream-valarray-boost-compute.cc | 184 +++++++++++++++++++ Cxx11/nstream-vector-boost-compute.cc | 230 ++++++++++++++++++++++++ Cxx11/prk_util.h | 9 +- common/make.defs.cray | 2 +- common/make.defs.gcc | 2 +- common/make.defs.intel | 2 +- common/make.defs.llvm | 2 +- travis/build-run-prk.sh | 49 +++-- travis/install-boost.sh | 6 +- 11 files changed, 478 insertions(+), 27 deletions(-) create mode 100644 Cxx11/nstream-valarray-boost-compute.cc create mode 100644 Cxx11/nstream-vector-boost-compute.cc diff --git a/.gitignore b/.gitignore index 950d52c2f..96503acd3 100644 --- a/.gitignore +++ b/.gitignore @@ -129,6 +129,8 @@ Cxx11/nstream-vector-rangefor Cxx11/nstream-vector-stl Cxx11/nstream-vector-taskloop Cxx11/nstream-vector-tbb +Cxx11/nstream-valarray-boost-compute +Cxx11/nstream-vector-boost-compute Cxx11/sparse-vector Cxx11/stencil-opencl Cxx11/stencil-openmp-target diff --git a/Cxx11/Makefile b/Cxx11/Makefile index f6fed48ca..e0cbe0b6d 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -50,7 +50,8 @@ ifdef OCCADIR endif OCCAFLAGS = -DUSE_OCCA -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca -.PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl rangefor kokkos raja cuda +.PHONY: all clean vector valarray openmp target opencl taskloop tbb stl pstl \ + rangefor kokkos raja cuda cublas sycl boost-compute EXTRA= ifeq ($(shell uname -s),Darwin) @@ -61,7 +62,7 @@ else EXTRA += target endif -all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa $(EXTRA) +all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa boost-compute $(EXTRA) p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb 
p2p-innerloop-opencl @@ -81,7 +82,7 @@ nstream: nstream-valarray nstream-vector nstream-vector-openmp nstream-openmp-ta dgemm: dgemm-vector dgemm-cblas -vector: p2p-vector p2p-innerloop-vector stencil-vector transpose-vector nstream-vector sparse-vector dgemm-vector \ +vector: p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector sparse-vector dgemm-vector \ transpose-vector-async transpose-vector-thread valarray: transpose-valarray nstream-valarray @@ -114,7 +115,11 @@ cublas: transpose-cublas nstream-cublas occa: transpose-occa nstream-occa -p2p-innerloop-vector: p2p-hyperplane-vector-openmp.cc prk_util.h +boost-compute: nstream-vector-boost-compute +# busted +#nstream-valarray-boost-compute + +p2p-hyperplane-vector: p2p-hyperplane-vector-openmp.cc prk_util.h $(CXX) $(CXXFLAGS) $< -o $@ transpose-opencl: transpose-opencl.cc transpose.cl prk_util.h prk_opencl.h @@ -151,6 +156,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h %-rangefor: %-rangefor.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -o $@ +%-boost-compute: %-boost-compute.cc prk_util.h + $(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) $(OPENCLFLAGS) -o $@ + %-raja: %-raja.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ @@ -204,6 +212,7 @@ clean: -rm -f *-cublas -rm -f *-cblas -rm -f *-occa + -rm -f *-boost-compute -rm -f transpose-vector-async transpose-vector-thread cleancl: diff --git a/Cxx11/nstream-valarray-boost-compute.cc b/Cxx11/nstream-valarray-boost-compute.cc new file mode 100644 index 000000000..50c54846f --- /dev/null +++ b/Cxx11/nstream-valarray-boost-compute.cc @@ -0,0 +1,184 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
+/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. 
+/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +namespace compute = boost::compute; + +using boost::compute::_1; + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/Boost.Compute STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + //compute::compute::device device = compute::compute::system::default_device(); + auto device = compute::system::default_device(); + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + std::cout << "Boost.Compute device = " << device.name() << std::endl; + + compute::context context(device); + compute::command_queue queue(context, device); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto nstream_time = 0.0; + + std::vector h_A; + h_A.resize(length,0.0f); + + const float scalar(3); + + { + compute::valarray d_A(0.0f, length); + compute::valarray d_B(2.0f, length); + compute::valarray d_C(2.0f, length); + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) nstream_time = prk::wtime(); + + d_A += d_B + scalar * d_C; + } + + nstream_time = prk::wtime() - nstream_time; + + compute::copy(std::begin(d_A), std::end(d_A), h_A.begin()); + queue.finish(); + } + compute::system::finish(); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (auto i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution 
validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(float); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/nstream-vector-boost-compute.cc b/Cxx11/nstream-vector-boost-compute.cc new file mode 100644 index 000000000..de95b37a7 --- /dev/null +++ b/Cxx11/nstream-vector-boost-compute.cc @@ -0,0 +1,230 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +namespace compute = boost::compute; + +using boost::compute::_1; + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/Boost.Compute STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + //compute::compute::device device = compute::compute::system::default_device(); + auto device = compute::system::default_device(); + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + std::cout << "Boost.Compute device = " << device.name() << std::endl; + + compute::context context(device); + compute::command_queue queue(context, device); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto nstream_time = 0.0; + + std::vector h_A; + h_A.resize(length); + + const float scalar(3); + + { + compute::vector d_A(length, context); + compute::vector d_B(length, context); + compute::vector d_C(length, context); + + 
compute::fill(d_A.begin(), d_A.end(), 0, queue); + compute::fill(d_B.begin(), d_B.end(), 2, queue); + compute::fill(d_C.begin(), d_C.end(), 2, queue); + queue.finish(); + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) nstream_time = prk::wtime(); + +#if STUPID_HACK_IMPLEMENTATION + compute::transform(d_B.begin(), d_B.end(), d_A.begin(), d_A.begin(), compute::plus(), queue); + compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus(), queue); + compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus(), queue); + compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus(), queue); +#else + +#if LAMBDA_MAKE_TUPLE + // Aout and Ain are necessary because A += .. does not work + auto Aout = compute::lambda::get<0>(boost::compute::_1); + auto Ain = compute::lambda::get<1>(boost::compute::_1); + auto B = compute::lambda::get<2>(boost::compute::_1); + auto C = compute::lambda::get<3>(boost::compute::_1); +#endif + + compute::for_each( + compute::make_zip_iterator( + boost::make_tuple( + d_A.begin(), d_A.begin(), d_B.begin(), d_C.begin() + ) + ), + compute::make_zip_iterator( + boost::make_tuple( + d_A.end(), d_A.end(), d_B.end(), d_C.end() + ) + ), +#if LAMBDA_MAKE_TUPLE + // += does not work here + compute::lambda::make_tuple( + Aout = Ain + B + scalar * C + ), +#else + // += does not work here + compute::lambda::get<0>(_1) = compute::lambda::get<1>(_1) + + compute::lambda::get<2>(_1) + + compute::lambda::get<3>(_1) * scalar, +#endif + queue + ); +#endif + + queue.finish(); + } + + nstream_time = prk::wtime() - nstream_time; + + compute::copy(d_A.begin(), d_A.end(), h_A.begin(), queue); + queue.finish(); + } + compute::system::finish(); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (auto 
i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(float); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 1c97f9ba9..0109ba684 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -180,8 +180,13 @@ const T prk_reduce(I first, I last, T init) { # endif #endif -#ifdef USE_BOOST -# include +#if defined(USE_BOOST) +# include "boost/range/irange.hpp" +#endif + +#if defined(USE_BOOST_COMPUTE) +# include "boost/compute.hpp" +# include "boost/compute/container/valarray.hpp" #endif #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1800) diff --git a/common/make.defs.cray b/common/make.defs.cray index 793bcd656..aee737d77 100644 --- a/common/make.defs.cray +++ b/common/make.defs.cray @@ -25,7 +25,7 @@ ORNLACCFLAG=-h acc # Parallel STL, Boost, etc. # # NERSC: "module load boost" -BOOSTFLAG=-DUSE_BOOST -I$${BOOST_DIR}/include +BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I$${BOOST_DIR}/include # # MPI # diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 074a1b696..36732883f 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -70,7 +70,7 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. 
# -BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include +BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include KOKKOSDIR=/opt/kokkos/gcc KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} diff --git a/common/make.defs.intel b/common/make.defs.intel index 49beeb6fa..7ecd87ead 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -66,7 +66,7 @@ TBBFLAG=-DUSE_TBB -tbb # # Parallel STL, Boost, etc. # -BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include +BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} KOKKOSDIR=/opt/kokkos/intel KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl diff --git a/common/make.defs.llvm b/common/make.defs.llvm index c67e8b7db..817f9da7d 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -84,7 +84,7 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. 
# -BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include +BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 3dbe3cfe1..25bd9feb5 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -322,11 +322,12 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/nstream-valarray 10 16777216 32 # C++11 without external parallelism - make -C $PRK_TARGET_PATH p2p-vector p2p-innerloop-vector stencil-vector transpose-vector nstream-vector \ + make -C $PRK_TARGET_PATH p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector \ dgemm-vector sparse-vector $PRK_TARGET_PATH/p2p-vector 10 1024 1024 $PRK_TARGET_PATH/p2p-vector 10 1024 1024 100 100 - $PRK_TARGET_PATH/p2p-innerloop-vector 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-vector 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-vector 10 1024 64 $PRK_TARGET_PATH/stencil-vector 10 1000 $PRK_TARGET_PATH/transpose-vector 10 1024 32 $PRK_TARGET_PATH/nstream-vector 10 16777216 32 @@ -352,19 +353,6 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/transpose-vector-thread 10 1024 512 32 $PRK_TARGET_PATH/transpose-vector-async 10 1024 512 32 - # C++11 with rangefor - echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs - make -C $PRK_TARGET_PATH rangefor - $PRK_TARGET_PATH/stencil-vector-rangefor 10 1000 - $PRK_TARGET_PATH/transpose-vector-rangefor 10 1024 32 - $PRK_TARGET_PATH/nstream-vector-rangefor 10 16777216 32 - #echo "Test stencil code generator" - for s in star grid ; do - for r in 1 2 3 4 5 ; do - $PRK_TARGET_PATH/stencil-vector-rangefor 10 200 20 $s $r - done - done - # C++11 with OpenMP export OMP_NUM_THREADS=2 case "$CC" in @@ -448,6 +436,25 @@ case "$PRK_TARGET" in ;; esac + # Boost.Compute found after OpenCL, and 
only available in Travis with MacOS. + if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + echo "BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE" >> common/make.defs + else + echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs + fi + + # C++11 with rangefor and Boost.Ranges + make -C $PRK_TARGET_PATH rangefor + $PRK_TARGET_PATH/stencil-vector-rangefor 10 1000 + $PRK_TARGET_PATH/transpose-vector-rangefor 10 1024 32 + $PRK_TARGET_PATH/nstream-vector-rangefor 10 16777216 32 + #echo "Test stencil code generator" + for s in star grid ; do + for r in 1 2 3 4 5 ; do + $PRK_TARGET_PATH/stencil-vector-rangefor 10 200 20 $s $r + done + done + # C++11 with TBB # Skip Clang because older Clang from Linux chokes on max_align_t (https://travis-ci.org/jeffhammond/PRK/jobs/243395307) if [ "${CC}" = "gcc" ] || [ "${TRAVIS_OS_NAME}" = "osx" ] ; then @@ -529,6 +536,18 @@ case "$PRK_TARGET" in cd .. fi + # Boost.Compute moved after OpenCL to reuse those flags... + + # C++11 with Boost.Compute + # Only test Mac because: + # (1) We only test OpenCL on MacOS in Travis. + # (2) Boost.Compute is not available from APT. + # If we ever address 1, we need to enable the Boost.Compute install for Linux. + if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + make -C $PRK_TARGET_PATH nstream-vector-boost-compute + $PRK_TARGET_PATH/nstream-vector-boost-compute 10 16777216 32 + fi + # C++11 with Kokkos, RAJA case "$CC" in gcc) diff --git a/travis/install-boost.sh b/travis/install-boost.sh index 578e2f9fc..4070fc414 100755 --- a/travis/install-boost.sh +++ b/travis/install-boost.sh @@ -8,12 +8,14 @@ TRAVIS_ROOT="$1" case "$os" in Darwin) - echo "Mac" brew update brew install boost || brew upgrade boost || true ;; Linux) - echo "Linux" + # We do not test Boost.Compute on Linux because of OpenCL issues... 
+ # Boost.Compute is a header-only library + #git clone --depth 1 https://github.com/kylelutz/compute.git ${TRAVIS_ROOT}/compute + #git clone --depth 1 https://github.com/boostorg/compute.git ${TRAVIS_ROOT}/compute ;; esac From 44da14ae9f4822b0b0cc40b5fc444f1280a24866 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 18 Mar 2018 15:24:30 -0700 Subject: [PATCH 056/245] ignore new name [ci skip] --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 96503acd3..fb475624f 100644 --- a/.gitignore +++ b/.gitignore @@ -116,6 +116,7 @@ Cxx11/p2p-innerloop-openmp Cxx11/p2p-doacross-vector-openmp Cxx11/p2p-innerloop-opencl Cxx11/p2p-innerloop-vector +Cxx11/p2p-hyperplane-vector Cxx11/p2p-hyperplane-vector-openmp Cxx11/p2p-innerloop-vector-tbb Cxx11/nstream-kokkos From ff02ca0d848d2c1ee7cc277b06d346414cff0c45 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 19 Mar 2018 08:51:51 -0700 Subject: [PATCH 057/245] hoist code, use more compact syntax, etc (#322) hoist code, use more compact syntax, etc * fix compiler error with older gcc/clang ``` In file included from stencil-sycl.cc:66: ./stencil_sycl.hpp:20:21: error: call to constructor of 'cl::sycl::id<2>' is ambiguous cl::sycl::id<2> dx1({1,0}); ``` * move back to c++14 (and allow c++1z warning to persist) ``` clang++-3.9 -std=c++1z -pthread -DPRKVERSION="2.16" stencil-vector-raja.cc -DUSE_RAJA -I/home/travis/build/ParRes/Kernels/PRK-deps/raja/include -L/home/travis/build/ParRes/Kernels/PRK-deps/raja/lib -lRAJA -o stencil-vector-raja In file included from stencil-vector-raja.cc:63: In file included from ./prk_util.h:217: In file included from /home/travis/build/ParRes/Kernels/PRK-deps/raja/include/RAJA/RAJA.hpp:45: In file included from /home/travis/build/ParRes/Kernels/PRK-deps/raja/include/RAJA/util/basic_mempool.hpp:38: In file included from /usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/map:60: In file included from 
/usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/bits/stl_tree.h:72: In file included from /usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/bits/node_handle.h:39: /usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/optional:1032:27: error: use of class template 'optional' requires template arguments template optional(_Tp) -> optional<_Tp>; ^ /usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/optional:451:11: note: template is declared here class optional ^ /usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/optional:1032:40: error: expected ';' at end of declaration template optional(_Tp) -> optional<_Tp>; ^ /usr/bin/../lib/gcc/x86_64-linux-gnu/7.2.0/../../../../include/c++/7.2.0/optional:1032:41: error: cannot use arrow operator on a type template optional(_Tp) -> optional<_Tp>; ^ ``` --- Cxx11/generate-sycl-stencil.py | 50 ++++-------- Cxx11/stencil-sycl.cc | 2 +- Cxx11/stencil_sycl.hpp | 140 ++++++++++++++------------------- travis/build-run-prk.sh | 2 +- 4 files changed, 79 insertions(+), 115 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index e0c0cae1e..fcb0c49bf 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -5,28 +5,30 @@ import string import os -def codegen(src,pattern,stencil_size,radius,W,model,dim): - src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n,\n') +def codegen(src,pattern,stencil_size,radius,model,dim): + src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n, ') if (dim==2): - src.write(' cl::sycl::buffer & d_in,\n') - src.write(' cl::sycl::buffer & d_out) {\n') + src.write('cl::sycl::buffer & d_in, ') + src.write('cl::sycl::buffer & d_out)\n') else: - src.write(' cl::sycl::buffer & d_in,\n') - src.write(' cl::sycl::buffer & d_out) {\n') + src.write('cl::sycl::buffer & d_in, ') + src.write('cl::sycl::buffer & 
d_out)\n') + src.write('{\n') src.write(' q.submit([&](cl::sycl::handler& h) {\n') src.write(' auto in = d_in.get_access(h);\n') src.write(' auto out = d_out.get_access(h);\n') if (dim==2): - src.write(' h.parallel_for(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n') - src.write(' [=] (cl::sycl::item<2> it) {\n') - src.write(' cl::sycl::id<2> xy = it.get_id();\n') for r in range(1,radius+1): - src.write(' cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n') - src.write(' cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n') + src.write(' cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n') + src.write(' cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n') + src.write(' h.parallel_for(') + src.write('{n-'+str(2*radius)+',n-'+str(2*radius)+'}, ') + src.write('{'+str(radius)+','+str(radius)+'}, ') + src.write('[=] (auto it) {\n') + if (dim==2): + src.write(' cl::sycl::id<2> xy = it.get_id();\n') src.write(' out[xy] += ') else: - src.write(' h.parallel_for(cl::sycl::range<2> {n-2*'+str(radius)+',n-2*'+str(radius)+'}, cl::sycl::id<2> {'+str(radius)+','+str(radius)+'},\n') - src.write(' [=] (cl::sycl::item<2> it) {\n') # 1D indexing the slow way #src.write(' auto i = it[0];\n') #src.write(' auto j = it[1];\n') @@ -78,30 +80,12 @@ def codegen(src,pattern,stencil_size,radius,W,model,dim): src.write('}\n\n') def instance(src,model,pattern,r): - - W = [[0.0e0 for x in range(2*r+1)] for x in range(2*r+1)] if pattern == 'star': stencil_size = 4*r+1 - for i in range(1,r+1): - W[r][r+i] = +1./(2*i*r) - W[r+i][r] = +1./(2*i*r) - W[r][r-i] = -1./(2*i*r) - W[r-i][r] = -1./(2*i*r) - else: stencil_size = (2*r+1)**2 - for j in range(1,r+1): - for i in range(-j+1,j): - W[r+i][r+j] = +1./(4*j*(2*j-1)*r) - W[r+i][r-j] = -1./(4*j*(2*j-1)*r) - W[r+j][r+i] = +1./(4*j*(2*j-1)*r) - W[r-j][r+i] = -1./(4*j*(2*j-1)*r) - - W[r+j][r+j] = +1./(4*j*r) - W[r-j][r-j] = 
-1./(4*j*r) - - codegen(src,pattern,stencil_size,r,W,model,1) - codegen(src,pattern,stencil_size,r,W,model,2) + codegen(src,pattern,stencil_size,r,model,1) + codegen(src,pattern,stencil_size,r,model,2) def main(): for model in ['sycl']: diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index c4bfa6ff8..7aceb02c0 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -60,7 +60,7 @@ /// ////////////////////////////////////////////////////////////////////// -#define USE_2D_INDEXING 0 +#define USE_2D_INDEXING 1 #include "prk_util.h" #include "stencil_sycl.hpp" diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 18391ce41..261128675 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -1,11 +1,9 @@ -void star1(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer & d_in, - cl::sycl::buffer & d_out) { +void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +{ q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1}, - [=] (cl::sycl::item<2> it) { + h.parallel_for({n-2,n-2}, {1,1}, [=] (auto it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.5 +in[it[0]*n+(it[1]-1)] * -0.5 +in[(it[0]+1)*n+it[1]] * 0.5 @@ -14,17 +12,15 @@ void star1(cl::sycl::queue & q, const size_t n, }); } -void star1(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer & d_in, - cl::sycl::buffer & d_out) { +void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +{ q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*1,n-2*1}, cl::sycl::id<2> {1,1}, - [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + h.parallel_for({n-2,n-2}, {1,1}, [=] (auto it) { cl::sycl::id<2> xy = it.get_id(); - 
cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); out[xy] += +in[xy+dx1] * 0.5 +in[xy-dx1] * -0.5 +in[xy+dy1] * 0.5 @@ -33,14 +29,12 @@ void star1(cl::sycl::queue & q, const size_t n, }); } -void star2(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer & d_in, - cl::sycl::buffer & d_out) { +void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +{ q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2}, - [=] (cl::sycl::item<2> it) { + h.parallel_for({n-4,n-4}, {2,2}, [=] (auto it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.25 +in[it[0]*n+(it[1]-1)] * -0.25 +in[(it[0]+1)*n+it[1]] * 0.25 @@ -53,19 +47,17 @@ void star2(cl::sycl::queue & q, const size_t n, }); } -void star2(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer & d_in, - cl::sycl::buffer & d_out) { +void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +{ q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*2,n-2*2}, cl::sycl::id<2> {2,2}, - [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); + cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); + h.parallel_for({n-4,n-4}, {2,2}, [=] (auto it) { cl::sycl::id<2> xy = it.get_id(); - cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); - cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); out[xy] += +in[xy+dx1] * 0.25 +in[xy-dx1] * -0.25 +in[xy+dy1] * 0.25 @@ -78,14 +70,12 @@ void star2(cl::sycl::queue & q, const size_t n, }); } -void star3(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer & d_in, - 
cl::sycl::buffer & d_out) { +void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +{ q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3}, - [=] (cl::sycl::item<2> it) { + h.parallel_for({n-6,n-6}, {3,3}, [=] (auto it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.166666666667 +in[it[0]*n+(it[1]-1)] * -0.166666666667 +in[(it[0]+1)*n+it[1]] * 0.166666666667 @@ -102,21 +92,19 @@ void star3(cl::sycl::queue & q, const size_t n, }); } -void star3(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer & d_in, - cl::sycl::buffer & d_out) { +void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +{ q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*3,n-2*3}, cl::sycl::id<2> {3,3}, - [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); + cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); + cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); + cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); + h.parallel_for({n-6,n-6}, {3,3}, [=] (auto it) { cl::sycl::id<2> xy = it.get_id(); - cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); - cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); - cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); - cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); out[xy] += +in[xy+dx1] * 0.166666666667 +in[xy-dx1] * -0.166666666667 +in[xy+dy1] * 0.166666666667 @@ -133,14 +121,12 @@ void star3(cl::sycl::queue & q, const size_t n, }); } -void star4(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer & d_in, - cl::sycl::buffer & d_out) { +void star4(cl::sycl::queue 
& q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +{ q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4}, - [=] (cl::sycl::item<2> it) { + h.parallel_for({n-8,n-8}, {4,4}, [=] (auto it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.125 +in[it[0]*n+(it[1]-1)] * -0.125 +in[(it[0]+1)*n+it[1]] * 0.125 @@ -161,23 +147,21 @@ void star4(cl::sycl::queue & q, const size_t n, }); } -void star4(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer & d_in, - cl::sycl::buffer & d_out) { +void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +{ q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*4,n-2*4}, cl::sycl::id<2> {4,4}, - [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); + cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); + cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); + cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); + cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); + cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); + h.parallel_for({n-8,n-8}, {4,4}, [=] (auto it) { cl::sycl::id<2> xy = it.get_id(); - cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); - cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); - cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); - cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); - cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); - cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); out[xy] += +in[xy+dx1] * 0.125 +in[xy-dx1] * -0.125 +in[xy+dy1] * 0.125 @@ -198,14 +182,12 @@ void star4(cl::sycl::queue & q, const size_t n, }); } -void star5(cl::sycl::queue & q, const size_t 
n, - cl::sycl::buffer & d_in, - cl::sycl::buffer & d_out) { +void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +{ q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5}, - [=] (cl::sycl::item<2> it) { + h.parallel_for({n-10,n-10}, {5,5}, [=] (auto it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.1 +in[it[0]*n+(it[1]-1)] * -0.1 +in[(it[0]+1)*n+it[1]] * 0.1 @@ -230,25 +212,23 @@ void star5(cl::sycl::queue & q, const size_t n, }); } -void star5(cl::sycl::queue & q, const size_t n, - cl::sycl::buffer & d_in, - cl::sycl::buffer & d_out) { +void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +{ q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2*5,n-2*5}, cl::sycl::id<2> {5,5}, - [=] (cl::sycl::item<2> it) { + cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); + cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); + cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); + cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); + cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); + cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); + cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); + cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); + cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0}); + cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5}); + h.parallel_for({n-10,n-10}, {5,5}, [=] (auto it) { cl::sycl::id<2> xy = it.get_id(); - cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); - cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); - cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); - cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); - cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); - cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); - 
cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0}); - cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5}); out[xy] += +in[xy+dx1] * 0.1 +in[xy-dx1] * -0.1 +in[xy+dy1] * 0.1 diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 25bd9feb5..7ae2e14b4 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -314,7 +314,7 @@ case "$PRK_TARGET" in ${PRK_CXX} -v # Need to increment this for PSTL # The pthread flag is supported by GCC and Clang at least - echo "CXX=${PRK_CXX} -std=c++11 -pthread" >> common/make.defs + echo "CXX=${PRK_CXX} -std=c++14 -pthread" >> common/make.defs # C++11 without external parallelism make -C $PRK_TARGET_PATH transpose-valarray nstream-valarray From 451a1ebdadde406e64d097df7991eb3419d73de3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 19 Mar 2018 08:52:42 -0700 Subject: [PATCH 058/245] cleanup Boost compute nstream (#323) - remove stupid hack implementation - remove the pedantic version that doesn't use make_tuple --- Cxx11/nstream-vector-boost-compute.cc | 53 +++++++-------------------- 1 file changed, 14 insertions(+), 39 deletions(-) diff --git a/Cxx11/nstream-vector-boost-compute.cc b/Cxx11/nstream-vector-boost-compute.cc index de95b37a7..fec24fbbf 100644 --- a/Cxx11/nstream-vector-boost-compute.cc +++ b/Cxx11/nstream-vector-boost-compute.cc @@ -62,6 +62,8 @@ /// ////////////////////////////////////////////////////////////////////// +#define LAMBDA_MAKE_TUPLE 1 + #include "prk_util.h" namespace compute = boost::compute; @@ -104,7 +106,6 @@ int main(int argc, char * argv[]) return 1; } - //compute::compute::device device = compute::compute::system::default_device(); auto device = compute::system::default_device(); std::cout << "Number of iterations = " << iterations << std::endl; @@ -112,9 +113,6 @@ int main(int argc, char * argv[]) std::cout << "Offset = " << offset << std::endl; std::cout << "Boost.Compute device = " << device.name() << std::endl; - compute::context context(device); - compute::command_queue 
queue(context, device); - ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// @@ -126,6 +124,8 @@ int main(int argc, char * argv[]) const float scalar(3); + compute::context context(device); + compute::command_queue queue(context, device); { compute::vector d_A(length, context); compute::vector d_B(length, context); @@ -140,47 +140,22 @@ int main(int argc, char * argv[]) if (iter==1) nstream_time = prk::wtime(); -#if STUPID_HACK_IMPLEMENTATION - compute::transform(d_B.begin(), d_B.end(), d_A.begin(), d_A.begin(), compute::plus(), queue); - compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus(), queue); - compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus(), queue); - compute::transform(d_C.begin(), d_C.end(), d_A.begin(), d_A.begin(), compute::plus(), queue); -#else - -#if LAMBDA_MAKE_TUPLE // Aout and Ain are necessary because A += .. 
does not work auto Aout = compute::lambda::get<0>(boost::compute::_1); auto Ain = compute::lambda::get<1>(boost::compute::_1); auto B = compute::lambda::get<2>(boost::compute::_1); auto C = compute::lambda::get<3>(boost::compute::_1); -#endif - - compute::for_each( - compute::make_zip_iterator( - boost::make_tuple( - d_A.begin(), d_A.begin(), d_B.begin(), d_C.begin() - ) - ), - compute::make_zip_iterator( - boost::make_tuple( - d_A.end(), d_A.end(), d_B.end(), d_C.end() - ) - ), -#if LAMBDA_MAKE_TUPLE - // += does not work here - compute::lambda::make_tuple( - Aout = Ain + B + scalar * C - ), -#else - // += does not work here - compute::lambda::get<0>(_1) = compute::lambda::get<1>(_1) - + compute::lambda::get<2>(_1) - + compute::lambda::get<3>(_1) * scalar, -#endif - queue - ); -#endif + auto begin = compute::make_zip_iterator( boost::make_tuple( d_A.begin(), d_A.begin(), d_B.begin(), d_C.begin())); + auto end = compute::make_zip_iterator( boost::make_tuple( d_A.end(), d_A.end(), d_B.end(), d_C.end())); + + compute::for_each(begin, end, + compute::lambda::make_tuple + ( + Aout = Ain + B + scalar * C + ), + queue + ); queue.finish(); } From 6ef0faaae730538254d00056b68a2bc31bdf2ab4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 19 Mar 2018 10:28:55 -0700 Subject: [PATCH 059/245] Update README.md [ci skip] --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9360b1d35..871ea9272 100644 --- a/README.md +++ b/README.md @@ -90,14 +90,17 @@ f = see footnotes | OpenMP target | y | y | y | y | | | | OpenCL 1.x | i | y | y | y | | | | SYCL | | y | y | y | | | +| Boost.Compute | | | | y | | | | Parallel STL | y | y | y | y | | | | TBB | i | y | y | y | | | | Kokkos | y | y | y | y | | | | RAJA | y | y | y | y | | | -| CUDA | | | y | y | | | +| CUDA | i | y | y | y | | | | CUBLAS | | | y | y | | | | CBLAS | | | | | | y | +* [SYCL](http://sycl.tech/) +* [Boost.Compute](http://boostorg.github.io/compute/) * 
[TBB](https://www.threadingbuildingblocks.org/) * [Kokkos](https://github.com/kokkos/kokkos) * [RAJA](https://github.com/LLNL/RAJA) From 0c3fceb0dbe22b5e31358497b9a0e214b498d9ce Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 20 Mar 2018 09:13:13 -0700 Subject: [PATCH 060/245] workaround CUDA compiler breaking intrinsics headers CUDA compiler requires disabling one of the following: 1) all x86 intrinsics 2) all Intel + AMD intrinsics 3) MWAIT + all AVX-512 intrinsics it is almost as if this sort of idiotic behavior is intentional... [ci skip] --- common/make.defs.cuda | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 07838070e..48b85710d 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -46,7 +46,27 @@ NVCC=/opt/llvm/cocl/bin/cocl #NVCC=nvcc --compiler-bindir= --gpu-architecture=sm_61 CUDAFLAGS=-g -O3 -std=c++11 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 +# heavy hammer: +#CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED +# big hammers: +#CUDAFLAGS+=-D_IMMINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_FMA4INTRIN_H_INCLUDED +#CUDAFLAGS+=-D_XOPMMINTRIN_H_INCLUDED +# many tiny hammers: CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512FINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512VLINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512BWINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512DQINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512VLBWINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512VBMIVLINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512VBMIINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512VLDQINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512CDINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512PFINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512IFMAINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED # # MPI # From 8e17a9a3b28d4752dfe35079a1539c5576af7922 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 19 Mar 2018 12:55:30 -0700 Subject: [PATCH 061/245] add p2p hyperplane in TBB --- Cxx11/Makefile 
| 5 +- Cxx11/p2p-hyperplane-vector-tbb.cc | 210 +++++++++++++++++++++++++++++ Cxx11/p2p-innerloop-vector-tbb.cc | 5 +- travis/build-run-prk.sh | 10 +- 4 files changed, 222 insertions(+), 8 deletions(-) create mode 100644 Cxx11/p2p-hyperplane-vector-tbb.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index e0cbe0b6d..76e5d0d73 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -65,7 +65,7 @@ endif all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa boost-compute $(EXTRA) p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \ - p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl + p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \ stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \ @@ -97,7 +97,8 @@ opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl sycl: stencil-sycl transpose-sycl nstream-sycl -tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb +tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \ + p2p-hyperplane-vector-tbb stl: stencil-vector-stl transpose-vector-stl nstream-vector-stl diff --git a/Cxx11/p2p-hyperplane-vector-tbb.cc b/Cxx11/p2p-hyperplane-vector-tbb.cc new file mode 100644 index 000000000..250356319 --- /dev/null +++ b/Cxx11/p2p-hyperplane-vector-tbb.cc @@ -0,0 +1,210 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following 
disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an n^2 grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. 
+/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// C99-ification by Jeff Hammond, February 2016. +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +inline void sweep_tile_sequential(int startm, int endm, + int startn, int endn, + int n, std::vector & grid) +{ + for (auto i=startm; i & grid) +{ + for (auto i=2; i<=2*n-2; i++) { + for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) { + const auto x = i-j+1; + const auto y = j-1; + grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; + } + } +} +#endif + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/TBB HYPERPLANE pipeline execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int n, nc, nb; + try { + if (argc < 3) { + throw " <# iterations> []"; + } + + // number of times to run the pipeline algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // grid dimensions + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimensions must be positive"; + } else if ( static_cast(n)*static_cast(n) > static_cast(INT_MAX)) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // grid chunk dimensions + nc = (argc > 3) ? 
std::atoi(argv[3]) : 1; + nc = std::max(1,nc); + nc = std::min(n,nc); + + // number of grid blocks + nb = (n-1)/nc; + if ((n-1)%nc) nb++; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + const char* envvar = std::getenv("TBB_NUM_THREADS"); + int num_threads = (envvar!=NULL) ? std::atoi(envvar) : tbb::task_scheduler_init::default_num_threads(); + tbb::task_scheduler_init init(num_threads); + + std::cout << "Number of threads = " << num_threads << std::endl; + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid sizes = " << n << ", " << n << std::endl; + std::cout << "Grid chunk sizes = " << nc << std::endl; + std::cout << "TBB partitioner: " << typeid(tbb_partitioner).name() << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto pipeline_time = 0.0; // silence compiler warning + + std::vector grid(n*n,0.0); + + // set boundary values (bottom and left side of grid) + for (auto j=0; j(j); + grid[j*n+0] = static_cast(j); + } + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) pipeline_time = prk::wtime(); + + if (nc==1) { + for (auto i=2; i<=2*n-2; i++) { + //OMP_FOR_SIMD + //for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) { + tbb::parallel_for( std::max(2,i-n+2), std::min(i,n)+1, [=,&grid](int j) { + const auto x = i-j+1; + const auto y = j-1; + grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; + }); + } + } else { + for (int i=2; i<=2*(nb+1)-2; i++) { + //OMP_FOR() + //for (int j=std::max(2,i-(nb+1)+2); j<=std::min(i,nb+1); j++) { + tbb::parallel_for( std::max(2,i-(nb+1)+2), std::min(i,nb+1)+1, [=,&grid](int j) { + const int ib = nc*(i-j)+1; + const int jb = nc*(j-2)+1; + sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid); + }); + } + } + grid[0*n+0] = 
-grid[(n-1)*n+(n-1)]; + } + + pipeline_time = prk::wtime() - pipeline_time; + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. + ////////////////////////////////////////////////////////////////////// + + const double epsilon = 1.e-8; + auto corner_val = ((iterations+1.)*(2.*n-2.)); + if ( (std::fabs(grid[(n-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) { + std::cout << "ERROR: checksum " << grid[(n-1)*n+(n-1)] + << " does not match verification value " << corner_val << std::endl; + return 1; + } + +#ifdef VERBOSE + std::cout << "Solution validates; verification value = " << corner_val << std::endl; +#else + std::cout << "Solution validates" << std::endl; +#endif + auto avgtime = pipeline_time/iterations; + std::cout << "Rate (MFlops/s): " + << 2.0e-6 * ( (n-1.)*(n-1.) )/avgtime + << " Avg time (s): " << avgtime << std::endl; + + return 0; +} diff --git a/Cxx11/p2p-innerloop-vector-tbb.cc b/Cxx11/p2p-innerloop-vector-tbb.cc index 788226f71..2bff51f15 100644 --- a/Cxx11/p2p-innerloop-vector-tbb.cc +++ b/Cxx11/p2p-innerloop-vector-tbb.cc @@ -72,9 +72,8 @@ int main(int argc, char* argv[]) int iterations; int n; - int mc, nc; try { - if (argc < 3){ + if (argc < 3) { throw " <# iterations> "; } @@ -121,7 +120,9 @@ int main(int argc, char* argv[]) } for (auto iter = 0; iter<=iterations; iter++){ + if (iter == 1) pipeline_time = prk::wtime(); + for (auto i=2; i<=2*n-2; i++) { tbb::parallel_for( std::max(2,i-n+2), std::min(i,n)+1, [=,&grid](int j) { const auto x = i-j+2-1; diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 7ae2e14b4..05a5b49a1 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -473,10 +473,12 @@ case "$PRK_TARGET" in ;; esac make -C $PRK_TARGET_PATH stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb - #$PRK_TARGET_PATH/p2p-vector-tbb 10 1024 1024 64 64 - $PRK_TARGET_PATH/stencil-vector-tbb 10 1000 - $PRK_TARGET_PATH/transpose-vector-tbb 10 1024 32 - 
$PRK_TARGET_PATH/nstream-vector-tbb 10 16777216 32 + $PRK_TARGET_PATH/p2p-innerloop-vector-tbb 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb 10 1024 1 + $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb 10 1024 32 + $PRK_TARGET_PATH/stencil-vector-tbb 10 1000 + $PRK_TARGET_PATH/transpose-vector-tbb 10 1024 32 + $PRK_TARGET_PATH/nstream-vector-tbb 10 16777216 32 #echo "Test stencil code generator" for s in star grid ; do for r in 1 2 3 4 5 ; do From a946d934af89a2721746899c1ddd40cf7592cdde Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 19 Mar 2018 15:14:39 -0700 Subject: [PATCH 062/245] add STL/PSTL p2p hyperplane ala TBB (and OpenMP) --- .gitignore | 3 + Cxx11/p2p-hyperplane-vector-pstl.cc | 225 ++++++++++++++++++++++++++++ Cxx11/p2p-hyperplane-vector-tbb.cc | 1 + travis/build-run-prk.sh | 22 +-- 4 files changed, 242 insertions(+), 9 deletions(-) create mode 100644 Cxx11/p2p-hyperplane-vector-pstl.cc diff --git a/.gitignore b/.gitignore index fb475624f..44d361e21 100644 --- a/.gitignore +++ b/.gitignore @@ -210,3 +210,6 @@ FORTRAN/transpose-tasks-openmp RUST/p2p RUST/stencil RUST/transpose +p2p-hyperplane-vector-stl +p2p-hyperplane-vector-pstl +p2p-hyperplane-vector-tbb diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc new file mode 100644 index 000000000..81b58d50c --- /dev/null +++ b/Cxx11/p2p-hyperplane-vector-pstl.cc @@ -0,0 +1,225 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. 
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an n^2 grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// C99-ification by Jeff Hammond, February 2016. +/// C++11-ification by Jeff Hammond, May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +inline void sweep_tile_sequential(int startm, int endm, + int startn, int endn, + int n, std::vector & grid) +{ + for (auto i=startm; i & grid) +{ + for (auto i=2; i<=2*n-2; i++) { + for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) { + const auto x = i-j+1; + const auto y = j-1; + grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; + } + } +} +#endif + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; +#if defined(USE_PSTL) + std::cout << "C++17 PSTL HYPERPLANE pipeline execution on 2D grid" << std::endl; +#else + std::cout << "C++11 STL HYPERPLANE pipeline execution on 2D grid" << std::endl; +#endif + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int n, nc, nb; + try { + if (argc < 3) { + throw " <# iterations> []"; + } + + // number of times to run the pipeline algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // grid dimensions + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimensions must be positive"; + } else if ( static_cast(n)*static_cast(n) > static_cast(INT_MAX)) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // grid chunk dimensions + nc = (argc > 3) ? 
std::atoi(argv[3]) : 1; + nc = std::max(1,nc); + nc = std::min(n,nc); + + // number of grid blocks + nb = (n-1)/nc; + if ((n-1)%nc) nb++; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid sizes = " << n << ", " << n << std::endl; + std::cout << "Grid chunk sizes = " << nc << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto pipeline_time = 0.0; // silence compiler warning + + std::vector grid(n*n,0.0); + + // set boundary values (bottom and left side of grid) + for (auto j=0; j(j); + grid[j*n+0] = static_cast(j); + } + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) pipeline_time = prk::wtime(); + + if (nc==1) { + for (auto i=2; i<=2*n-2; i++) { + const auto begin = std::max(2,i-n+2); + const auto end = std::min(i,n)+1; + auto range = boost::irange(begin,end); +#if defined(USE_PSTL) && defined(USE_INTEL_PSTL) + std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) { +#elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ + && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) + __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (auto j) { +#else + std::for_each( std::begin(range), std::end(range), [&] (auto j) { +#endif + const auto x = i-j+1; + const auto y = j-1; + grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; + }); + } + } else { + for (int i=2; i<=2*(nb+1)-2; i++) { + const auto begin = std::max(2,i-(nb+1)+2); + const auto end = std::min(i,nb+1)+1; + auto range = boost::irange(begin,end); +#if defined(USE_PSTL) && defined(USE_INTEL_PSTL) + std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) { +#elif defined(USE_PSTL) && 
defined(__GNUC__) && defined(__GNUC_MINOR__) \ + && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) + __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (auto j) { +#else + std::for_each( std::begin(range), std::end(range), [&] (auto j) { +#endif + const int ib = nc*(i-j)+1; + const int jb = nc*(j-2)+1; + sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid); + }); + } + } + grid[0*n+0] = -grid[(n-1)*n+(n-1)]; + } + + pipeline_time = prk::wtime() - pipeline_time; + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. + ////////////////////////////////////////////////////////////////////// + + const double epsilon = 1.e-8; + auto corner_val = ((iterations+1.)*(2.*n-2.)); + if ( (std::fabs(grid[(n-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) { + std::cout << "ERROR: checksum " << grid[(n-1)*n+(n-1)] + << " does not match verification value " << corner_val << std::endl; + return 1; + } + +#ifdef VERBOSE + std::cout << "Solution validates; verification value = " << corner_val << std::endl; +#else + std::cout << "Solution validates" << std::endl; +#endif + auto avgtime = pipeline_time/iterations; + std::cout << "Rate (MFlops/s): " + << 2.0e-6 * ( (n-1.)*(n-1.) )/avgtime + << " Avg time (s): " << avgtime << std::endl; + + return 0; +} diff --git a/Cxx11/p2p-hyperplane-vector-tbb.cc b/Cxx11/p2p-hyperplane-vector-tbb.cc index 250356319..6c4ad9aac 100644 --- a/Cxx11/p2p-hyperplane-vector-tbb.cc +++ b/Cxx11/p2p-hyperplane-vector-tbb.cc @@ -184,6 +184,7 @@ int main(int argc, char* argv[]) } pipeline_time = prk::wtime() - pipeline_time; + ////////////////////////////////////////////////////////////////////// // Analyze and output results. 
////////////////////////////////////////////////////////////////////// diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 05a5b49a1..a88fade88 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -472,7 +472,7 @@ case "$PRK_TARGET" in export LD_LIBRARY_PATH=${TBBROOT}/lib:${LD_LIBRARY_PATH} ;; esac - make -C $PRK_TARGET_PATH stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb + make -C $PRK_TARGET_PATH p2p-innerloop-vector-tbb p2p-hyperplane-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb $PRK_TARGET_PATH/p2p-innerloop-vector-tbb 10 1024 $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb 10 1024 1 $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb 10 1024 32 @@ -488,10 +488,12 @@ case "$PRK_TARGET" in fi # C++11 with STL - make -C $PRK_TARGET_PATH stencil-vector-stl transpose-vector-stl nstream-vector-stl - $PRK_TARGET_PATH/stencil-vector-stl 10 1000 - $PRK_TARGET_PATH/transpose-vector-stl 10 1024 32 - $PRK_TARGET_PATH/nstream-vector-stl 10 16777216 32 + make -C $PRK_TARGET_PATH p2p-hyperplane-vector-stl stencil-vector-stl transpose-vector-stl nstream-vector-stl + $PRK_TARGET_PATH/p2p-hyperplane-vector-stl 10 1024 1 + $PRK_TARGET_PATH/p2p-hyperplane-vector-stl 10 1024 32 + $PRK_TARGET_PATH/stencil-vector-stl 10 1000 + $PRK_TARGET_PATH/transpose-vector-stl 10 1024 32 + $PRK_TARGET_PATH/nstream-vector-stl 10 16777216 32 #echo "Test stencil code generator" for s in star grid ; do for r in 1 2 3 4 5 ; do @@ -508,10 +510,12 @@ case "$PRK_TARGET" in else echo "PSTLFLAG=-DUSE_PSTL -fopenmp ${TBBFLAG} -DUSE_INTEL_PSTL -I$TRAVIS_ROOT/pstl/include" >> common/make.defs fi - make -C $PRK_TARGET_PATH stencil-vector-pstl transpose-vector-pstl nstream-vector-pstl - $PRK_TARGET_PATH/stencil-vector-pstl 10 1000 - $PRK_TARGET_PATH/transpose-vector-pstl 10 1024 32 - $PRK_TARGET_PATH/nstream-vector-pstl 10 16777216 32 + make -C $PRK_TARGET_PATH p2p-hyperplane-vector-pstl stencil-vector-pstl transpose-vector-pstl nstream-vector-pstl + 
$PRK_TARGET_PATH/p2p-hyperplane-vector-pstl 10 1024 1 + $PRK_TARGET_PATH/p2p-hyperplane-vector-pstl 10 1024 32 + $PRK_TARGET_PATH/stencil-vector-pstl 10 1000 + $PRK_TARGET_PATH/transpose-vector-pstl 10 1024 32 + $PRK_TARGET_PATH/nstream-vector-pstl 10 16777216 32 #echo "Test stencil code generator" for s in star grid ; do for r in 1 2 3 4 5 ; do From 1b94d4c10caa8dba77dbc0462455c7fb5fe4308d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 19 Mar 2018 16:40:43 -0700 Subject: [PATCH 063/245] add p2p innerloop (unblocked hyperplane) for SYCL the performance of this is terrible with triSYCL. like 100x100 times out. may be a race condition leading to deadlock. need to debug. --- Cxx11/Makefile | 5 +- Cxx11/p2p-hyperplane-sycl.cc | 221 +++++++++++++++++++++++++++++++++++ travis/build-run-prk.sh | 9 +- 3 files changed, 229 insertions(+), 6 deletions(-) create mode 100644 Cxx11/p2p-hyperplane-sycl.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 76e5d0d73..190b44dbb 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -65,7 +65,8 @@ endif all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa boost-compute $(EXTRA) p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \ - p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb + p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ + p2p-hyperplane-sycl stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \ stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \ @@ -95,7 +96,7 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl -sycl: stencil-sycl transpose-sycl nstream-sycl +sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl tbb: 
p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \ p2p-hyperplane-vector-tbb diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc new file mode 100644 index 000000000..58f7e7e2d --- /dev/null +++ b/Cxx11/p2p-hyperplane-sycl.cc @@ -0,0 +1,221 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an n^2 grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// C99-ification by Jeff Hammond, February 2016. +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +inline void sweep_tile_sequential(int startm, int endm, + int startn, int endn, + int n, std::vector & grid) +{ + for (auto i=startm; i & grid) +{ + for (auto i=2; i<=2*n-2; i++) { + for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) { + const auto x = i-j+1; + const auto y = j-1; + grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; + } + } +} +#endif + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL HYPERPLANE pipeline execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int n; +#if 0 + int nc, nb; +#endif + try { + if (argc < 3) { + throw " <# iterations> []"; + } + + // number of times to run the pipeline algorithm + iterations = 
std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // grid dimensions + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimensions must be positive"; + } else if ( static_cast(n)*static_cast(n) > static_cast(INT_MAX)) { + throw "ERROR: grid dimension too large - overflow risk"; + } + +#if 0 + // grid chunk dimensions + nc = (argc > 3) ? std::atoi(argv[3]) : 1; + nc = std::max(1,nc); + nc = std::min(n,nc); + + // number of grid blocks + nb = (n-1)/nc; + if ((n-1)%nc) nb++; +#endif + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid sizes = " << n << ", " << n << std::endl; +#if 0 + std::cout << "Grid chunk sizes = " << nc << std::endl; +#endif + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto pipeline_time = 0.0; // silence compiler warning + + std::vector h_grid(n*n,0.0); + for (int j=0; j(j); + h_grid[j*n+0] = static_cast(j); + } + + cl::sycl::queue q; + { + cl::sycl::buffer d_grid { h_grid.data(), h_grid.size() }; + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) pipeline_time = prk::wtime(); + + for (int i=2; i<=2*n-2; i++) { + + cl::sycl::id<1> I{unsigned(i)}; + cl::sycl::id<1> One{1}; + + q.submit([&](cl::sycl::handler& h) { + + auto grid = d_grid.get_access(h); + + unsigned begin = std::max(2,i-n+2); + unsigned end = std::min(i,n)+1; + unsigned range = end-begin; + + h.parallel_for(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> J) { + cl::sycl::id<1> N{unsigned(n)}; + cl::sycl::id<1> X{I-J+One}; + cl::sycl::id<1> Y{J-One}; + cl::sycl::id<1> Xold{X-One}; // x-1 + cl::sycl::id<1> Yold{Y-One}; // y-1 + cl::sycl::id<1> index0{X*N+Y}; + cl::sycl::id<1> index1{Xold*N+Y}; + cl::sycl::id<1> 
index2{X*N+Yold}; + cl::sycl::id<1> index3{Xold*N+Yold}; + grid[index0] = grid[index1] + grid[index2] - grid[index3]; + }); + }); + q.wait(); + } + h_grid[0*n+0] = -h_grid[(n-1)*n+(n-1)]; + } + pipeline_time = prk::wtime() - pipeline_time; + } + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. + ////////////////////////////////////////////////////////////////////// + + const double epsilon = 1.e-8; + auto corner_val = ((iterations+1.)*(2.*n-2.)); + if ( (std::fabs(h_grid[(n-1)*n+(n-1)] - corner_val)/corner_val) > epsilon) { + std::cout << "ERROR: checksum " << h_grid[(n-1)*n+(n-1)] + << " does not match verification value " << corner_val << std::endl; + return 1; + } + +#ifdef VERBOSE + std::cout << "Solution validates; verification value = " << corner_val << std::endl; +#else + std::cout << "Solution validates" << std::endl; +#endif + auto avgtime = pipeline_time/iterations; + std::cout << "Rate (MFlops/s): " + << 2.0e-6 * ( (n-1.)*(n-1.) 
)/avgtime + << " Avg time (s): " << avgtime << std::endl; + + return 0; +} diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index a88fade88..5812e949f 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -623,10 +623,11 @@ case "$PRK_TARGET" in echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++14" >> common/make.defs fi echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs - make -C $PRK_TARGET_PATH stencil-sycl transpose-sycl nstream-sycl - $PRK_TARGET_PATH/stencil-sycl 10 1000 - $PRK_TARGET_PATH/transpose-sycl 10 1024 32 - $PRK_TARGET_PATH/nstream-sycl 10 16777216 32 + make -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl + $PRK_TARGET_PATH/p2p-hyperplane-sycl 10 50 1 # 100 takes too long :-o + $PRK_TARGET_PATH/stencil-sycl 10 1000 + $PRK_TARGET_PATH/transpose-sycl 10 1024 32 + $PRK_TARGET_PATH/nstream-sycl 10 16777216 32 #echo "Test stencil code generator" for s in star ; do # grid ; do # grid not supported yet for r in 1 2 3 4 5 ; do From 50b527ebca6189cfff0743b04685c0161c5ae082 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 20 Mar 2018 05:32:42 -0700 Subject: [PATCH 064/245] disable p2p-hyperplane-sycl --- travis/build-run-prk.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 5812e949f..7e78338fd 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -624,7 +624,7 @@ case "$PRK_TARGET" in fi echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs make -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl - $PRK_TARGET_PATH/p2p-hyperplane-sycl 10 50 1 # 100 takes too long :-o + #$PRK_TARGET_PATH/p2p-hyperplane-sycl 10 50 1 # 100 takes too long :-o $PRK_TARGET_PATH/stencil-sycl 10 1000 $PRK_TARGET_PATH/transpose-sycl 10 1024 32 $PRK_TARGET_PATH/nstream-sycl 10 16777216 32 From 5bb8be6915357c5af9a745e84c7391976f4632d3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond 
Date: Tue, 20 Mar 2018 05:52:04 -0700 Subject: [PATCH 065/245] fix errors in stencil code generator --- Cxx11/generate-sycl-stencil.py | 6 +- Cxx11/stencil_sycl.hpp | 100 ++++++++++++++++----------------- 2 files changed, 53 insertions(+), 53 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index fcb0c49bf..1c71ff03c 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -22,9 +22,9 @@ def codegen(src,pattern,stencil_size,radius,model,dim): src.write(' cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n') src.write(' cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n') src.write(' h.parallel_for(') - src.write('{n-'+str(2*radius)+',n-'+str(2*radius)+'}, ') - src.write('{'+str(radius)+','+str(radius)+'}, ') - src.write('[=] (auto it) {\n') + src.write('cl::sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ') + src.write('cl::sycl::id<2> {'+str(radius)+','+str(radius)+'}, ') + src.write('[=] (cl::sycl::item<2> it) {\n') if (dim==2): src.write(' cl::sycl::id<2> xy = it.get_id();\n') src.write(' out[xy] += ') diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 261128675..6fbf8d9f7 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -3,7 +3,7 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for({n-2,n-2}, {1,1}, [=] (auto it) { + h.parallel_for(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.5 +in[it[0]*n+(it[1]-1)] * -0.5 +in[(it[0]+1)*n+it[1]] * 0.5 @@ -19,7 +19,7 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ auto out = d_out.get_access(h); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - h.parallel_for({n-2,n-2}, {1,1}, [=] (auto it) { + 
h.parallel_for(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * 0.5 +in[xy-dx1] * -0.5 @@ -34,7 +34,7 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for({n-4,n-4}, {2,2}, [=] (auto it) { + h.parallel_for(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.25 +in[it[0]*n+(it[1]-1)] * -0.25 +in[(it[0]+1)*n+it[1]] * 0.25 @@ -56,7 +56,7 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); - h.parallel_for({n-4,n-4}, {2,2}, [=] (auto it) { + h.parallel_for(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * 0.25 +in[xy-dx1] * -0.25 @@ -75,19 +75,19 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for({n-6,n-6}, {3,3}, [=] (auto it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.166666666667 - +in[it[0]*n+(it[1]-1)] * -0.166666666667 - +in[(it[0]+1)*n+it[1]] * 0.166666666667 - +in[(it[0]-1)*n+it[1]] * -0.166666666667 - +in[it[0]*n+(it[1]+2)] * 0.0833333333333 - +in[it[0]*n+(it[1]-2)] * -0.0833333333333 - +in[(it[0]+2)*n+it[1]] * 0.0833333333333 - +in[(it[0]-2)*n+it[1]] * -0.0833333333333 - +in[it[0]*n+(it[1]+3)] * 0.0555555555556 - +in[it[0]*n+(it[1]-3)] * -0.0555555555556 - +in[(it[0]+3)*n+it[1]] * 0.0555555555556 - +in[(it[0]-3)*n+it[1]] * -0.0555555555556; + h.parallel_for(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { + out[it[0]*n+it[1]] 
+= +in[it[0]*n+(it[1]+1)] * 0.16666666666666666 + +in[it[0]*n+(it[1]-1)] * -0.16666666666666666 + +in[(it[0]+1)*n+it[1]] * 0.16666666666666666 + +in[(it[0]-1)*n+it[1]] * -0.16666666666666666 + +in[it[0]*n+(it[1]+2)] * 0.08333333333333333 + +in[it[0]*n+(it[1]-2)] * -0.08333333333333333 + +in[(it[0]+2)*n+it[1]] * 0.08333333333333333 + +in[(it[0]-2)*n+it[1]] * -0.08333333333333333 + +in[it[0]*n+(it[1]+3)] * 0.05555555555555555 + +in[it[0]*n+(it[1]-3)] * -0.05555555555555555 + +in[(it[0]+3)*n+it[1]] * 0.05555555555555555 + +in[(it[0]-3)*n+it[1]] * -0.05555555555555555; }); }); } @@ -103,20 +103,20 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); - h.parallel_for({n-6,n-6}, {3,3}, [=] (auto it) { + h.parallel_for(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); - out[xy] += +in[xy+dx1] * 0.166666666667 - +in[xy-dx1] * -0.166666666667 - +in[xy+dy1] * 0.166666666667 - +in[xy-dy1] * -0.166666666667 - +in[xy+dx2] * 0.0833333333333 - +in[xy-dx2] * -0.0833333333333 - +in[xy+dy2] * 0.0833333333333 - +in[xy-dy2] * -0.0833333333333 - +in[xy+dx3] * 0.0555555555556 - +in[xy-dx3] * -0.0555555555556 - +in[xy+dy3] * 0.0555555555556 - +in[xy-dy3] * -0.0555555555556; + out[xy] += +in[xy+dx1] * 0.16666666666666666 + +in[xy-dx1] * -0.16666666666666666 + +in[xy+dy1] * 0.16666666666666666 + +in[xy-dy1] * -0.16666666666666666 + +in[xy+dx2] * 0.08333333333333333 + +in[xy-dx2] * -0.08333333333333333 + +in[xy+dy2] * 0.08333333333333333 + +in[xy-dy2] * -0.08333333333333333 + +in[xy+dx3] * 0.05555555555555555 + +in[xy-dx3] * -0.05555555555555555 + +in[xy+dy3] * 0.05555555555555555 + +in[xy-dy3] * -0.05555555555555555; }); }); } @@ -126,7 +126,7 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, q.submit([&](cl::sycl::handler& h) { auto in = 
d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for({n-8,n-8}, {4,4}, [=] (auto it) { + h.parallel_for(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.125 +in[it[0]*n+(it[1]-1)] * -0.125 +in[(it[0]+1)*n+it[1]] * 0.125 @@ -135,10 +135,10 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, +in[it[0]*n+(it[1]-2)] * -0.0625 +in[(it[0]+2)*n+it[1]] * 0.0625 +in[(it[0]-2)*n+it[1]] * -0.0625 - +in[it[0]*n+(it[1]+3)] * 0.0416666666667 - +in[it[0]*n+(it[1]-3)] * -0.0416666666667 - +in[(it[0]+3)*n+it[1]] * 0.0416666666667 - +in[(it[0]-3)*n+it[1]] * -0.0416666666667 + +in[it[0]*n+(it[1]+3)] * 0.041666666666666664 + +in[it[0]*n+(it[1]-3)] * -0.041666666666666664 + +in[(it[0]+3)*n+it[1]] * 0.041666666666666664 + +in[(it[0]-3)*n+it[1]] * -0.041666666666666664 +in[it[0]*n+(it[1]+4)] * 0.03125 +in[it[0]*n+(it[1]-4)] * -0.03125 +in[(it[0]+4)*n+it[1]] * 0.03125 @@ -160,7 +160,7 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); - h.parallel_for({n-8,n-8}, {4,4}, [=] (auto it) { + h.parallel_for(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * 0.125 +in[xy-dx1] * -0.125 @@ -170,10 +170,10 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ +in[xy-dx2] * -0.0625 +in[xy+dy2] * 0.0625 +in[xy-dy2] * -0.0625 - +in[xy+dx3] * 0.0416666666667 - +in[xy-dx3] * -0.0416666666667 - +in[xy+dy3] * 0.0416666666667 - +in[xy-dy3] * -0.0416666666667 + +in[xy+dx3] * 0.041666666666666664 + +in[xy-dx3] * -0.041666666666666664 + +in[xy+dy3] * 0.041666666666666664 + +in[xy-dy3] * -0.041666666666666664 +in[xy+dx4] * 0.03125 +in[xy-dx4] * -0.03125 +in[xy+dy4] * 0.03125 @@ -187,7 +187,7 @@ void star5(cl::sycl::queue & 
q, const size_t n, cl::sycl::buffer & d_in, q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); - h.parallel_for({n-10,n-10}, {5,5}, [=] (auto it) { + h.parallel_for(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.1 +in[it[0]*n+(it[1]-1)] * -0.1 +in[(it[0]+1)*n+it[1]] * 0.1 @@ -196,10 +196,10 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, +in[it[0]*n+(it[1]-2)] * -0.05 +in[(it[0]+2)*n+it[1]] * 0.05 +in[(it[0]-2)*n+it[1]] * -0.05 - +in[it[0]*n+(it[1]+3)] * 0.0333333333333 - +in[it[0]*n+(it[1]-3)] * -0.0333333333333 - +in[(it[0]+3)*n+it[1]] * 0.0333333333333 - +in[(it[0]-3)*n+it[1]] * -0.0333333333333 + +in[it[0]*n+(it[1]+3)] * 0.03333333333333333 + +in[it[0]*n+(it[1]-3)] * -0.03333333333333333 + +in[(it[0]+3)*n+it[1]] * 0.03333333333333333 + +in[(it[0]-3)*n+it[1]] * -0.03333333333333333 +in[it[0]*n+(it[1]+4)] * 0.025 +in[it[0]*n+(it[1]-4)] * -0.025 +in[(it[0]+4)*n+it[1]] * 0.025 @@ -227,7 +227,7 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0}); cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5}); - h.parallel_for({n-10,n-10}, {5,5}, [=] (auto it) { + h.parallel_for(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * 0.1 +in[xy-dx1] * -0.1 @@ -237,10 +237,10 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ +in[xy-dx2] * -0.05 +in[xy+dy2] * 0.05 +in[xy-dy2] * -0.05 - +in[xy+dx3] * 0.0333333333333 - +in[xy-dx3] * -0.0333333333333 - +in[xy+dy3] * 0.0333333333333 - +in[xy-dy3] * -0.0333333333333 + +in[xy+dx3] * 0.03333333333333333 + +in[xy-dx3] * -0.03333333333333333 + +in[xy+dy3] * 0.03333333333333333 + +in[xy-dy3] * -0.03333333333333333 +in[xy+dx4] * 0.025 +in[xy-dx4] * -0.025 +in[xy+dy4] 
* 0.025 From d675e38a9a64129fdae52a327bfc941e58c1dba1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 20 Mar 2018 06:06:56 -0700 Subject: [PATCH 066/245] gross fix for SYCL p2p corner update wasn't coherent with parallel sweep. this is the wrong solution but is at least correct with ComputeCpp. triSYCL didn't care because it uses OpenMP host execution. --- Cxx11/p2p-hyperplane-sycl.cc | 58 ++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc index 58f7e7e2d..8d6e23595 100644 --- a/Cxx11/p2p-hyperplane-sycl.cc +++ b/Cxx11/p2p-hyperplane-sycl.cc @@ -156,45 +156,57 @@ int main(int argc, char* argv[]) cl::sycl::queue q; { - cl::sycl::buffer d_grid { h_grid.data(), h_grid.size() }; - for (auto iter = 0; iter<=iterations; iter++) { if (iter==1) pipeline_time = prk::wtime(); - for (int i=2; i<=2*n-2; i++) { + { + cl::sycl::buffer d_grid { h_grid.data(), h_grid.size() }; + + for (int i=2; i<=2*n-2; i++) { - cl::sycl::id<1> I{unsigned(i)}; - cl::sycl::id<1> One{1}; + cl::sycl::id<1> I{unsigned(i)}; + cl::sycl::id<1> One{1}; - q.submit([&](cl::sycl::handler& h) { + q.submit([&](cl::sycl::handler& h) { - auto grid = d_grid.get_access(h); + auto grid = d_grid.get_access(h); - unsigned begin = std::max(2,i-n+2); - unsigned end = std::min(i,n)+1; - unsigned range = end-begin; + unsigned begin = std::max(2,i-n+2); + unsigned end = std::min(i,n)+1; + unsigned range = end-begin; - h.parallel_for(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> J) { - cl::sycl::id<1> N{unsigned(n)}; - cl::sycl::id<1> X{I-J+One}; - cl::sycl::id<1> Y{J-One}; - cl::sycl::id<1> Xold{X-One}; // x-1 - cl::sycl::id<1> Yold{Y-One}; // y-1 - cl::sycl::id<1> index0{X*N+Y}; - cl::sycl::id<1> index1{Xold*N+Y}; - cl::sycl::id<1> index2{X*N+Yold}; - cl::sycl::id<1> index3{Xold*N+Yold}; - grid[index0] = grid[index1] + grid[index2] - grid[index3]; + 
h.parallel_for(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> j) { + auto J = j.get_id(); + cl::sycl::id<1> N{unsigned(n)}; + cl::sycl::id<1> X{I-J+One}; + cl::sycl::id<1> Y{J-One}; + cl::sycl::id<1> Xold{X-One}; // x-1 + cl::sycl::id<1> Yold{Y-One}; // y-1 + cl::sycl::id<1> index0{X*N+Y}; + cl::sycl::id<1> index1{Xold*N+Y}; + cl::sycl::id<1> index2{X*N+Yold}; + cl::sycl::id<1> index3{Xold*N+Yold}; + grid[index0] = grid[index1] + grid[index2] - grid[index3]; + //std::cout << "I,J=" << I[0] << "," << J[0] << "\n"; + }); }); - }); - q.wait(); + q.wait(); + } } h_grid[0*n+0] = -h_grid[(n-1)*n+(n-1)]; } pipeline_time = prk::wtime() - pipeline_time; } +#if 0 + for (int i=0; i Date: Tue, 20 Mar 2018 06:32:36 -0700 Subject: [PATCH 067/245] improve SYCL p2p quite a bit it still performs terribly but at least the design isn't trash. --- Cxx11/p2p-hyperplane-sycl.cc | 62 ++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc index 8d6e23595..305f5de38 100644 --- a/Cxx11/p2p-hyperplane-sycl.cc +++ b/Cxx11/p2p-hyperplane-sycl.cc @@ -156,46 +156,52 @@ int main(int argc, char* argv[]) cl::sycl::queue q; { + cl::sycl::buffer d_grid { h_grid.data(), h_grid.size() }; + for (auto iter = 0; iter<=iterations; iter++) { if (iter==1) pipeline_time = prk::wtime(); - { - cl::sycl::buffer d_grid { h_grid.data(), h_grid.size() }; - - for (int i=2; i<=2*n-2; i++) { + for (int i=2; i<=2*n-2; i++) { - cl::sycl::id<1> I{unsigned(i)}; - cl::sycl::id<1> One{1}; + cl::sycl::id<1> I{unsigned(i)}; + cl::sycl::id<1> One{1}; - q.submit([&](cl::sycl::handler& h) { + q.submit([&](cl::sycl::handler& h) { - auto grid = d_grid.get_access(h); + auto grid = d_grid.get_access(h); - unsigned begin = std::max(2,i-n+2); - unsigned end = std::min(i,n)+1; - unsigned range = end-begin; + unsigned begin = std::max(2,i-n+2); + unsigned end = std::min(i,n)+1; + unsigned range = end-begin; 
- h.parallel_for(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> j) { - auto J = j.get_id(); - cl::sycl::id<1> N{unsigned(n)}; - cl::sycl::id<1> X{I-J+One}; - cl::sycl::id<1> Y{J-One}; - cl::sycl::id<1> Xold{X-One}; // x-1 - cl::sycl::id<1> Yold{Y-One}; // y-1 - cl::sycl::id<1> index0{X*N+Y}; - cl::sycl::id<1> index1{Xold*N+Y}; - cl::sycl::id<1> index2{X*N+Yold}; - cl::sycl::id<1> index3{Xold*N+Yold}; - grid[index0] = grid[index1] + grid[index2] - grid[index3]; - //std::cout << "I,J=" << I[0] << "," << J[0] << "\n"; - }); + h.parallel_for(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> j) { + auto J = j.get_id(); + cl::sycl::id<1> N{unsigned(n)}; + cl::sycl::id<1> X{I-J+One}; + cl::sycl::id<1> Y{J-One}; + cl::sycl::id<1> Xold{X-One}; // x-1 + cl::sycl::id<1> Yold{Y-One}; // y-1 + cl::sycl::id<1> index0{X*N+Y}; + cl::sycl::id<1> index1{Xold*N+Y}; + cl::sycl::id<1> index2{X*N+Yold}; + cl::sycl::id<1> index3{Xold*N+Yold}; + grid[index0] = grid[index1] + grid[index2] - grid[index3]; }); - q.wait(); - } + }); + q.wait(); } - h_grid[0*n+0] = -h_grid[(n-1)*n+(n-1)]; + q.submit([&](cl::sycl::handler& h) { + + auto grid = d_grid.get_access(h); + + h.single_task([=] { + grid[0*n+0] = -grid[(n-1)*n+(n-1)]; + }); + }); + q.wait(); } + q.wait(); pipeline_time = prk::wtime() - pipeline_time; } From b0479199e6ae063c4d47dc33cfdc9a4e2e34914f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 21 Mar 2018 06:25:06 -0700 Subject: [PATCH 068/245] improve SYCL example build options [ci skip] --- common/make.defs.llvm | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 817f9da7d..2e1ab47de 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -60,8 +60,17 @@ OPENCLFLAG=-framework OpenCL SYCLDIR=/opt/sycl/latest SYCLCXX=${SYCLDIR}/bin/compute++ SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib 
-lComputeCpp +SYCLFLAG+=-std=c++14 # This makes a huge difference in e.g. nstream... -SYCLFLAG+=-no-serial-memop +#SYCLFLAG+=-no-serial-memop +# CentOS7 and Ubuntu14 built for this +#SYCLFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +# PRK header rejects GCC4 +#SYCLFLAG+=--gcc-toolchain=/swtools/gcc/5.4.0 +# If not found automatically +#SYCLFLAG+=${OPENCLFLAG} +# NVIDIA target +#SYCLFLAG+=-sycl-target ptx64 # # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... From a6d234dc305ba566f643bbba79301495b7b3c2a6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 26 Mar 2018 09:58:46 -0700 Subject: [PATCH 069/245] ORNL-ACC wavefront (C++ and Fortran) (#325) ORNL-ACC (aka OpenACC) wavefront * add ACC p2p w/ innerloop strategy * add ignoring * add attempt at OACC async (tasks) * PGI fixes * progress on OpenACC p2p * add ACC p2p in C++ * p2p with ACC in C++ * add ORNL-ACC C++ p2p to Travis --- .gitignore | 4 + Cxx11/Makefile | 8 +- Cxx11/p2p-hyperplane-vector-ornlacc.cc | 197 +++++++++++++++++++++ FORTRAN/Makefile | 2 +- FORTRAN/nstream.f90 | 30 +++- FORTRAN/p2p-async-ornlacc.f90 | 230 +++++++++++++++++++++++++ FORTRAN/p2p-innerloop-ornlacc.f90 | 191 ++++++++++++++++++++ travis/build-run-prk.sh | 5 + 8 files changed, 659 insertions(+), 8 deletions(-) create mode 100644 Cxx11/p2p-hyperplane-vector-ornlacc.cc create mode 100644 FORTRAN/p2p-async-ornlacc.f90 create mode 100644 FORTRAN/p2p-innerloop-ornlacc.f90 diff --git a/.gitignore b/.gitignore index 44d361e21..1dc8a0272 100644 --- a/.gitignore +++ b/.gitignore @@ -194,12 +194,15 @@ FORTRAN/p2p-tasks-openmp FORTRAN/p2p-doacross-openmp FORTRAN/p2p-innerloop-openmp FORTRAN/p2p-datapar-openmp +FORTRAN/p2p-innerloop-ornlacc +FORTRAN/p2p-ornlacc FORTRAN/stencil FORTRAN/stencil-coarray FORTRAN/stencil-openmp FORTRAN/stencil-openmp-target FORTRAN/stencil-pretty FORTRAN/stencil-taskloop-openmp +FORTRAN/stencil-ornlacc FORTRAN/transpose FORTRAN/transpose-coarray FORTRAN/transpose-openmp @@ -207,6 
+210,7 @@ FORTRAN/transpose-openmp-target FORTRAN/transpose-pretty FORTRAN/transpose-taskloop-openmp FORTRAN/transpose-tasks-openmp +FORTRAN/transpose-ornlacc RUST/p2p RUST/stencil RUST/transpose diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 190b44dbb..5538ceabd 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -44,6 +44,7 @@ STLFLAGS = $(STLFLAG) $(BOOSTFLAGS) PSTLFLAGS = $(PSTLFLAG) $(BOOSTFLAGS) RAJAFLAGS = $(RAJAFLAG) KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(BOOSTFLAGS) +ORNLACCFLAGS = $(ORNLACCFLAG) ifdef OCCADIR include ${OCCADIR}/scripts/makefile @@ -66,7 +67,7 @@ all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sy p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ - p2p-hyperplane-sycl + p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \ stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \ @@ -117,6 +118,8 @@ cublas: transpose-cublas nstream-cublas occa: transpose-occa nstream-occa +ornlacc: p2p-hyperplane-vector-ornlacc + boost-compute: nstream-vector-boost-compute # busted #nstream-valarray-boost-compute @@ -181,6 +184,9 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(info PRK help: Set OCCA_CXX=$(firstword $(CXX)) to use that compiler for OKL files.) 
$(CXX) $(CXXFLAGS) $< $(OCCAFLAGS) -o $@ +%-ornlacc: %-ornlacc.cc prk_util.h + $(CXX) $(CXXFLAGS) $< $(ORNLACCFLAGS) -o $@ + %: %.cc prk_util.h $(CXX) $(CXXFLAGS) $< -o $@ diff --git a/Cxx11/p2p-hyperplane-vector-ornlacc.cc b/Cxx11/p2p-hyperplane-vector-ornlacc.cc new file mode 100644 index 000000000..eb4a092e1 --- /dev/null +++ b/Cxx11/p2p-hyperplane-vector-ornlacc.cc @@ -0,0 +1,197 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an n^2 grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// C99-ification by Jeff Hammond, February 2016. +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/ORNL-ACC HYPERPLANE pipeline execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int n, nc, nb; + try { + if (argc < 3) { + throw " <# iterations> []"; + } + + // number of times to run the pipeline algorithm + iterations = std::atoi(argv[1]); + if (iterations < 0) { + throw "ERROR: iterations must be >= 1"; + } + + // grid dimensions + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimensions must be positive"; + } else if ( static_cast(n)*static_cast(n) > INT_MAX) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // grid chunk dimensions + nc = (argc > 3) ? 
std::atoi(argv[3]) : 1; + nc = std::max(1,nc); + nc = std::min(n,nc); + + // number of grid blocks + nb = (n-1)/nc; + if ((n-1)%nc) nb++; + //std::cerr << "n=" << n << std::endl; + //std::cerr << "nb=" << nb << std::endl; + //std::cerr << "nc=" << nc << std::endl; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid sizes = " << n << ", " << n << std::endl; + std::cout << "Grid chunk sizes = " << nc << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto pipeline_time = 0.0; + + double * grid = new double[n*n]; + + for (int i=0; i(j); + } + for (int i=0; i(i); + } + + #pragma acc data pcopy(grid) + { + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) pipeline_time = prk::wtime(); + + if (nc==1) { + for (int i=2; i<=2*n-2; i++) { + #pragma acc parallel loop independent + for (int j=std::max(2,i-n+2); j<=std::min(i,n); j++) { + const int x = i-j+1; + const int y = j-1; + grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; + } + } + } else { + for (int i=2; i<=2*(nb+1)-2; i++) { + #pragma acc parallel loop gang + for (int j=std::max(2,i-(nb+1)+2); j<=std::min(i,nb+1); j++) { + const int ib = nc*(i-j)+1; + const int jb = nc*(j-2)+1; + //sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid); + #pragma acc loop vector + for (int i=ib; i epsilon) { + std::cout << "ERROR: checksum " << grid[(n-1)*n+(n-1)] + << " does not match verification value " << corner_val << std::endl; + return 1; + } + +#ifdef VERBOSE + std::cout << "Solution validates; verification value = " << corner_val << std::endl; +#else + std::cout << "Solution validates" << std::endl; +#endif + auto avgtime = pipeline_time/iterations; + std::cout << "Rate (MFlops/s): " + << 
2.0e-6 * ( (n-1.)*(n-1.) )/avgtime + << " Avg time (s): " << avgtime << std::endl; + + return 0; +} diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index ee57e8255..898a237c4 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -59,7 +59,7 @@ coarray: p2p-coarray stencil-coarray transpose-coarray target: stencil-openmp-target transpose-openmp-target nstream-openmp-target dgemm-openmp-target -ornlacc: p2p-ornlacc stencil-ornlacc transpose-ornlacc nstream-ornlacc +ornlacc: p2p-ornlacc p2p-innerloop-ornlacc stencil-ornlacc transpose-ornlacc nstream-ornlacc %: %.f90 $(FC) $(FCFLAGS) $< -o $@ diff --git a/FORTRAN/nstream.f90 b/FORTRAN/nstream.f90 index 5b7799e1f..63986ab54 100644 --- a/FORTRAN/nstream.f90 +++ b/FORTRAN/nstream.f90 @@ -183,13 +183,24 @@ program main #if defined(_OPENMP) !$omp do do i=1,length + A(i) = 0 + B(i) = 2 + C(i) = 2 + enddo + !$omp end do +#elif defined(PGI) + forall (i=1:length) + A(i) = 0 + B(i) = 2 + C(i) = 2 + end forall #else do concurrent (i=1:length) -#endif A(i) = 0 B(i) = 2 C(i) = 2 enddo +#endif ! need this because otherwise no barrier between initialization ! and iteration 0 (warmup), which will lead to incorrectness. @@ -211,11 +222,18 @@ program main #if defined(_OPENMP) !$omp do do i=1,length + A(i) = A(i) + B(i) + scalar * C(i) + enddo + !$omp end do +#elif defined(PGI) + forall (i=1:length) + A(i) = A(i) + B(i) + scalar * C(i) + end forall #else do concurrent (i=1:length) -#endif A(i) = A(i) + B(i) + scalar * C(i) enddo +#endif enddo ! 
iterations t1 = prk_get_wtime() @@ -241,16 +259,16 @@ program main ar = ar * length asum = 0 -#if defined(_OPENMP) +#if defined(_OPENMP) || defined(PGI) !$omp parallel do reduction(+:asum) do i=1,length + asum = asum + abs(A(i)) + enddo + !$omp end parallel do #else do concurrent (i=1:length) -#endif asum = asum + abs(A(i)) enddo -#ifdef _OPENMP - !$omp end parallel do #endif deallocate( C ) diff --git a/FORTRAN/p2p-async-ornlacc.f90 b/FORTRAN/p2p-async-ornlacc.f90 new file mode 100644 index 000000000..e42cbb46d --- /dev/null +++ b/FORTRAN/p2p-async-ornlacc.f90 @@ -0,0 +1,230 @@ +! +! Copyright (c) 2015, Intel Corporation +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! 
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!******************************************************************* +! NAME: Pipeline +! +! PURPOSE: This program tests the efficiency with which point-to-point +! synchronization can be carried out. It does so by executing +! a pipelined algorithm on an m*n grid. The first array dimension +! is distributed among the threads (stripwise decomposition). +! +! USAGE: The program takes as input the +! dimensions of the grid, and the number of iterations on the grid +! +! +! +! The output consists of diagnostics to make sure the +! algorithm worked, and of timing statistics. +! +! FUNCTIONS CALLED: +! +! Other than standard C functions, the following +! functions are used in this program: +! +! HISTORY: - Written by Rob Van der Wijngaart, February 2009. +! Converted to Fortran by Jeff Hammond, January 2016. +! ******************************************************************* + +function prk_get_wtime() result(t) + use iso_fortran_env + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) +end function prk_get_wtime + +subroutine sweep_tile(startm,endm,startn,endn,m,n,grid) + use iso_fortran_env + implicit none + integer(kind=INT32), intent(in) :: m,n + integer(kind=INT32), intent(in) :: startm,endm + integer(kind=INT32), intent(in) :: startn,endn + real(kind=REAL64), intent(inout) :: grid(m,n) + integer(kind=INT32) :: i,j + !$acc kernels + do j=startn,endn + do i=startm,endm + grid(i,j) = grid(i-1,j) + grid(i,j-1) - grid(i-1,j-1) + enddo + enddo + !$acc end kernels +end subroutine + +program main + use iso_fortran_env + implicit none + real(kind=REAL64) :: prk_get_wtime + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! 
problem definition + integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm + integer(kind=INT32) :: m, n + real(kind=REAL64) :: corner_val ! verification value at top right corner of grid + real(kind=REAL64), allocatable :: grid(:,:) ! array holding grid values + ! runtime variables + integer(kind=INT32) :: i, j, k + integer(kind=INT32) :: ic, mc ! ic = chunking index, mc = chunking dimension + integer(kind=INT32) :: jc, nc ! jc = chunking index, nc = chunking dimension + integer(kind=INT32) :: lic, ljc ! hold indexes of last block + real(kind=REAL64) :: t0, t1, pipeline_time, avgtime ! timing parameters + real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance + + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a52)') 'Fortran ORNL-ACC TASKS pipeline execution on 2D grid' + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a34,2a39)') 'Usage: ./synch_p2p <# iterations> ', & + ' ', & + ' ' + stop 1 + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + + m = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') m + + n = m + if (command_argument_count().gt.2) then + call get_command_argument(3,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') n + + mc = m + call get_command_argument(4,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') mc + + nc = n + call get_command_argument(5,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') nc + endif + + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations + stop 1 + endif + + if ((m .lt. 1).or.(n .lt. 
1)) then + write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', m, n + stop 1 + endif + + ! mc=m or nc=n disables chunking in that dimension, which means + ! there is no task parallelism to exploit + !if (((mc.lt.1).or.(mc.gt.m)).or.((nc.lt.1).or.(nc.gt.n))) then + ! mc = int(m/omp_get_max_threads()) + ! nc = int(n/omp_get_max_threads()) + !endif + mc = max(1,mc) + nc = max(1,nc) + + write(*,'(a,i8)') 'Number of iterations = ', iterations + write(*,'(a,i8,i8)') 'Grid sizes = ', m, n + write(*,'(a,i8,i8)') 'Size of chunking = ', mc, nc + + allocate( grid(m,n), stat=err) + if (err .ne. 0) then + write(*,'(a,i3)') 'allocation of grid returned ',err + stop 1 + endif + + lic = (m/mc-1) * mc + 2 + ljc = (n/nc-1) * nc + 2 + + !$acc parallel loop gang + do j=1,n + !$acc loop vector + do i=1,m + grid(i,j) = 0.0d0 + enddo + enddo + do j=1,n + grid(1,j) = real(j-1,REAL64) + enddo + do i=1,m + grid(i,1) = real(i-1,REAL64) + enddo + + !$acc data pcopy(grid) + + do k=0,iterations + + if (k.eq.1) t0 = prk_get_wtime() + + do ic=2,m,mc + do jc=2,n,nc + !$acc async(grid(ic,jc)) wait(grid(1,1)) & + !$acc& wait(grid(ic-mc,jc-nc)) wait(grid(ic-mc,jc)) & + !$acc& wait(grid(ic,jc-nc)) wait(grid(ic,jc)) + call sweep_tile(ic,min(m,ic+mc-1),jc,min(n,jc+nc-1),m,n,grid) + enddo + enddo + !$acc async(grid(1,1)) wait(grid(lic,ljc)) + grid(1,1) = -grid(m,n) + + enddo + + t1 = prk_get_wtime() + pipeline_time = t1 - t0 + + !$acc end data + + ! ******************************************************************** + ! ** Analyze and output results. + ! ******************************************************************** + + ! verify correctness, using top right value + corner_val = real((iterations+1)*(n+m-2),REAL64); + if (abs(grid(m,n)-corner_val)/corner_val .gt. 
epsilon) then + write(*,'(a,f10.2,a,f10.2)') 'ERROR: checksum ',grid(m,n), & + ' does not match verification value ', corner_val + stop 1 + endif + + write(*,'(a)') 'Solution validates' + avgtime = pipeline_time/iterations + write(*,'(a,f13.6,a,f10.6)') 'Rate (MFlop/s): ',2.d-6*real((m-1)*(n-1),INT64)/avgtime, & + ' Avg time (s): ', avgtime + + deallocate( grid ) + +end program diff --git a/FORTRAN/p2p-innerloop-ornlacc.f90 b/FORTRAN/p2p-innerloop-ornlacc.f90 new file mode 100644 index 000000000..9e5ff8da7 --- /dev/null +++ b/FORTRAN/p2p-innerloop-ornlacc.f90 @@ -0,0 +1,191 @@ +! +! Copyright (c) 2015, Intel Corporation +! +! Redistribution and use in source and binary forms, with or without +! modification, are permitted provided that the following conditions +! are met: +! +! * Redistributions of source code must retain the above copyright +! notice, this list of conditions and the following disclaimer. +! * Redistributions in binary form must reproduce the above +! copyright notice, this list of conditions and the following +! disclaimer in the documentation and/or other materials provided +! with the distribution. +! * Neither the name of Intel Corporation nor the names of its +! contributors may be used to endorse or promote products +! derived from this software without specific prior written +! permission. +! +! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +! "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +! FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +! COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +! INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +! BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +! CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +! 
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +! ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +! POSSIBILITY OF SUCH DAMAGE. + +!******************************************************************* +! NAME: Pipeline +! +! PURPOSE: This program tests the efficiency with which point-to-point +! synchronization can be carried out. It does so by executing +! a pipelined algorithm on an m*n grid. The first array dimension +! is distributed among the threads (stripwise decomposition). +! +! USAGE: The program takes as input the +! dimensions of the grid, and the number of iterations on the grid +! +! +! +! The output consists of diagnostics to make sure the +! algorithm worked, and of timing statistics. +! +! FUNCTIONS CALLED: +! +! Other than standard C functions, the following +! functions are used in this program: +! +! HISTORY: - Written by Rob Van der Wijngaart, February 2009. +! Converted to Fortran by Jeff Hammond, January 2016. +! ******************************************************************* + +function prk_get_wtime() result(t) + use iso_fortran_env + implicit none + real(kind=REAL64) :: t + integer(kind=INT64) :: c, r + call system_clock(count = c, count_rate = r) + t = real(c,REAL64) / real(r,REAL64) +end function prk_get_wtime + +program main + use iso_fortran_env + implicit none + real(kind=REAL64) :: prk_get_wtime + ! for argument parsing + integer :: err + integer :: arglen + character(len=32) :: argtmp + ! problem definition + integer(kind=INT32) :: iterations ! number of times to run the pipeline algorithm + integer(kind=INT32) :: n + real(kind=REAL64) :: corner_val ! verification value at top right corner of grid + real(kind=REAL64), allocatable :: grid(:,:) ! array holding grid values + ! runtime variables + integer(kind=INT32) :: i, j, k + integer(kind=INT32) :: x, y + real(kind=REAL64) :: t0, t1, pipeline_time, avgtime ! timing parameters + real(kind=REAL64), parameter :: epsilon=1.D-8 ! 
error tolerance + + ! ******************************************************************** + ! read and test input parameters + ! ******************************************************************** + + write(*,'(a25)') 'Parallel Research Kernels' + write(*,'(a55)') 'Fortran OpenACC INNERLOOP pipeline execution on 2D grid' + + if (command_argument_count().lt.2) then + write(*,'(a17,i1)') 'argument count = ', command_argument_count() + write(*,'(a34,a16)') 'Usage: ./synch_p2p <# iterations> ', & + '' + stop 1 + endif + + iterations = 1 + call get_command_argument(1,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') iterations + + n = 1 + call get_command_argument(2,argtmp,arglen,err) + if (err.eq.0) read(argtmp,'(i32)') n + + if (n .gt. 16384) then + write(*,'(a,i5)') 'WARNING: grid size exceeds 16384: ', n + write(*,'(a)') 'PGI 17.10 + CUDA 9.0 generates illegal address' + endif + + if (iterations .lt. 1) then + write(*,'(a,i5)') 'ERROR: iterations must be >= 1 : ', iterations + stop 1 + endif + + if (n .lt. 1) then + write(*,'(a,i5,i5)') 'ERROR: array dimensions must be >= 1 : ', n + stop 1 + endif + +#ifdef _OPENMP + write(*,'(a,i8)') 'Number of threads = ', omp_get_max_threads() +#endif + write(*,'(a,i8)') 'Number of iterations = ', iterations + write(*,'(a,i8,i8)') 'Grid sizes = ', n, n + + allocate( grid(n,n), stat=err) + if (err .ne. 
0) then + write(*,'(a,i3)') 'allocation of grid returned ',err + stop 1 + endif + + do j=1,n + do i=1,n + grid(i,j) = 0.0d0 + enddo + enddo + do j=1,n + grid(1,j) = real(j-1,REAL64) + enddo + do i=1,n + grid(i,1) = real(i-1,REAL64) + enddo + + !$acc data pcopy(grid) + + do k=0,iterations + + if (k.eq.1) t0 = prk_get_wtime() + + do i=2,2*n-2 + !$acc parallel loop independent + do j=max(2,i-n+2),min(i,n) + x = i-j+2 + y = j + grid(x,y) = grid(x-1,y) + grid(x,y-1) - grid(x-1,y-1) + enddo + enddo + !$acc kernels + grid(1,1) = -grid(n,n) + !$acc end kernels + + enddo + + t1 = prk_get_wtime() + + !$acc end data + + pipeline_time = t1 - t0 + + ! ******************************************************************** + ! ** Analyze and output results. + ! ******************************************************************** + + ! verify correctness, using top right value + corner_val = real((iterations+1)*(2*n-2),REAL64); + if (abs(grid(n,n)-corner_val)/corner_val .gt. epsilon) then + write(*,'(a,f10.2,a,f10.2)') 'ERROR: checksum ',grid(n,n), & + ' does not match verification value ', corner_val + stop 1 + endif + + write(*,'(a)') 'Solution validates' + avgtime = pipeline_time/iterations + write(*,'(a,f13.6,a,f10.6)') 'Rate (MFlop/s): ',2.d-6*real((n-1)*(n-1),REAL64)/avgtime, & + ' Avg time (s): ', avgtime + + deallocate( grid ) + +end program diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 7e78338fd..73883df11 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -384,6 +384,11 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r done done + # ORNL-ACC + echo "ORNLACCFLAG=-fopenacc" >> common/make.defs + make -C $PRK_TARGET_PATH p2p-hyperplane-vector-ornlacc + $PRK_TARGET_PATH/p2p-hyperplane-vector-ornlacc 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-vector-ornlacc 10 1024 64 ;; clang) # Host From ec62d470095b6021345e6a25b52729ea30253cb0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 4 Apr 2018 14:28:13 
-0700 Subject: [PATCH 070/245] add CUBLAS DGEMM (#326) - add a bunch of overdue ignoring - twiddle transpose-cublas output --- .gitignore | 27 +++- Cxx11/Makefile | 4 +- Cxx11/dgemm-cublas.cu | 252 ++++++++++++++++++++++++++++++++++++++ Cxx11/transpose-cublas.cu | 4 +- 4 files changed, 280 insertions(+), 7 deletions(-) create mode 100644 Cxx11/dgemm-cublas.cu diff --git a/.gitignore b/.gitignore index 1dc8a0272..259f7766b 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,9 @@ octave-workspace # Octave crashes *.swp # Vim */*.swp */*/*.swp +*.swo # Vim +*/*.swo +*/*/*.swo *.dSYM # Mac */*.dSYM */*/*.dSYM @@ -43,6 +46,10 @@ func.c # PRK C89 stencil generated code *.output # ALCF Cobalt scheduler *.error # ALCF Cobalt scheduler +*.log +*.log2 +*.log3 + MPI1/AMR/amr MPI1/Branch/branch MPI1/DGEMM/dgemm @@ -106,6 +113,8 @@ C1z/transpose-target C1z/transpose-taskloop C1z/transpose-ispc Cxx11/dgemm-vector +Cxx11/dgemm-cblas +Cxx11/dgemm-cublas Cxx11/p2p-openmp-target Cxx11/p2p-tasks-openmp Cxx11/p2p-vector @@ -119,6 +128,10 @@ Cxx11/p2p-innerloop-vector Cxx11/p2p-hyperplane-vector Cxx11/p2p-hyperplane-vector-openmp Cxx11/p2p-innerloop-vector-tbb +Cxx11/p2p-hyperplane-vector-stl +Cxx11/p2p-hyperplane-vector-pstl +Cxx11/p2p-hyperplane-vector-tbb +Cxx11/p2p-hyperplane-sycl Cxx11/nstream-kokkos Cxx11/nstream-opencl Cxx11/nstream-valarray @@ -132,6 +145,10 @@ Cxx11/nstream-vector-taskloop Cxx11/nstream-vector-tbb Cxx11/nstream-valarray-boost-compute Cxx11/nstream-vector-boost-compute +Cxx11/nstream-cublas +Cxx11/nstream-cuda +Cxx11/nstream-openmp-target +Cxx11/nstream-sycl Cxx11/sparse-vector Cxx11/stencil-opencl Cxx11/stencil-openmp-target @@ -145,7 +162,10 @@ Cxx11/stencil-vector-rangefor Cxx11/stencil-vector-tbb Cxx11/stencil-vector-taskloop Cxx11/stencil-kokkos +Cxx11/stencil-cuda +Cxx11/stencil-sycl Cxx11/transpose-opencl +Cxx11/transpose-sycl Cxx11/transpose-openmp-target Cxx11/transpose-valarray Cxx11/transpose-vector @@ -160,6 +180,8 @@ 
Cxx11/transpose-vector-rangefor Cxx11/transpose-vector-tbb Cxx11/transpose-vector-taskloop Cxx11/transpose-kokkos +Cxx11/transpose-cublas +Cxx11/transpose-cuda Cxx11/grid1.cl Cxx11/grid2.cl Cxx11/grid3.cl @@ -181,11 +203,13 @@ Cxx11/star9.cl FORTRAN/dgemm-taskloop-openmp FORTRAN/dgemm-pretty FORTRAN/dgemm-openmp +FORTRAN/dgemm-openmp-target FORTRAN/dgemm FORTRAN/nstream FORTRAN/nstream-openmp FORTRAN/nstream-pretty FORTRAN/nstream-taskloop-openmp +FORTRAN/nstream-openmp-target FORTRAN/p2p FORTRAN/p2p-innerloop FORTRAN/p2p-coarray @@ -214,6 +238,3 @@ FORTRAN/transpose-ornlacc RUST/p2p RUST/stencil RUST/transpose -p2p-hyperplane-vector-stl -p2p-hyperplane-vector-pstl -p2p-hyperplane-vector-tbb diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 5538ceabd..99c2b34b8 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -82,7 +82,7 @@ nstream: nstream-valarray nstream-vector nstream-vector-openmp nstream-openmp-ta nstream-vector-taskloop nstream-vector-stl nstream-vector-pstl nstream-vector-raja \ nstream-vector-rangefor nstream-vector-tbb nstream-kokkos nstream-opencl -dgemm: dgemm-vector dgemm-cblas +dgemm: dgemm-vector dgemm-cblas dgemm-cublas vector: p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector sparse-vector dgemm-vector \ transpose-vector-async transpose-vector-thread @@ -114,7 +114,7 @@ raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-r cuda: stencil-cuda transpose-cuda nstream-cuda -cublas: transpose-cublas nstream-cublas +cublas: transpose-cublas nstream-cublas dgemm-cublas occa: transpose-occa nstream-occa diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu new file mode 100644 index 000000000..3dad895a0 --- /dev/null +++ b/Cxx11/dgemm-cublas.cu @@ -0,0 +1,252 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * 
Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: dgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> [] +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. 
+/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_cuda.h" + +__global__ void init(unsigned order, double * A, double * B, double * C) +{ + auto i = blockIdx.x * blockDim.x + threadIdx.x; + auto j = blockIdx.y * blockDim.y + threadIdx.y; + + if ((i "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + if (argc > 3) { + input_copy = std::atoi(argv[2]); + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Input copy = " << (input_copy ? 
"yes" : "no") << std::endl; + + cublasHandle_t h; + prk::CUDA::check( cublasCreate(&h) ); + + int tile_size = 32; + dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); + dim3 dimBlock(tile_size, tile_size, 1); + + info.checkDims(dimBlock, dimGrid); + + ////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + double dgemm_time(0); + + const size_t nelems = (size_t)order * (size_t)order; + const size_t bytes = nelems * sizeof(double); + + // host buffers + double * h_a; + double * h_b; + double * h_c; + prk::CUDA::check( cudaMallocHost((void**)&h_a, bytes) ); + prk::CUDA::check( cudaMallocHost((void**)&h_b, bytes) ); + prk::CUDA::check( cudaMallocHost((void**)&h_c, bytes) ); + + // device buffers + double * d_a; + double * d_b; + double * d_c; + prk::CUDA::check( cudaMalloc((void**)&d_a, bytes) ); + prk::CUDA::check( cudaMalloc((void**)&d_b, bytes) ); + prk::CUDA::check( cudaMalloc((void**)&d_c, bytes) ); + + if (input_copy) { + + for (int i=0; i>>(order, d_c); + + } else { + + init<<>>(order, d_a, d_b, d_c); + + } + + { + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) dgemm_time = prk::wtime(); + + if (input_copy) { + prk::CUDA::check( cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice) ); + prk::CUDA::check( cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice) ); + } + + double one(1); + prk::CUDA::check( cublasDgemm(h, + CUBLAS_OP_N, CUBLAS_OP_N, // opA, opB + order, order, order, // m, n, k + &one, // alpha + d_a, order, // A, lda + d_b, order, // B, ldb + &one, // beta + d_c, order) ); // C, ldc + prk::CUDA::check( cudaDeviceSynchronize() ); + } + dgemm_time = prk::wtime() - dgemm_time; + } + + // copy output back to host + prk::CUDA::check( cudaMemcpy(&(h_c[0]), d_c, bytes, cudaMemcpyDeviceToHost) ); + + prk::CUDA::check( cudaFree(d_c) ); + prk::CUDA::check( cudaFree(d_b) ); + prk::CUDA::check( 
cudaFree(d_a) ); + + prk::CUDA::check( cudaFreeHost(h_a) ); + prk::CUDA::check( cudaFreeHost(h_b) ); + + prk::CUDA::check( cublasDestroy(h) ); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const auto epsilon = 1.0e-8; + const auto forder = static_cast(order); + const auto reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1); + const auto checksum = prk_reduce( &(h_c[0]), &(h_c[nelems]), 0.0); + const auto residuum = std::abs(checksum-reference)/reference; + + if (residuum < epsilon) { +#if VERBOSE + std::cout << "Reference checksum = " << reference << "\n" + << "Actual checksum = " << checksum << std::endl; +#endif + std::cout << "Solution validates" << std::endl; + auto avgtime = dgemm_time/iterations; + auto nflops = 2.0 * std::pow(forder,3); + std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "Reference checksum = " << reference << "\n" + << "Actual checksum = " << checksum << std::endl; + return 1; + } + + prk::CUDA::check( cudaFreeHost(h_c) ); + + return 0; +} + + diff --git a/Cxx11/transpose-cublas.cu b/Cxx11/transpose-cublas.cu index 2ec85d35b..4f265599c 100644 --- a/Cxx11/transpose-cublas.cu +++ b/Cxx11/transpose-cublas.cu @@ -93,8 +93,8 @@ int main(int argc, char * argv[]) return 1; } - std::cout << "Matrix order = " << order << std::endl; - std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; cublasHandle_t h; //prk::CUDA::check( cublasInit() ); From 509e765bb964984bc9e40b8fbe87a3fcec4a8ecc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 4 Apr 2018 14:30:17 -0700 Subject: [PATCH 071/245] bugfix input parse arg 3 --- Cxx11/dgemm-cublas.cu | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/Cxx11/dgemm-cublas.cu b/Cxx11/dgemm-cublas.cu index 3dad895a0..08bcf80c7 100644 --- a/Cxx11/dgemm-cublas.cu +++ b/Cxx11/dgemm-cublas.cu @@ -116,7 +116,7 @@ int main(int argc, char * argv[]) } if (argc > 3) { - input_copy = std::atoi(argv[2]); + input_copy = std::atoi(argv[3]); } } catch (const char * e) { From 803217dd36c03751110997be80b70adf511bf215 Mon Sep 17 00:00:00 2001 From: caizixian Date: Tue, 10 Apr 2018 03:37:50 +1000 Subject: [PATCH 072/245] Fix misleading comments (#327) --- OPENMP/Transpose/transpose.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/OPENMP/Transpose/transpose.c b/OPENMP/Transpose/transpose.c index f89623278..70b67178c 100644 --- a/OPENMP/Transpose/transpose.c +++ b/OPENMP/Transpose/transpose.c @@ -42,7 +42,7 @@ USAGE: Program input is three command line arguments that give the matrix order, the number of times to repeat the operation (iterations), and the number of threads to use: - transpose <# threads> <# iterations> [tile size] + transpose <# threads> <# iterations> [tile size] An optional parameter specifies the tile size used to divide the individual matrix blocks for improved cache and TLB performance. 
From b7bddfcfd2398a5b5908d2012d084eaecd571a26 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 12 Apr 2018 12:21:07 -0700 Subject: [PATCH 073/245] PGI+GPU cleanup (#328) fix a bunch of things * add OACC p2p w/ innerloop strategy * git ignore binaries * add attempt at OACC async (tasks) * PGI fixes * PGI fixes * add OACC p2p in C++ * add OACC C++ p2p to Travis * explicit grid size because PGI 17.4 complained * clean ornlacc binaries * add comment to help user * cleanup printout * ignore C++ header checking with PGI * fix pcopy bug * add CUDA to PGI example; add flag that fixes p2p issue * add PGI example flags for PSTL/RAJA/KOKKOS (only STL tested) * add warning about correctness issue --- .gitignore | 3 ++ Cxx11/Makefile | 1 + Cxx11/p2p-hyperplane-vector-ornlacc.cc | 2 +- Cxx11/prk_util.h | 2 +- Cxx11/stencil-cuda.cu | 3 ++ FORTRAN/nstream-openmp-target.f90 | 11 +++--- FORTRAN/nstream-ornlacc.f90 | 13 ++------ FORTRAN/nstream-pretty.f90 | 6 ++-- FORTRAN/nstream-taskloop-openmp.f90 | 8 ++--- FORTRAN/nstream.f90 | 18 ++++++---- FORTRAN/p2p-innerloop-ornlacc.f90 | 1 + common/make.defs.pgi | 46 ++++++++++++++++++++++++-- 12 files changed, 81 insertions(+), 33 deletions(-) diff --git a/.gitignore b/.gitignore index 259f7766b..d5100141d 100644 --- a/.gitignore +++ b/.gitignore @@ -210,6 +210,7 @@ FORTRAN/nstream-openmp FORTRAN/nstream-pretty FORTRAN/nstream-taskloop-openmp FORTRAN/nstream-openmp-target +FORTRAN/nstream-ornlacc FORTRAN/p2p FORTRAN/p2p-innerloop FORTRAN/p2p-coarray @@ -225,6 +226,7 @@ FORTRAN/stencil-coarray FORTRAN/stencil-openmp FORTRAN/stencil-openmp-target FORTRAN/stencil-pretty +FORTRAN/stencil-ornlacc FORTRAN/stencil-taskloop-openmp FORTRAN/stencil-ornlacc FORTRAN/transpose @@ -232,6 +234,7 @@ FORTRAN/transpose-coarray FORTRAN/transpose-openmp FORTRAN/transpose-openmp-target FORTRAN/transpose-pretty +FORTRAN/transpose-ornlacc FORTRAN/transpose-taskloop-openmp FORTRAN/transpose-tasks-openmp FORTRAN/transpose-ornlacc diff --git a/Cxx11/Makefile 
b/Cxx11/Makefile index 99c2b34b8..3af1ad2e7 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -221,6 +221,7 @@ clean: -rm -f *-cblas -rm -f *-occa -rm -f *-boost-compute + -rm -f *-ornlacc -rm -f transpose-vector-async transpose-vector-thread cleancl: diff --git a/Cxx11/p2p-hyperplane-vector-ornlacc.cc b/Cxx11/p2p-hyperplane-vector-ornlacc.cc index eb4a092e1..05aac1ced 100644 --- a/Cxx11/p2p-hyperplane-vector-ornlacc.cc +++ b/Cxx11/p2p-hyperplane-vector-ornlacc.cc @@ -132,7 +132,7 @@ int main(int argc, char* argv[]) grid[i*n+0] = static_cast(i); } - #pragma acc data pcopy(grid) + #pragma acc data pcopy(grid[0:n*n]) { for (auto iter = 0; iter<=iterations; iter++) { diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 0109ba684..8bb718fe0 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -40,7 +40,7 @@ #include // Test standard library _after_ standard headers have been included... -#if !defined(__NVCC__) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI) +#if !defined(__NVCC__) && !defined(__PGI) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI) # error You are using an ancient version GNU libstdc++. Either upgrade your GCC or tell ICC to use a newer version via the -gxx-name= option. 
#endif diff --git a/Cxx11/stencil-cuda.cu b/Cxx11/stencil-cuda.cu index ba544ada7..fcfe9e48c 100644 --- a/Cxx11/stencil-cuda.cu +++ b/Cxx11/stencil-cuda.cu @@ -122,6 +122,9 @@ int main(int argc, char* argv[]) tile_size = std::atoi(argv[3]); if (tile_size <= 0) tile_size = n; if (tile_size > n) tile_size = n; + if (tile_size > 32) { + std::cout << "Warning: tile_size > 32 may lead to incorrect results (observed for CUDA 9.0 on GV100).\n"; + } } // stencil pattern diff --git a/FORTRAN/nstream-openmp-target.f90 b/FORTRAN/nstream-openmp-target.f90 index fd98f717e..96c4b1679 100644 --- a/FORTRAN/nstream-openmp-target.f90 +++ b/FORTRAN/nstream-openmp-target.f90 @@ -134,10 +134,10 @@ program main endif endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix length = ', length - write(*,'(a,i8)') 'Offset = ', offset + write(*,'(a,i12)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a,i12)') 'Number of iterations = ', iterations + write(*,'(a,i12)') 'Matrix length = ', length + write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** ! ** Allocate space for the input and transpose matrix @@ -189,10 +189,11 @@ program main enddo ! iterations t1 = omp_get_wtime() - nstream_time = t1 - t0 !$omp end target data + nstream_time = t1 - t0 + ! ******************************************************************** ! ** Analyze and output results. ! 
******************************************************************** diff --git a/FORTRAN/nstream-ornlacc.f90 b/FORTRAN/nstream-ornlacc.f90 index e6d073947..033dee814 100644 --- a/FORTRAN/nstream-ornlacc.f90 +++ b/FORTRAN/nstream-ornlacc.f90 @@ -133,9 +133,9 @@ program main endif endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix length = ', length - write(*,'(a,i8)') 'Offset = ', offset + write(*,'(a,i12)') 'Number of iterations = ', iterations + write(*,'(a,i12)') 'Vector length = ', length + write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** ! ** Allocate space for the input and transpose matrix @@ -163,13 +163,6 @@ program main t0 = 0 -#ifdef _OPENMP -!$omp parallel default(none) & -!$omp& shared(A,B,C,t0,t1) & -!$omp& firstprivate(length,iterations,offset,scalar) & -!$omp& private(i,k) -#endif - !$acc parallel loop gang do i=1,length A(i) = 0 diff --git a/FORTRAN/nstream-pretty.f90 b/FORTRAN/nstream-pretty.f90 index be0fb217d..a15e365ec 100644 --- a/FORTRAN/nstream-pretty.f90 +++ b/FORTRAN/nstream-pretty.f90 @@ -133,9 +133,9 @@ program main endif endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix length = ', length - write(*,'(a,i8)') 'Offset = ', offset + write(*,'(a,i12)') 'Number of iterations = ', iterations + write(*,'(a,i12)') 'Vector length = ', length + write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** ! 
** Allocate space for the input and transpose matrix diff --git a/FORTRAN/nstream-taskloop-openmp.f90 b/FORTRAN/nstream-taskloop-openmp.f90 index 553a220fc..636e45d73 100644 --- a/FORTRAN/nstream-taskloop-openmp.f90 +++ b/FORTRAN/nstream-taskloop-openmp.f90 @@ -124,10 +124,10 @@ program main endif endif - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix length = ', length - write(*,'(a,i8)') 'Offset = ', offset + write(*,'(a,i12)') 'Number of threads = ', omp_get_max_threads() + write(*,'(a,i12)') 'Number of iterations = ', iterations + write(*,'(a,i12)') 'Matrix length = ', length + write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** ! ** Allocate space for the input and transpose matrix diff --git a/FORTRAN/nstream.f90 b/FORTRAN/nstream.f90 index 63986ab54..9d35024b5 100644 --- a/FORTRAN/nstream.f90 +++ b/FORTRAN/nstream.f90 @@ -141,11 +141,11 @@ program main endif #ifdef _OPENMP - write(*,'(a,i8)') 'Number of threads = ',omp_get_max_threads() + write(*,'(a,i12)') 'Number of threads = ', omp_get_max_threads() #endif - write(*,'(a,i8)') 'Number of iterations = ', iterations - write(*,'(a,i8)') 'Matrix length = ', length - write(*,'(a,i8)') 'Offset = ', offset + write(*,'(a,i12)') 'Number of iterations = ', iterations + write(*,'(a,i12)') 'Vector length = ', length + write(*,'(a,i12)') 'Offset = ', offset ! ******************************************************************** ! ** Allocate space for the input and transpose matrix @@ -212,10 +212,10 @@ program main #ifdef _OPENMP !$omp barrier !$omp master -#endif - t0 = prk_get_wtime() -#ifdef _OPENMP + t0 = omp_get_wtime() !$omp end master +#else + t0 = prk_get_wtime() #endif endif @@ -236,7 +236,11 @@ program main #endif enddo ! 
iterations +#ifdef _OPENMP + t1 = omp_get_wtime() +#else t1 = prk_get_wtime() +#endif #ifdef _OPENMP !$omp end parallel diff --git a/FORTRAN/p2p-innerloop-ornlacc.f90 b/FORTRAN/p2p-innerloop-ornlacc.f90 index 9e5ff8da7..32c24a4d4 100644 --- a/FORTRAN/p2p-innerloop-ornlacc.f90 +++ b/FORTRAN/p2p-innerloop-ornlacc.f90 @@ -107,6 +107,7 @@ program main if (n .gt. 16384) then write(*,'(a,i5)') 'WARNING: grid size exceeds 16384: ', n write(*,'(a)') 'PGI 17.10 + CUDA 9.0 generates illegal address' + write(*,'(a)') 'unless you compiled with -Mlarge_arrays.' endif if (iterations .lt. 1) then diff --git a/common/make.defs.pgi b/common/make.defs.pgi index ca8fb2d45..ddaf99a69 100644 --- a/common/make.defs.pgi +++ b/common/make.defs.pgi @@ -19,7 +19,9 @@ DEFAULT_OPT_FLAGS=-O2 -tp=haswell # OPENMPFLAG=-mp #-Minfo=mp,vect OFFLOADFLAG=-mp #-Minfo=mp,vect -ORNLACCFLAG=-acc -ta=multicore #-Minfo=accel +#ORNLACCFLAG=-acc -ta=multicore -Minfo=accel +ORNLACCFLAG=-acc -ta=tesla:cc70 -Minfo=accel +ORNLACCFLAG+=-Mlarge_arrays # # OpenCL flags # @@ -27,7 +29,47 @@ ORNLACCFLAG=-acc -ta=multicore #-Minfo=accel #OPENCLFLAG=-framework OpenCL # Linux OPENCLDIR=/etc/alternatives/opencl-intel-tools -OPENCLFLAG=-I$OPENCLDIR -L$OPENCLDIR/lib64 -lOpenCL +OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +# +# Parallel STL, Boost, etc. +# +BOOSTFLAG=-DUSE_BOOST -I. +PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} +KOKKOSDIR=./kokkos +KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} +RAJADIR=./raja +RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +# +# CUDA flags +# +# Linux w/ NVIDIA CUDA +# NVCC never supports the latest GCC. +# Use appropriate arch or code is compiled to ancient features. 
+#NVCC=nvcc --compiler-bindir= --gpu-architecture=sm_61 +NVCC=nvcc --gpu-architecture=sm_61 +CUDAFLAGS=-g -O3 -std=c++11 +# https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 +# heavy hammer: +#CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED +# big hammers: +#CUDAFLAGS+=-D_IMMINTRIN_H_INCLUDED +#CUDAFLAGS+=-D_FMA4INTRIN_H_INCLUDED +#CUDAFLAGS+=-D_XOPMMINTRIN_H_INCLUDED +# many tiny hammers: +CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512FINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512VLINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512BWINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512DQINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512VLBWINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512VBMIVLINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512VBMIINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512VLDQINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512CDINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512PFINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512IFMAINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED +CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED # # MPI # From 68eb648f36ff627d8c47d5775c94821d9ed47cb4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 12 Apr 2018 12:21:42 -0700 Subject: [PATCH 074/245] CUDA nstream type cleanup [ci skip] (#329) --- Cxx11/nstream-cuda.cu | 7 +++---- Cxx11/transpose-cuda.cu | 11 +++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/Cxx11/nstream-cuda.cu b/Cxx11/nstream-cuda.cu index 646d637b0..4597021bb 100644 --- a/Cxx11/nstream-cuda.cu +++ b/Cxx11/nstream-cuda.cu @@ -66,7 +66,7 @@ __global__ void nstream(const unsigned n, const prk_float scalar, prk_float * A, const prk_float * B, const prk_float * C) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { A[i] += B[i] + scalar * C[i]; } @@ -156,8 +156,7 @@ int main(int argc, char * argv[]) prk::CUDA::check( cudaMemcpy(d_B, &(h_B[0]), bytes, cudaMemcpyHostToDevice) ); prk::CUDA::check( cudaMemcpy(d_C, &(h_C[0]), bytes, cudaMemcpyHostToDevice) ); - double 
scalar(3); - + prk_float scalar(3); { for (auto iter = 0; iter<=iterations; iter++) { @@ -215,7 +214,7 @@ int main(int argc, char * argv[]) } else { std::cout << "Solution validates" << std::endl; double avgtime = nstream_time/iterations; - double nbytes = 4.0 * length * sizeof(double); + double nbytes = 4.0 * length * sizeof(prk_float); std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime << " Avg time (s): " << avgtime << std::endl; } diff --git a/Cxx11/transpose-cuda.cu b/Cxx11/transpose-cuda.cu index 1efdab462..289fbb1a3 100644 --- a/Cxx11/transpose-cuda.cu +++ b/Cxx11/transpose-cuda.cu @@ -149,9 +149,9 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; #if TILED - std::cout << "Tile size = " << tile_dim << std::endl; + std::cout << "Tile size = " << tile_dim << std::endl; #else - std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; #endif #if TILED @@ -224,9 +224,8 @@ int main(int argc, char * argv[]) /// Analyze and output results ////////////////////////////////////////////////////////////////////// - // TODO: replace with std::generate, std::accumulate, or similar - const auto addit = (iterations+1.) * (iterations/2.); - auto abserr = 0.0; + const double addit = (iterations+1.) 
* (iterations/2.); + double abserr(0); for (auto j=0; j Date: Thu, 19 Apr 2018 11:25:06 -0700 Subject: [PATCH 075/245] DGEMM batched (#330) * add support for batched BLAS in CBLAS implementation user requests batchsize n: 0 = no batching -n = loops over legacy BLAS n = uses batched BLAS currently this is only tested with MKL * cosmetic changes * add batched BLAS * add async and proper error checking to batched CUBLAS * add OpenMP to loop over GEMMs * update LLVM example * support non-MKL, better printout --- Cxx11/dgemm-cblas.cc | 215 +++++++++++++++++++++++++++++++----------- Cxx11/dgemm-cublas.cu | 180 ++++++++++++++++++++++++++--------- common/make.defs.llvm | 16 ++-- 3 files changed, 300 insertions(+), 111 deletions(-) diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc index fc5709812..cb0e44f51 100644 --- a/Cxx11/dgemm-cblas.cc +++ b/Cxx11/dgemm-cblas.cc @@ -1,5 +1,5 @@ /// -/// Copyright (c) 2017, Intel Corporation +/// Copyright (c) 2018, Intel Corporation /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -41,7 +41,7 @@ /// is carried out, and, optionally, a tile size for matrix /// blocking /// -/// <# iterations> [] +/// <# iterations> [] /// /// The output consists of diagnostics to make sure the /// algorithm worked, and of timing statistics. @@ -51,7 +51,8 @@ /// Other than OpenMP or standard C functions, the following /// functions are used in this program: /// -/// wtime() +/// cblas_dgemm() +/// cblas_dgemm_batch() /// /// HISTORY: Written by Rob Van der Wijngaart, February 2009. /// Converted to C++11 by Jeff Hammond, December, 2017. 
@@ -79,9 +80,9 @@ void prk_dgemm_loops(const int order, const std::vector & B, std::vector & C) { - for (auto i=0; i> & A, + const std::vector> & B, + std::vector> & C) +{ + const cblas_int n = order; + const double alpha = 1.0; + const double beta = 1.0; + + for (int b=0; b> & A, + const std::vector> & B, + std::vector> & C) +{ + const cblas_int n = order; + const double alpha = 1.0; + const double beta = 1.0; + +#ifdef _OPENMP +#pragma omp parallel for schedule(static) num_threads(nt) +#endif + for (int b=0; b "; + throw "Usage: <# iterations> [ ]"; } iterations = std::atoi(argv[1]); @@ -128,6 +208,18 @@ int main(int argc, char * argv[]) } else if (order > std::floor(std::sqrt(INT_MAX))) { throw "ERROR: matrix dimension too large - overflow risk"; } + + if (argc>3) { + batches = std::atoi(argv[3]); + } + + if (argc>4) { + batch_threads = std::atoi(argv[4]); + } else { +#ifdef _OPENMP + batch_threads = omp_get_max_threads(); +#endif + } } catch (const char * e) { std::cout << e << std::endl; @@ -136,6 +228,22 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; + if (batches == 0) { + std::cout << "No batching" << std::endl; + } else if (batches > 0) { +#ifdef MKL + std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; +#else + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; +#endif + } else if (batches < 0) { + if (batch_threads > 1) { + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS with " + << batch_threads << " threads)" << std::endl; + } else { + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; + } + } ////////////////////////////////////////////////////////////////////// /// Allocate space for matrices @@ -143,34 +251,44 @@ int main(int argc, char * argv[]) double dgemm_time(0); - 
std::vector A(order*order); - std::vector B(order*order); - std::vector C(order*order,0.0); -#ifdef PRK_DEBUG - const unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::default_random_engine generator(seed); - std::uniform_real_distribution uniform01(0.0, 1.0); - for (auto i=0; i const M(order*order,0); + std::vector> A(matrices,M); + std::vector> B(matrices,M); + std::vector> C(matrices,M); + for (int b=0; b 0) { + prk_dgemm(order, matrices, pA, pB, pC); + } } dgemm_time = prk::wtime() - dgemm_time; } @@ -179,30 +297,15 @@ int main(int argc, char * argv[]) /// Analyze and output results ////////////////////////////////////////////////////////////////////// - const auto epsilon = 1.0e-8; - const auto forder = static_cast(order); -#ifdef PRK_DEBUG - std::vector D(order*order,0.0);; - for (auto iter = 0; iter<=iterations; iter++) { - prk_dgemm_loops(order, A, B, D); - } + const double epsilon = 1.0e-8; + const double forder = static_cast(order); + const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1); double residuum(0); - for (auto i=0; i epsilon) { - std::cout << i << "," << j << " = " << C[i*order+j] << ", " << D[i*order+j] << "\n"; - } - } + for (int b=0; b <# iterations> [] +/// <# iterations> [] /// /// The output consists of diagnostics to make sure the /// algorithm worked, and of timing statistics. @@ -51,7 +51,8 @@ /// Other than OpenMP or standard C functions, the following /// functions are used in this program: /// -/// wtime() +/// cblasDgemm() +/// cublasDgemmStridedBatched() /// /// HISTORY: Written by Rob Van der Wijngaart, February 2009. /// Converted to C++11 by Jeff Hammond, December, 2017. 
@@ -61,28 +62,91 @@ #include "prk_util.h" #include "prk_cuda.h" -__global__ void init(unsigned order, double * A, double * B, double * C) +__global__ void init(int order, const int matrices, double * A, double * B, double * C) { - auto i = blockIdx.x * blockDim.x + threadIdx.x; - auto j = blockIdx.y * blockDim.y + threadIdx.y; - - if ((i "; + if (argc < 2) { + throw "Usage: <# iterations> [] []"; } iterations = std::atoi(argv[1]); @@ -115,7 +180,11 @@ int main(int argc, char * argv[]) throw "ERROR: matrix dimension too large - overflow risk"; } - if (argc > 3) { + if (argc>3) { + batches = std::atoi(argv[3]); + } + + if (argc > 4) { input_copy = std::atoi(argv[3]); } } @@ -126,12 +195,19 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Matrix order = " << order << std::endl; + if (batches == 0) { + std::cout << "No batching" << std::endl; + } else if (batches < 0) { + std::cout << "Batch size = " << -batches << " (loop over legacy BLAS)" << std::endl; + } else if (batches > 0) { + std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; + } std::cout << "Input copy = " << (input_copy ? "yes" : "no") << std::endl; cublasHandle_t h; prk::CUDA::check( cublasCreate(&h) ); - int tile_size = 32; + const int tile_size = 32; dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); dim3 dimBlock(tile_size, tile_size, 1); @@ -143,6 +219,7 @@ int main(int argc, char * argv[]) double dgemm_time(0); + const int matrices = (batches==0 ? 
1 : abs(batches)); const size_t nelems = (size_t)order * (size_t)order; const size_t bytes = nelems * sizeof(double); @@ -152,15 +229,15 @@ int main(int argc, char * argv[]) double * h_c; prk::CUDA::check( cudaMallocHost((void**)&h_a, bytes) ); prk::CUDA::check( cudaMallocHost((void**)&h_b, bytes) ); - prk::CUDA::check( cudaMallocHost((void**)&h_c, bytes) ); + prk::CUDA::check( cudaMallocHost((void**)&h_c, matrices*bytes) ); // device buffers double * d_a; double * d_b; double * d_c; - prk::CUDA::check( cudaMalloc((void**)&d_a, bytes) ); - prk::CUDA::check( cudaMalloc((void**)&d_b, bytes) ); - prk::CUDA::check( cudaMalloc((void**)&d_c, bytes) ); + prk::CUDA::check( cudaMalloc((void**)&d_a, matrices*bytes) ); + prk::CUDA::check( cudaMalloc((void**)&d_b, matrices*bytes) ); + prk::CUDA::check( cudaMalloc((void**)&d_c, matrices*bytes) ); if (input_copy) { @@ -171,14 +248,17 @@ int main(int argc, char * argv[]) } } - prk::CUDA::check( cudaMemcpy(d_a, &(h_a[0]), bytes, cudaMemcpyHostToDevice) ); - prk::CUDA::check( cudaMemcpy(d_b, &(h_b[0]), bytes, cudaMemcpyHostToDevice) ); + for (int b=0; b>>(order, d_c); + init<<>>(order, matrices, d_c); } else { - init<<>>(order, d_a, d_b, d_c); + init<<>>(order, matrices, d_a, d_b, d_c); } @@ -188,26 +268,26 @@ int main(int argc, char * argv[]) if (iter==1) dgemm_time = prk::wtime(); if (input_copy) { - prk::CUDA::check( cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice) ); - prk::CUDA::check( cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice) ); + for (int b=0; b 0) { + prk_bgemm(h, order, matrices, d_a, d_b, d_c); + } } dgemm_time = prk::wtime() - dgemm_time; } // copy output back to host - prk::CUDA::check( cudaMemcpy(&(h_c[0]), d_c, bytes, cudaMemcpyDeviceToHost) ); + prk::CUDA::check( cudaMemcpyAsync(&(h_c[0]), d_c, matrices*bytes, cudaMemcpyDeviceToHost) ); prk::CUDA::check( cudaFree(d_c) ); prk::CUDA::check( cudaFree(d_b) ); @@ -218,15 +298,21 @@ int main(int argc, char * argv[]) prk::CUDA::check( cublasDestroy(h) ); + 
prk::CUDA::check( cudaDeviceSynchronize() ); + ////////////////////////////////////////////////////////////////////// /// Analyze and output results ////////////////////////////////////////////////////////////////////// - const auto epsilon = 1.0e-8; - const auto forder = static_cast(order); - const auto reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1); - const auto checksum = prk_reduce( &(h_c[0]), &(h_c[nelems]), 0.0); - const auto residuum = std::abs(checksum-reference)/reference; + const double epsilon = 1.0e-8; + const double forder = static_cast(order); + const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1); + double residuum(0); + for (int b=0; b Date: Mon, 23 Apr 2018 11:16:14 +0300 Subject: [PATCH 077/245] tiled rangefor transpose (#332) * add tiling to rangefor transpose * hoist irange constructor --- Cxx11/transpose-vector-rangefor.cc | 50 ++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/Cxx11/transpose-vector-rangefor.cc b/Cxx11/transpose-vector-rangefor.cc index ee0097026..e02047a6d 100644 --- a/Cxx11/transpose-vector-rangefor.cc +++ b/Cxx11/transpose-vector-rangefor.cc @@ -39,7 +39,10 @@ /// USAGE: Program input is the matrix order and the number of times to /// repeat the operation: /// -/// transpose <# iterations> +/// transpose <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. /// /// The output consists of diagnostics to make sure the /// transpose worked and timing statistics. 
@@ -57,61 +60,72 @@ int main(int argc, char * argv[]) std::cout << "C++11/range-for Matrix transpose: B = A^T" << std::endl; ////////////////////////////////////////////////////////////////////// - /// Read and test input parameters + // Read and test input parameters ////////////////////////////////////////////////////////////////////// int iterations; int order; + int tile_size; try { if (argc < 3) { - throw "Usage: <# iterations> "; + throw "Usage: <# iterations> [tile size]"; } - // number of times to do the transpose iterations = std::atoi(argv[1]); if (iterations < 1) { throw "ERROR: iterations must be >= 1"; } - // order of a the matrix order = std::atoi(argv[2]); if (order <= 0) { throw "ERROR: Matrix Order must be greater than 0"; } else if (order > std::floor(std::sqrt(INT_MAX))) { throw "ERROR: matrix dimension too large - overflow risk"; } + + // default tile size for tiling of local transpose + tile_size = (argc>3) ? std::atoi(argv[3]) : 32; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; } catch (const char * e) { std::cout << e << std::endl; return 1; } - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Matrix order = " << order << std::endl; + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; ////////////////////////////////////////////////////////////////////// - /// Allocate space for the input and transpose matrix + // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// + auto trans_time = 0.0; + std::vector A(order*order); std::vector B(order*order,0.0); + // fill A with the sequence 0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); - auto irange = boost::irange(0,order); - auto jrange = boost::irange(0,order); - - auto trans_time = 0.0; + auto itrange = 
boost::irange(0,order,tile_size); + auto jtrange = boost::irange(0,order,tile_size); for (auto iter = 0; iter<=iterations; iter++) { if (iter==1) trans_time = prk::wtime(); - // transpose - for (auto i : irange) { - for (auto j : jrange) { - B[i*order+j] += A[j*order+i]; - A[j*order+i] += 1.0; + for (auto it : itrange) { + auto irange = boost::irange(it,std::min(order,it+tile_size)); + for (auto jt : jtrange) { + auto jrange = boost::irange(jt,std::min(order,jt+tile_size)); + for (auto i : irange) { + for (auto j : jrange) { + B[i*order+j] += A[j*order+i]; + A[j*order+i] += 1.0; + } + } } } } @@ -124,6 +138,8 @@ int main(int argc, char * argv[]) // TODO: replace with std::generate, std::accumulate, or similar const auto addit = (iterations+1.) * (iterations/2.); auto abserr = 0.0; + auto irange = boost::irange(0,order); + auto jrange = boost::irange(0,order); for (auto i : irange) { for (auto j : jrange) { const int ij = i*order+j; From a0dd2147d9b99e27b15a909c72ad12a2d56e279d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 25 Apr 2018 12:59:46 +0300 Subject: [PATCH 078/245] Kokkos mdrange (#334) * kokkos mdrange working in transpose and stencil * Kokkos MDRange improvements - remove Boost dependency - use auto less, initialize double nicer - whitespace and scoping for init/finalize destructor issue --- Cxx11/Makefile | 6 +- Cxx11/generate-cxx-stencil.py | 8 +- Cxx11/nstream-kokkos.cc | 182 ++++++------ Cxx11/stencil-kokkos.cc | 282 +++++++++--------- Cxx11/stencil_kokkos.hpp | 544 ++++++++++++++++------------------ Cxx11/stencil_openmp.hpp | 468 ++++++++++++++--------------- Cxx11/stencil_pgnu.hpp | 468 ++++++++++++++--------------- Cxx11/stencil_pstl.hpp | 468 ++++++++++++++--------------- Cxx11/stencil_raja.hpp | 468 ++++++++++++++--------------- Cxx11/stencil_rangefor.hpp | 468 ++++++++++++++--------------- Cxx11/stencil_seq.hpp | 468 ++++++++++++++--------------- Cxx11/stencil_stl.hpp | 468 ++++++++++++++--------------- Cxx11/stencil_target.hpp | 468 
++++++++++++++--------------- Cxx11/stencil_taskloop.hpp | 468 ++++++++++++++--------------- Cxx11/stencil_tbb.hpp | 468 ++++++++++++++--------------- Cxx11/transpose-kokkos.cc | 220 +++++++------- 16 files changed, 2944 insertions(+), 2978 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 3af1ad2e7..5eb4b1526 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -43,7 +43,7 @@ BOOSTFLAGS = $(BOOSTFLAG) STLFLAGS = $(STLFLAG) $(BOOSTFLAGS) PSTLFLAGS = $(PSTLFLAG) $(BOOSTFLAGS) RAJAFLAGS = $(RAJAFLAG) -KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(BOOSTFLAGS) +KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) ORNLACCFLAGS = $(ORNLACCFLAG) ifdef OCCADIR @@ -63,7 +63,7 @@ else EXTRA += target endif -all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl occa boost-compute $(EXTRA) +all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA) p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ @@ -162,7 +162,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -o $@ %-boost-compute: %-boost-compute.cc prk_util.h - $(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) $(OPENCLFLAGS) -o $@ + $(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -DUSE_BOOST_COMPUTE $(OPENCLFLAGS) -o $@ %-raja: %-raja.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index 134cd0e89..39e66459a 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -66,9 +66,8 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {\n') elif (model=='kokkos'): src.write('void '+pattern+str(radius)+'(const int n, const int t, matrix & in, matrix & out) {\n') - 
src.write(' Kokkos::parallel_for ( Kokkos::RangePolicy('+str(radius)+',n-'+str(radius)+'), KOKKOS_LAMBDA(const int i) {\n') - src.write(' PRAGMA_SIMD\n') - src.write(' for (auto j='+str(radius)+'; j>({'+str(radius)+','+str(radius)+'},{n-'+str(radius)+',n-'+str(radius)+'},{t,t});\n') + src.write(' Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {\n') elif (model=='cuda'): src.write('__global__ void '+pattern+str(radius)+'(const int n, const prk_float * in, prk_float * out) {\n') src.write(' const int i = blockIdx.x * blockDim.x + threadIdx.x;\n') @@ -82,7 +81,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' PRAGMA_SIMD\n') src.write(' for (auto j=jt; j team_policy; - //typedef Kokkos::TeamPolicy::member_type member_type; - - typedef Kokkos::View vector; - - ////////////////////////////////////////////////////////////////////// - /// Read and test input parameters - ////////////////////////////////////////////////////////////////////// - - int iterations, offset; - size_t length; - try { - if (argc < 3) { - throw "Usage: <# iterations> "; - } - - iterations = std::atoi(argv[1]); - if (iterations < 1) { - throw "ERROR: iterations must be >= 1"; - } - - length = std::atol(argv[2]); - if (length <= 0) { - throw "ERROR: vector length must be positive"; - } - - offset = (argc>3) ? 
std::atoi(argv[3]) : 0; - if (length <= 0) { - throw "ERROR: offset must be nonnegative"; - } - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } + typedef Kokkos::PRK_KOKKOS_BACKEND Space; + //typedef Kokkos::TeamPolicy team_policy; + //typedef Kokkos::TeamPolicy::member_type member_type; + + typedef Kokkos::View vector; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Vector length = " << length << std::endl; - std::cout << "Offset = " << offset << std::endl; - std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl; + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl; - ////////////////////////////////////////////////////////////////////// - // Allocate space and perform the computation - ////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + 
////////////////////////////////////////////////////////////////////// - auto nstream_time = 0.0; + double nstream_time(0); - vector A("A", length); - vector B("B", length); - vector C("C", length); + vector A("A", length); + vector B("B", length); + vector C("C", length); - double scalar(3); + const double scalar(3); - { - Kokkos::parallel_for ( length, KOKKOS_LAMBDA(const int i) { - A[i] = 0.0; - B[i] = 2.0; - C[i] = 2.0; - }); + { + Kokkos::parallel_for(length, KOKKOS_LAMBDA(size_t const i) { + A[i] = 0.0; + B[i] = 2.0; + C[i] = 2.0; + }); - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; ++iter) { - if (iter==1) nstream_time = prk::wtime(); + if (iter==1) nstream_time = prk::wtime(); - Kokkos::parallel_for( length, KOKKOS_LAMBDA(const int i) { - A[i] += B[i] + scalar * C[i]; - }); + Kokkos::parallel_for(length, KOKKOS_LAMBDA(size_t const i) { + A[i] += B[i] + scalar * C[i]; + }); + } + nstream_time = prk::wtime() - nstream_time; } - nstream_time = prk::wtime() - nstream_time; - } - ////////////////////////////////////////////////////////////////////// - /// Analyze and output results - ////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// - double ar(0); - double br(2); - double cr(2); - for (auto i=0; i<=iterations; i++) { - ar += br + scalar * cr; - } - - ar *= length; + double ar(0); + double br(2); + double cr(2); + for (int i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } - double asum(0); - Kokkos::parallel_reduce( length, KOKKOS_LAMBDA(const int i, double & inner) { - inner += std::fabs(A(i)); - }, asum); + ar *= length; + + double asum(0); + Kokkos::parallel_reduce(length, KOKKOS_LAMBDA(size_t const i, double & inner) { + inner += std::fabs(A(i)); + }, asum); + + double epsilon(1.e-8); + if (std::fabs(ar-asum)/asum 
> epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } - double epsilon(1.e-8); - if (std::fabs(ar-asum)/asum > epsilon) { - std::cout << "Failed Validation on output array\n" - << " Expected checksum: " << ar << "\n" - << " Observed checksum: " << asum << std::endl; - std::cout << "ERROR: solution did not validate" << std::endl; - return 1; - } else { - std::cout << "Solution validates" << std::endl; - double avgtime = nstream_time/iterations; - double nbytes = 4.0 * length * sizeof(double); - std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime - << " Avg time (s): " << avgtime << std::endl; } - Kokkos::finalize(); return 0; diff --git a/Cxx11/stencil-kokkos.cc b/Cxx11/stencil-kokkos.cc index be2514743..d2eb5db2a 100644 --- a/Cxx11/stencil-kokkos.cc +++ b/Cxx11/stencil-kokkos.cc @@ -72,8 +72,6 @@ void nothing(const int n, const int t, matrix & in, matrix & out) { std::cout << "You are trying to use a stencil that does not exist." << std::endl; std::cout << "Please generate the new stencil using the code generator." << std::endl; - // n will never be zero - this is to silence compiler warnings. 
- if (n==0) std::cout << in.size() << out.size() << std::endl; std::abort(); } @@ -83,166 +81,158 @@ int main(int argc, char* argv[]) std::cout << "C++11/Kokkos Stencil execution on 2D grid" << std::endl; Kokkos::initialize (argc, argv); + { + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, n, radius, tile_size; + bool star = true; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? 
false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } - ////////////////////////////////////////////////////////////////////// - // Process and test input parameters - ////////////////////////////////////////////////////////////////////// - - int iterations, n, radius, tile_size; - bool star = true; - try { - if (argc < 3) { - throw "Usage: <# iterations> [ ]"; - } - - // number of times to run the algorithm - iterations = std::atoi(argv[1]); - if (iterations < 1) { - throw "ERROR: iterations must be >= 1"; - } - - // linear grid dimension - n = std::atoi(argv[2]); - if (n < 1) { - throw "ERROR: grid dimension must be positive"; - } else if (n > std::floor(std::sqrt(INT_MAX))) { - throw "ERROR: grid dimension too large - overflow risk"; - } - - // default tile size for tiling of local transpose - tile_size = 32; - if (argc > 3) { - tile_size = std::atoi(argv[3]); - if (tile_size <= 0) tile_size = n; - if (tile_size > n) tile_size = n; - } - - // stencil pattern - if (argc > 4) { - auto stencil = std::string(argv[4]); - auto grid = std::string("grid"); - star = (stencil == grid) ? false : true; - } - - // stencil radius - radius = 2; - if (argc > 5) { - radius = std::atoi(argv[5]); - } - - if ( (radius < 1) || (2*radius+1 > n) ) { - throw "ERROR: Stencil radius negative or too large"; - } - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Type of stencil = " << (star ? 
"star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + std::cout << "Compact representation of stencil loop body" << std::endl; + std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Grid size = " << n << std::endl; - std::cout << "Tile size = " << tile_size << std::endl; - std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; - std::cout << "Radius of stencil = " << radius << std::endl; - std::cout << "Compact representation of stencil loop body" << std::endl; - std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl; - - auto stencil = nothing; - if (star) { - switch (radius) { - case 1: stencil = star1; break; - case 2: stencil = star2; break; - case 3: stencil = star3; break; - case 4: stencil = star4; break; - case 5: stencil = star5; break; - } - } else { - switch (radius) { - case 1: stencil = grid1; break; - case 2: stencil = grid2; break; - case 3: stencil = grid3; break; - case 4: stencil = grid4; break; - case 5: stencil = grid5; break; - } - } + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////// - // Allocate space and perform the computation - 
////////////////////////////////////////////////////////////////////// + double stencil_time(0); - auto stencil_time = 0.0; + // row-major 2D array + matrix in("in", n, n); + matrix out("out", n, n); - // row-major 2D array - matrix in("in", n, n); - matrix out("out", n, n); + auto z2 = {0,0}; + auto n2 = {n,n}; + auto tile2 = {tile_size,tile_size}; + auto full = Kokkos::MDRangePolicy>(z2,n2,tile2); - try { - Kokkos::parallel_for ( n,[&] (int i) { - for (auto j=0; j(i+j); - out(i,j) = 0.0; - } + Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) { + in(i,j) = static_cast(i+j); + out(i,j) = 0.0; }); - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } - catch (std::exception const & e) { - std::cout << e.what() << std::endl; - return 1; - } - - for (auto iter = 0; iter<=iterations; iter++) { - if (iter==1) stencil_time = prk::wtime(); - // Apply the stencil operator - stencil(n, tile_size, in, out); - // Add constant to solution to force refresh of neighbor data, if any - Kokkos::parallel_for ( n,[&] (int i) { - for (auto j=0; j(n-2*radius)*static_cast(n-2*radius); - // compute L1 norm in parallel - double norm = 0.0; - auto inside = boost::irange(radius,n-radius); - for (auto i : inside) { - for (auto j : inside) { - norm += std::fabs(out(i,j)); + Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) { + in(i,j) += 1.0; + }); } - } - norm /= active_points; - - // verify correctness - const double epsilon = 1.0e-8; - double reference_norm = 2.*(iterations+1.); - if (std::fabs(norm-reference_norm) > epsilon) { - std::cout << "ERROR: L1 norm = " << norm - << " Reference L1 norm = " << reference_norm << std::endl; - return 1; - } else { - std::cout << "Solution validates" << std::endl; + + stencil_time = prk::wtime() - stencil_time; + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. 
+ ////////////////////////////////////////////////////////////////////// + + size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + + double norm(0); + auto r2 = {radius,radius}; + auto nr2 = {n-radius,n-radius}; + auto inside = Kokkos::MDRangePolicy>(r2,nr2,tile2); + Kokkos::parallel_reduce(inside, KOKKOS_LAMBDA(int i, int j, double & norm) { + norm += std::fabs(out(i,j)); + }, norm); + norm /= active_points; + + // verify correctness + double const epsilon(1.0e-8); + double reference_norm = 2.*(iterations+1.); + if (std::fabs(norm-reference_norm) > epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; #ifdef VERBOSE - std::cout << "L1 norm = " << norm - << " Reference L1 norm = " << reference_norm << std::endl; + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; #endif - const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); - size_t flops = (2L*(size_t)stencil_size+1L) * active_points; - auto avgtime = stencil_time/iterations; - std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime - << " Avg time (s): " << avgtime << std::endl; + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2.*stencil_size+1.) 
* active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; } + } Kokkos::finalize(); return 0; diff --git a/Cxx11/stencil_kokkos.hpp b/Cxx11/stencil_kokkos.hpp index 5b67ee4ab..cb5009aae 100644 --- a/Cxx11/stencil_kokkos.hpp +++ b/Cxx11/stencil_kokkos.hpp @@ -1,20 +1,17 @@ void star1(const int n, const int t, matrix & in, matrix & out) { - Kokkos::parallel_for ( Kokkos::RangePolicy(1,n-1), KOKKOS_LAMBDA(const int i) { - PRAGMA_SIMD - for (auto j=1; j>({1,1},{n-1,n-1},{t,t}); + Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { + out(i,j) += +in(i+-1,j+0) * -0.5 +in(i+0,j+-1) * -0.5 +in(i+0,j+1) * 0.5 +in(i+1,j+0) * 0.5; - } }); } void star2(const int n, const int t, matrix & in, matrix & out) { - Kokkos::parallel_for ( Kokkos::RangePolicy(2,n-2), KOKKOS_LAMBDA(const int i) { - PRAGMA_SIMD - for (auto j=2; j>({2,2},{n-2,n-2},{t,t}); + Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { + out(i,j) += +in(i+-2,j+0) * -0.125 +in(i+-1,j+0) * -0.25 +in(i+0,j+-2) * -0.125 +in(i+0,j+-1) * -0.25 @@ -22,374 +19,357 @@ void star2(const int n, const int t, matrix & in, matrix & out) { +in(i+0,j+2) * 0.125 +in(i+1,j+0) * 0.25 +in(i+2,j+0) * 0.125; - } }); } void star3(const int n, const int t, matrix & in, matrix & out) { - Kokkos::parallel_for ( Kokkos::RangePolicy(3,n-3), KOKKOS_LAMBDA(const int i) { - PRAGMA_SIMD - for (auto j=3; j>({3,3},{n-3,n-3},{t,t}); + Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { + out(i,j) += +in(i+-3,j+0) * -0.0555555555556 + +in(i+-2,j+0) * -0.0833333333333 + +in(i+-1,j+0) * -0.166666666667 + +in(i+0,j+-3) * -0.0555555555556 + +in(i+0,j+-2) * -0.0833333333333 + +in(i+0,j+-1) * -0.166666666667 + +in(i+0,j+1) * 0.166666666667 + +in(i+0,j+2) * 0.0833333333333 + +in(i+0,j+3) * 0.0555555555556 + +in(i+1,j+0) * 0.166666666667 + +in(i+2,j+0) * 0.0833333333333 + +in(i+3,j+0) * 0.0555555555556; }); } void 
star4(const int n, const int t, matrix & in, matrix & out) { - Kokkos::parallel_for ( Kokkos::RangePolicy(4,n-4), KOKKOS_LAMBDA(const int i) { - PRAGMA_SIMD - for (auto j=4; j>({4,4},{n-4,n-4},{t,t}); + Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { + out(i,j) += +in(i+-4,j+0) * -0.03125 + +in(i+-3,j+0) * -0.0416666666667 +in(i+-2,j+0) * -0.0625 +in(i+-1,j+0) * -0.125 +in(i+0,j+-4) * -0.03125 - +in(i+0,j+-3) * -0.041666666666666664 + +in(i+0,j+-3) * -0.0416666666667 +in(i+0,j+-2) * -0.0625 +in(i+0,j+-1) * -0.125 +in(i+0,j+1) * 0.125 +in(i+0,j+2) * 0.0625 - +in(i+0,j+3) * 0.041666666666666664 + +in(i+0,j+3) * 0.0416666666667 +in(i+0,j+4) * 0.03125 +in(i+1,j+0) * 0.125 +in(i+2,j+0) * 0.0625 - +in(i+3,j+0) * 0.041666666666666664 + +in(i+3,j+0) * 0.0416666666667 +in(i+4,j+0) * 0.03125; - } }); } void star5(const int n, const int t, matrix & in, matrix & out) { - Kokkos::parallel_for ( Kokkos::RangePolicy(5,n-5), KOKKOS_LAMBDA(const int i) { - PRAGMA_SIMD - for (auto j=5; j>({5,5},{n-5,n-5},{t,t}); + Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { + out(i,j) += +in(i+-5,j+0) * -0.02 +in(i+-4,j+0) * -0.025 - +in(i+-3,j+0) * -0.03333333333333333 + +in(i+-3,j+0) * -0.0333333333333 +in(i+-2,j+0) * -0.05 +in(i+-1,j+0) * -0.1 +in(i+0,j+-5) * -0.02 +in(i+0,j+-4) * -0.025 - +in(i+0,j+-3) * -0.03333333333333333 + +in(i+0,j+-3) * -0.0333333333333 +in(i+0,j+-2) * -0.05 +in(i+0,j+-1) * -0.1 +in(i+0,j+1) * 0.1 +in(i+0,j+2) * 0.05 - +in(i+0,j+3) * 0.03333333333333333 + +in(i+0,j+3) * 0.0333333333333 +in(i+0,j+4) * 0.025 +in(i+0,j+5) * 0.02 +in(i+1,j+0) * 0.1 +in(i+2,j+0) * 0.05 - +in(i+3,j+0) * 0.03333333333333333 + +in(i+3,j+0) * 0.0333333333333 +in(i+4,j+0) * 0.025 +in(i+5,j+0) * 0.02; - } }); } void grid1(const int n, const int t, matrix & in, matrix & out) { - Kokkos::parallel_for ( Kokkos::RangePolicy(1,n-1), KOKKOS_LAMBDA(const int i) { - PRAGMA_SIMD - for (auto j=1; j>({1,1},{n-1,n-1},{t,t}); + Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { + 
out(i,j) += +in(i+-1,j+-1) * -0.25 +in(i+-1,j+0) * -0.25 +in(i+0,j+-1) * -0.25 +in(i+0,j+1) * 0.25 +in(i+1,j+0) * 0.25 +in(i+1,j+1) * 0.25 ; - } }); } void grid2(const int n, const int t, matrix & in, matrix & out) { - Kokkos::parallel_for ( Kokkos::RangePolicy(2,n-2), KOKKOS_LAMBDA(const int i) { - PRAGMA_SIMD - for (auto j=2; j>({2,2},{n-2,n-2},{t,t}); + Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { + out(i,j) += +in(i+-2,j+-2) * -0.0625 + +in(i+-2,j+-1) * -0.0208333333333 + +in(i+-2,j+0) * -0.0208333333333 + +in(i+-2,j+1) * -0.0208333333333 + +in(i+-1,j+-2) * -0.0208333333333 +in(i+-1,j+-1) * -0.125 +in(i+-1,j+0) * -0.125 - +in(i+-1,j+2) * 0.020833333333333332 - +in(i+0,j+-2) * -0.020833333333333332 + +in(i+-1,j+2) * 0.0208333333333 + +in(i+0,j+-2) * -0.0208333333333 +in(i+0,j+-1) * -0.125 +in(i+0,j+1) * 0.125 - +in(i+0,j+2) * 0.020833333333333332 - +in(i+1,j+-2) * -0.020833333333333332 + +in(i+0,j+2) * 0.0208333333333 + +in(i+1,j+-2) * -0.0208333333333 +in(i+1,j+0) * 0.125 +in(i+1,j+1) * 0.125 - +in(i+1,j+2) * 0.020833333333333332 - +in(i+2,j+-1) * 0.020833333333333332 - +in(i+2,j+0) * 0.020833333333333332 - +in(i+2,j+1) * 0.020833333333333332 + +in(i+1,j+2) * 0.0208333333333 + +in(i+2,j+-1) * 0.0208333333333 + +in(i+2,j+0) * 0.0208333333333 + +in(i+2,j+1) * 0.0208333333333 +in(i+2,j+2) * 0.0625 ; - } }); } void grid3(const int n, const int t, matrix & in, matrix & out) { - Kokkos::parallel_for ( Kokkos::RangePolicy(3,n-3), KOKKOS_LAMBDA(const int i) { - PRAGMA_SIMD - for (auto j=3; j>({3,3},{n-3,n-3},{t,t}); + Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { + out(i,j) += +in(i+-3,j+-3) * -0.0277777777778 + +in(i+-3,j+-2) * -0.00555555555556 + +in(i+-3,j+-1) * -0.00555555555556 + +in(i+-3,j+0) * -0.00555555555556 + +in(i+-3,j+1) * -0.00555555555556 + +in(i+-3,j+2) * -0.00555555555556 + +in(i+-2,j+-3) * -0.00555555555556 + +in(i+-2,j+-2) * -0.0416666666667 + +in(i+-2,j+-1) * -0.0138888888889 + +in(i+-2,j+0) * -0.0138888888889 + 
+in(i+-2,j+1) * -0.0138888888889 + +in(i+-2,j+3) * 0.00555555555556 + +in(i+-1,j+-3) * -0.00555555555556 + +in(i+-1,j+-2) * -0.0138888888889 + +in(i+-1,j+-1) * -0.0833333333333 + +in(i+-1,j+0) * -0.0833333333333 + +in(i+-1,j+2) * 0.0138888888889 + +in(i+-1,j+3) * 0.00555555555556 + +in(i+0,j+-3) * -0.00555555555556 + +in(i+0,j+-2) * -0.0138888888889 + +in(i+0,j+-1) * -0.0833333333333 + +in(i+0,j+1) * 0.0833333333333 + +in(i+0,j+2) * 0.0138888888889 + +in(i+0,j+3) * 0.00555555555556 + +in(i+1,j+-3) * -0.00555555555556 + +in(i+1,j+-2) * -0.0138888888889 + +in(i+1,j+0) * 0.0833333333333 + +in(i+1,j+1) * 0.0833333333333 + +in(i+1,j+2) * 0.0138888888889 + +in(i+1,j+3) * 0.00555555555556 + +in(i+2,j+-3) * -0.00555555555556 + +in(i+2,j+-1) * 0.0138888888889 + +in(i+2,j+0) * 0.0138888888889 + +in(i+2,j+1) * 0.0138888888889 + +in(i+2,j+2) * 0.0416666666667 + +in(i+2,j+3) * 0.00555555555556 + +in(i+3,j+-2) * 0.00555555555556 + +in(i+3,j+-1) * 0.00555555555556 + +in(i+3,j+0) * 0.00555555555556 + +in(i+3,j+1) * 0.00555555555556 + +in(i+3,j+2) * 0.00555555555556 + +in(i+3,j+3) * 0.0277777777778 ; - } }); } void grid4(const int n, const int t, matrix & in, matrix & out) { - Kokkos::parallel_for ( Kokkos::RangePolicy(4,n-4), KOKKOS_LAMBDA(const int i) { - PRAGMA_SIMD - for (auto j=4; j>({4,4},{n-4,n-4},{t,t}); + Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { + out(i,j) += +in(i+-4,j+-4) * -0.015625 + +in(i+-4,j+-3) * -0.00223214285714 + +in(i+-4,j+-2) * -0.00223214285714 + +in(i+-4,j+-1) * -0.00223214285714 + +in(i+-4,j+0) * -0.00223214285714 + +in(i+-4,j+1) * -0.00223214285714 + +in(i+-4,j+2) * -0.00223214285714 + +in(i+-4,j+3) * -0.00223214285714 + +in(i+-3,j+-4) * -0.00223214285714 + +in(i+-3,j+-3) * -0.0208333333333 + +in(i+-3,j+-2) * -0.00416666666667 + +in(i+-3,j+-1) * -0.00416666666667 + +in(i+-3,j+0) * -0.00416666666667 + +in(i+-3,j+1) * -0.00416666666667 + +in(i+-3,j+2) * -0.00416666666667 + +in(i+-3,j+4) * 0.00223214285714 + +in(i+-2,j+-4) * 
-0.00223214285714 + +in(i+-2,j+-3) * -0.00416666666667 +in(i+-2,j+-2) * -0.03125 - +in(i+-2,j+-1) * -0.010416666666666666 - +in(i+-2,j+0) * -0.010416666666666666 - +in(i+-2,j+1) * -0.010416666666666666 - +in(i+-2,j+3) * 0.004166666666666667 - +in(i+-2,j+4) * 0.002232142857142857 - +in(i+-1,j+-4) * -0.002232142857142857 - +in(i+-1,j+-3) * -0.004166666666666667 - +in(i+-1,j+-2) * -0.010416666666666666 + +in(i+-2,j+-1) * -0.0104166666667 + +in(i+-2,j+0) * -0.0104166666667 + +in(i+-2,j+1) * -0.0104166666667 + +in(i+-2,j+3) * 0.00416666666667 + +in(i+-2,j+4) * 0.00223214285714 + +in(i+-1,j+-4) * -0.00223214285714 + +in(i+-1,j+-3) * -0.00416666666667 + +in(i+-1,j+-2) * -0.0104166666667 +in(i+-1,j+-1) * -0.0625 +in(i+-1,j+0) * -0.0625 - +in(i+-1,j+2) * 0.010416666666666666 - +in(i+-1,j+3) * 0.004166666666666667 - +in(i+-1,j+4) * 0.002232142857142857 - +in(i+0,j+-4) * -0.002232142857142857 - +in(i+0,j+-3) * -0.004166666666666667 - +in(i+0,j+-2) * -0.010416666666666666 + +in(i+-1,j+2) * 0.0104166666667 + +in(i+-1,j+3) * 0.00416666666667 + +in(i+-1,j+4) * 0.00223214285714 + +in(i+0,j+-4) * -0.00223214285714 + +in(i+0,j+-3) * -0.00416666666667 + +in(i+0,j+-2) * -0.0104166666667 +in(i+0,j+-1) * -0.0625 +in(i+0,j+1) * 0.0625 - +in(i+0,j+2) * 0.010416666666666666 - +in(i+0,j+3) * 0.004166666666666667 - +in(i+0,j+4) * 0.002232142857142857 - +in(i+1,j+-4) * -0.002232142857142857 - +in(i+1,j+-3) * -0.004166666666666667 - +in(i+1,j+-2) * -0.010416666666666666 + +in(i+0,j+2) * 0.0104166666667 + +in(i+0,j+3) * 0.00416666666667 + +in(i+0,j+4) * 0.00223214285714 + +in(i+1,j+-4) * -0.00223214285714 + +in(i+1,j+-3) * -0.00416666666667 + +in(i+1,j+-2) * -0.0104166666667 +in(i+1,j+0) * 0.0625 +in(i+1,j+1) * 0.0625 - +in(i+1,j+2) * 0.010416666666666666 - +in(i+1,j+3) * 0.004166666666666667 - +in(i+1,j+4) * 0.002232142857142857 - +in(i+2,j+-4) * -0.002232142857142857 - +in(i+2,j+-3) * -0.004166666666666667 - +in(i+2,j+-1) * 0.010416666666666666 - +in(i+2,j+0) * 0.010416666666666666 - 
+in(i+2,j+1) * 0.010416666666666666 + +in(i+1,j+2) * 0.0104166666667 + +in(i+1,j+3) * 0.00416666666667 + +in(i+1,j+4) * 0.00223214285714 + +in(i+2,j+-4) * -0.00223214285714 + +in(i+2,j+-3) * -0.00416666666667 + +in(i+2,j+-1) * 0.0104166666667 + +in(i+2,j+0) * 0.0104166666667 + +in(i+2,j+1) * 0.0104166666667 +in(i+2,j+2) * 0.03125 - +in(i+2,j+3) * 0.004166666666666667 - +in(i+2,j+4) * 0.002232142857142857 - +in(i+3,j+-4) * -0.002232142857142857 - +in(i+3,j+-2) * 0.004166666666666667 - +in(i+3,j+-1) * 0.004166666666666667 - +in(i+3,j+0) * 0.004166666666666667 - +in(i+3,j+1) * 0.004166666666666667 - +in(i+3,j+2) * 0.004166666666666667 - +in(i+3,j+3) * 0.020833333333333332 - +in(i+3,j+4) * 0.002232142857142857 - +in(i+4,j+-3) * 0.002232142857142857 - +in(i+4,j+-2) * 0.002232142857142857 - +in(i+4,j+-1) * 0.002232142857142857 - +in(i+4,j+0) * 0.002232142857142857 - +in(i+4,j+1) * 0.002232142857142857 - +in(i+4,j+2) * 0.002232142857142857 - +in(i+4,j+3) * 0.002232142857142857 + +in(i+2,j+3) * 0.00416666666667 + +in(i+2,j+4) * 0.00223214285714 + +in(i+3,j+-4) * -0.00223214285714 + +in(i+3,j+-2) * 0.00416666666667 + +in(i+3,j+-1) * 0.00416666666667 + +in(i+3,j+0) * 0.00416666666667 + +in(i+3,j+1) * 0.00416666666667 + +in(i+3,j+2) * 0.00416666666667 + +in(i+3,j+3) * 0.0208333333333 + +in(i+3,j+4) * 0.00223214285714 + +in(i+4,j+-3) * 0.00223214285714 + +in(i+4,j+-2) * 0.00223214285714 + +in(i+4,j+-1) * 0.00223214285714 + +in(i+4,j+0) * 0.00223214285714 + +in(i+4,j+1) * 0.00223214285714 + +in(i+4,j+2) * 0.00223214285714 + +in(i+4,j+3) * 0.00223214285714 +in(i+4,j+4) * 0.015625 ; - } }); } void grid5(const int n, const int t, matrix & in, matrix & out) { - Kokkos::parallel_for ( Kokkos::RangePolicy(5,n-5), KOKKOS_LAMBDA(const int i) { - PRAGMA_SIMD - for (auto j=5; j>({5,5},{n-5,n-5},{t,t}); + Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { + out(i,j) += +in(i+-5,j+-5) * -0.01 + +in(i+-5,j+-4) * -0.00111111111111 + +in(i+-5,j+-3) * -0.00111111111111 + +in(i+-5,j+-2) 
* -0.00111111111111 + +in(i+-5,j+-1) * -0.00111111111111 + +in(i+-5,j+0) * -0.00111111111111 + +in(i+-5,j+1) * -0.00111111111111 + +in(i+-5,j+2) * -0.00111111111111 + +in(i+-5,j+3) * -0.00111111111111 + +in(i+-5,j+4) * -0.00111111111111 + +in(i+-4,j+-5) * -0.00111111111111 +in(i+-4,j+-4) * -0.0125 - +in(i+-4,j+-3) * -0.0017857142857142857 - +in(i+-4,j+-2) * -0.0017857142857142857 - +in(i+-4,j+-1) * -0.0017857142857142857 - +in(i+-4,j+0) * -0.0017857142857142857 - +in(i+-4,j+1) * -0.0017857142857142857 - +in(i+-4,j+2) * -0.0017857142857142857 - +in(i+-4,j+3) * -0.0017857142857142857 - +in(i+-4,j+5) * 0.0011111111111111111 - +in(i+-3,j+-5) * -0.0011111111111111111 - +in(i+-3,j+-4) * -0.0017857142857142857 - +in(i+-3,j+-3) * -0.016666666666666666 - +in(i+-3,j+-2) * -0.0033333333333333335 - +in(i+-3,j+-1) * -0.0033333333333333335 - +in(i+-3,j+0) * -0.0033333333333333335 - +in(i+-3,j+1) * -0.0033333333333333335 - +in(i+-3,j+2) * -0.0033333333333333335 - +in(i+-3,j+4) * 0.0017857142857142857 - +in(i+-3,j+5) * 0.0011111111111111111 - +in(i+-2,j+-5) * -0.0011111111111111111 - +in(i+-2,j+-4) * -0.0017857142857142857 - +in(i+-2,j+-3) * -0.0033333333333333335 + +in(i+-4,j+-3) * -0.00178571428571 + +in(i+-4,j+-2) * -0.00178571428571 + +in(i+-4,j+-1) * -0.00178571428571 + +in(i+-4,j+0) * -0.00178571428571 + +in(i+-4,j+1) * -0.00178571428571 + +in(i+-4,j+2) * -0.00178571428571 + +in(i+-4,j+3) * -0.00178571428571 + +in(i+-4,j+5) * 0.00111111111111 + +in(i+-3,j+-5) * -0.00111111111111 + +in(i+-3,j+-4) * -0.00178571428571 + +in(i+-3,j+-3) * -0.0166666666667 + +in(i+-3,j+-2) * -0.00333333333333 + +in(i+-3,j+-1) * -0.00333333333333 + +in(i+-3,j+0) * -0.00333333333333 + +in(i+-3,j+1) * -0.00333333333333 + +in(i+-3,j+2) * -0.00333333333333 + +in(i+-3,j+4) * 0.00178571428571 + +in(i+-3,j+5) * 0.00111111111111 + +in(i+-2,j+-5) * -0.00111111111111 + +in(i+-2,j+-4) * -0.00178571428571 + +in(i+-2,j+-3) * -0.00333333333333 +in(i+-2,j+-2) * -0.025 - +in(i+-2,j+-1) * -0.008333333333333333 - 
+in(i+-2,j+0) * -0.008333333333333333 - +in(i+-2,j+1) * -0.008333333333333333 - +in(i+-2,j+3) * 0.0033333333333333335 - +in(i+-2,j+4) * 0.0017857142857142857 - +in(i+-2,j+5) * 0.0011111111111111111 - +in(i+-1,j+-5) * -0.0011111111111111111 - +in(i+-1,j+-4) * -0.0017857142857142857 - +in(i+-1,j+-3) * -0.0033333333333333335 - +in(i+-1,j+-2) * -0.008333333333333333 + +in(i+-2,j+-1) * -0.00833333333333 + +in(i+-2,j+0) * -0.00833333333333 + +in(i+-2,j+1) * -0.00833333333333 + +in(i+-2,j+3) * 0.00333333333333 + +in(i+-2,j+4) * 0.00178571428571 + +in(i+-2,j+5) * 0.00111111111111 + +in(i+-1,j+-5) * -0.00111111111111 + +in(i+-1,j+-4) * -0.00178571428571 + +in(i+-1,j+-3) * -0.00333333333333 + +in(i+-1,j+-2) * -0.00833333333333 +in(i+-1,j+-1) * -0.05 +in(i+-1,j+0) * -0.05 - +in(i+-1,j+2) * 0.008333333333333333 - +in(i+-1,j+3) * 0.0033333333333333335 - +in(i+-1,j+4) * 0.0017857142857142857 - +in(i+-1,j+5) * 0.0011111111111111111 - +in(i+0,j+-5) * -0.0011111111111111111 - +in(i+0,j+-4) * -0.0017857142857142857 - +in(i+0,j+-3) * -0.0033333333333333335 - +in(i+0,j+-2) * -0.008333333333333333 + +in(i+-1,j+2) * 0.00833333333333 + +in(i+-1,j+3) * 0.00333333333333 + +in(i+-1,j+4) * 0.00178571428571 + +in(i+-1,j+5) * 0.00111111111111 + +in(i+0,j+-5) * -0.00111111111111 + +in(i+0,j+-4) * -0.00178571428571 + +in(i+0,j+-3) * -0.00333333333333 + +in(i+0,j+-2) * -0.00833333333333 +in(i+0,j+-1) * -0.05 +in(i+0,j+1) * 0.05 - +in(i+0,j+2) * 0.008333333333333333 - +in(i+0,j+3) * 0.0033333333333333335 - +in(i+0,j+4) * 0.0017857142857142857 - +in(i+0,j+5) * 0.0011111111111111111 - +in(i+1,j+-5) * -0.0011111111111111111 - +in(i+1,j+-4) * -0.0017857142857142857 - +in(i+1,j+-3) * -0.0033333333333333335 - +in(i+1,j+-2) * -0.008333333333333333 + +in(i+0,j+2) * 0.00833333333333 + +in(i+0,j+3) * 0.00333333333333 + +in(i+0,j+4) * 0.00178571428571 + +in(i+0,j+5) * 0.00111111111111 + +in(i+1,j+-5) * -0.00111111111111 + +in(i+1,j+-4) * -0.00178571428571 + +in(i+1,j+-3) * -0.00333333333333 + +in(i+1,j+-2) * 
-0.00833333333333 +in(i+1,j+0) * 0.05 +in(i+1,j+1) * 0.05 - +in(i+1,j+2) * 0.008333333333333333 - +in(i+1,j+3) * 0.0033333333333333335 - +in(i+1,j+4) * 0.0017857142857142857 - +in(i+1,j+5) * 0.0011111111111111111 - +in(i+2,j+-5) * -0.0011111111111111111 - +in(i+2,j+-4) * -0.0017857142857142857 - +in(i+2,j+-3) * -0.0033333333333333335 - +in(i+2,j+-1) * 0.008333333333333333 - +in(i+2,j+0) * 0.008333333333333333 - +in(i+2,j+1) * 0.008333333333333333 + +in(i+1,j+2) * 0.00833333333333 + +in(i+1,j+3) * 0.00333333333333 + +in(i+1,j+4) * 0.00178571428571 + +in(i+1,j+5) * 0.00111111111111 + +in(i+2,j+-5) * -0.00111111111111 + +in(i+2,j+-4) * -0.00178571428571 + +in(i+2,j+-3) * -0.00333333333333 + +in(i+2,j+-1) * 0.00833333333333 + +in(i+2,j+0) * 0.00833333333333 + +in(i+2,j+1) * 0.00833333333333 +in(i+2,j+2) * 0.025 - +in(i+2,j+3) * 0.0033333333333333335 - +in(i+2,j+4) * 0.0017857142857142857 - +in(i+2,j+5) * 0.0011111111111111111 - +in(i+3,j+-5) * -0.0011111111111111111 - +in(i+3,j+-4) * -0.0017857142857142857 - +in(i+3,j+-2) * 0.0033333333333333335 - +in(i+3,j+-1) * 0.0033333333333333335 - +in(i+3,j+0) * 0.0033333333333333335 - +in(i+3,j+1) * 0.0033333333333333335 - +in(i+3,j+2) * 0.0033333333333333335 - +in(i+3,j+3) * 0.016666666666666666 - +in(i+3,j+4) * 0.0017857142857142857 - +in(i+3,j+5) * 0.0011111111111111111 - +in(i+4,j+-5) * -0.0011111111111111111 - +in(i+4,j+-3) * 0.0017857142857142857 - +in(i+4,j+-2) * 0.0017857142857142857 - +in(i+4,j+-1) * 0.0017857142857142857 - +in(i+4,j+0) * 0.0017857142857142857 - +in(i+4,j+1) * 0.0017857142857142857 - +in(i+4,j+2) * 0.0017857142857142857 - +in(i+4,j+3) * 0.0017857142857142857 + +in(i+2,j+3) * 0.00333333333333 + +in(i+2,j+4) * 0.00178571428571 + +in(i+2,j+5) * 0.00111111111111 + +in(i+3,j+-5) * -0.00111111111111 + +in(i+3,j+-4) * -0.00178571428571 + +in(i+3,j+-2) * 0.00333333333333 + +in(i+3,j+-1) * 0.00333333333333 + +in(i+3,j+0) * 0.00333333333333 + +in(i+3,j+1) * 0.00333333333333 + +in(i+3,j+2) * 0.00333333333333 + 
+in(i+3,j+3) * 0.0166666666667 + +in(i+3,j+4) * 0.00178571428571 + +in(i+3,j+5) * 0.00111111111111 + +in(i+4,j+-5) * -0.00111111111111 + +in(i+4,j+-3) * 0.00178571428571 + +in(i+4,j+-2) * 0.00178571428571 + +in(i+4,j+-1) * 0.00178571428571 + +in(i+4,j+0) * 0.00178571428571 + +in(i+4,j+1) * 0.00178571428571 + +in(i+4,j+2) * 0.00178571428571 + +in(i+4,j+3) * 0.00178571428571 +in(i+4,j+4) * 0.0125 - +in(i+4,j+5) * 0.0011111111111111111 - +in(i+5,j+-4) * 0.0011111111111111111 - +in(i+5,j+-3) * 0.0011111111111111111 - +in(i+5,j+-2) * 0.0011111111111111111 - +in(i+5,j+-1) * 0.0011111111111111111 - +in(i+5,j+0) * 0.0011111111111111111 - +in(i+5,j+1) * 0.0011111111111111111 - +in(i+5,j+2) * 0.0011111111111111111 - +in(i+5,j+3) * 0.0011111111111111111 - +in(i+5,j+4) * 0.0011111111111111111 + +in(i+4,j+5) * 0.00111111111111 + +in(i+5,j+-4) * 0.00111111111111 + +in(i+5,j+-3) * 0.00111111111111 + +in(i+5,j+-2) * 0.00111111111111 + +in(i+5,j+-1) * 0.00111111111111 + +in(i+5,j+0) * 0.00111111111111 + +in(i+5,j+1) * 0.00111111111111 + +in(i+5,j+2) * 0.00111111111111 + +in(i+5,j+3) * 0.00111111111111 + +in(i+5,j+4) * 0.00111111111111 +in(i+5,j+5) * 0.01 ; - } }); } diff --git a/Cxx11/stencil_openmp.hpp b/Cxx11/stencil_openmp.hpp index 2d01b06e7..42edf4570 100644 --- a/Cxx11/stencil_openmp.hpp +++ b/Cxx11/stencil_openmp.hpp @@ -43,18 +43,18 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) { - out[i*n+j] 
+= +in[(i+-3)*n+(j+0)] * -0.05555555555555555 - +in[(i+-2)*n+(j+0)] * -0.08333333333333333 - +in[(i+-1)*n+(j+0)] * -0.16666666666666666 - +in[(i+0)*n+(j+-3)] * -0.05555555555555555 - +in[(i+0)*n+(j+-2)] * -0.08333333333333333 - +in[(i+0)*n+(j+-1)] * -0.16666666666666666 - +in[(i+0)*n+(j+1)] * 0.16666666666666666 - +in[(i+0)*n+(j+2)] * 0.08333333333333333 - +in[(i+0)*n+(j+3)] * 0.05555555555555555 - +in[(i+1)*n+(j+0)] * 0.16666666666666666 - +in[(i+2)*n+(j+0)] * 0.08333333333333333 - +in[(i+3)*n+(j+0)] * 0.05555555555555555; + out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556 + +in[(i+-2)*n+(j+0)] * -0.0833333333333 + +in[(i+-1)*n+(j+0)] * -0.166666666667 + +in[(i+0)*n+(j+-3)] * -0.0555555555556 + +in[(i+0)*n+(j+-2)] * -0.0833333333333 + +in[(i+0)*n+(j+-1)] * -0.166666666667 + +in[(i+0)*n+(j+1)] * 0.166666666667 + +in[(i+0)*n+(j+2)] * 0.0833333333333 + +in[(i+0)*n+(j+3)] * 0.0555555555556 + +in[(i+1)*n+(j+0)] * 0.166666666667 + +in[(i+2)*n+(j+0)] * 0.0833333333333 + +in[(i+3)*n+(j+0)] * 0.0555555555556; }); }); } @@ -47,20 +47,20 @@ void star4(const int n, const int t, std::vector & in, std::vector(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) { out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125 - +in[(i+-3)*n+(j+0)] * -0.041666666666666664 + +in[(i+-3)*n+(j+0)] * -0.0416666666667 +in[(i+-2)*n+(j+0)] * -0.0625 +in[(i+-1)*n+(j+0)] * -0.125 +in[(i+0)*n+(j+-4)] * -0.03125 - +in[(i+0)*n+(j+-3)] * -0.041666666666666664 + +in[(i+0)*n+(j+-3)] * -0.0416666666667 +in[(i+0)*n+(j+-2)] * -0.0625 +in[(i+0)*n+(j+-1)] * -0.125 +in[(i+0)*n+(j+1)] * 0.125 +in[(i+0)*n+(j+2)] * 0.0625 - +in[(i+0)*n+(j+3)] * 0.041666666666666664 + +in[(i+0)*n+(j+3)] * 0.0416666666667 +in[(i+0)*n+(j+4)] * 0.03125 +in[(i+1)*n+(j+0)] * 0.125 +in[(i+2)*n+(j+0)] * 0.0625 - +in[(i+3)*n+(j+0)] * 0.041666666666666664 + +in[(i+3)*n+(j+0)] * 0.0416666666667 +in[(i+4)*n+(j+0)] * 0.03125; }); }); @@ -71,22 +71,22 
@@ void star5(const int n, const int t, std::vector & in, std::vector(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) { out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02 +in[(i+-4)*n+(j+0)] * -0.025 - +in[(i+-3)*n+(j+0)] * -0.03333333333333333 + +in[(i+-3)*n+(j+0)] * -0.0333333333333 +in[(i+-2)*n+(j+0)] * -0.05 +in[(i+-1)*n+(j+0)] * -0.1 +in[(i+0)*n+(j+-5)] * -0.02 +in[(i+0)*n+(j+-4)] * -0.025 - +in[(i+0)*n+(j+-3)] * -0.03333333333333333 + +in[(i+0)*n+(j+-3)] * -0.0333333333333 +in[(i+0)*n+(j+-2)] * -0.05 +in[(i+0)*n+(j+-1)] * -0.1 +in[(i+0)*n+(j+1)] * 0.1 +in[(i+0)*n+(j+2)] * 0.05 - +in[(i+0)*n+(j+3)] * 0.03333333333333333 + +in[(i+0)*n+(j+3)] * 0.0333333333333 +in[(i+0)*n+(j+4)] * 0.025 +in[(i+0)*n+(j+5)] * 0.02 +in[(i+1)*n+(j+0)] * 0.1 +in[(i+2)*n+(j+0)] * 0.05 - +in[(i+3)*n+(j+0)] * 0.03333333333333333 + +in[(i+3)*n+(j+0)] * 0.0333333333333 +in[(i+4)*n+(j+0)] * 0.025 +in[(i+5)*n+(j+0)] * 0.02; }); @@ -111,24 +111,24 @@ void grid2(const int n, const int t, std::vector & in, std::vector(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type j) { out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625 - +in[(i+-2)*n+(j+-1)] * -0.020833333333333332 - +in[(i+-2)*n+(j+0)] * -0.020833333333333332 - +in[(i+-2)*n+(j+1)] * -0.020833333333333332 - +in[(i+-1)*n+(j+-2)] * -0.020833333333333332 + +in[(i+-2)*n+(j+-1)] * -0.0208333333333 + +in[(i+-2)*n+(j+0)] * -0.0208333333333 + +in[(i+-2)*n+(j+1)] * -0.0208333333333 + +in[(i+-1)*n+(j+-2)] * -0.0208333333333 +in[(i+-1)*n+(j+-1)] * -0.125 +in[(i+-1)*n+(j+0)] * -0.125 - +in[(i+-1)*n+(j+2)] * 0.020833333333333332 - +in[(i+0)*n+(j+-2)] * -0.020833333333333332 + +in[(i+-1)*n+(j+2)] * 0.0208333333333 + +in[(i+0)*n+(j+-2)] * -0.0208333333333 +in[(i+0)*n+(j+-1)] * -0.125 +in[(i+0)*n+(j+1)] * 0.125 - +in[(i+0)*n+(j+2)] * 0.020833333333333332 - +in[(i+1)*n+(j+-2)] * -0.020833333333333332 + +in[(i+0)*n+(j+2)] * 0.0208333333333 + 
+in[(i+1)*n+(j+-2)] * -0.0208333333333 +in[(i+1)*n+(j+0)] * 0.125 +in[(i+1)*n+(j+1)] * 0.125 - +in[(i+1)*n+(j+2)] * 0.020833333333333332 - +in[(i+2)*n+(j+-1)] * 0.020833333333333332 - +in[(i+2)*n+(j+0)] * 0.020833333333333332 - +in[(i+2)*n+(j+1)] * 0.020833333333333332 + +in[(i+1)*n+(j+2)] * 0.0208333333333 + +in[(i+2)*n+(j+-1)] * 0.0208333333333 + +in[(i+2)*n+(j+0)] * 0.0208333333333 + +in[(i+2)*n+(j+1)] * 0.0208333333333 +in[(i+2)*n+(j+2)] * 0.0625 ; }); @@ -138,48 +138,48 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.027777777777777776 - +in[(i+-3)*n+(j+-2)] * -0.005555555555555556 - +in[(i+-3)*n+(j+-1)] * -0.005555555555555556 - +in[(i+-3)*n+(j+0)] * -0.005555555555555556 - +in[(i+-3)*n+(j+1)] * -0.005555555555555556 - +in[(i+-3)*n+(j+2)] * -0.005555555555555556 - +in[(i+-2)*n+(j+-3)] * -0.005555555555555556 - +in[(i+-2)*n+(j+-2)] * -0.041666666666666664 - +in[(i+-2)*n+(j+-1)] * -0.013888888888888888 - +in[(i+-2)*n+(j+0)] * -0.013888888888888888 - +in[(i+-2)*n+(j+1)] * -0.013888888888888888 - +in[(i+-2)*n+(j+3)] * 0.005555555555555556 - +in[(i+-1)*n+(j+-3)] * -0.005555555555555556 - +in[(i+-1)*n+(j+-2)] * -0.013888888888888888 - +in[(i+-1)*n+(j+-1)] * -0.08333333333333333 - +in[(i+-1)*n+(j+0)] * -0.08333333333333333 - +in[(i+-1)*n+(j+2)] * 0.013888888888888888 - +in[(i+-1)*n+(j+3)] * 0.005555555555555556 - +in[(i+0)*n+(j+-3)] * -0.005555555555555556 - +in[(i+0)*n+(j+-2)] * -0.013888888888888888 - +in[(i+0)*n+(j+-1)] * -0.08333333333333333 - +in[(i+0)*n+(j+1)] * 0.08333333333333333 - +in[(i+0)*n+(j+2)] * 0.013888888888888888 - +in[(i+0)*n+(j+3)] * 0.005555555555555556 - +in[(i+1)*n+(j+-3)] * -0.005555555555555556 - +in[(i+1)*n+(j+-2)] * -0.013888888888888888 - +in[(i+1)*n+(j+0)] * 0.08333333333333333 - 
+in[(i+1)*n+(j+1)] * 0.08333333333333333 - +in[(i+1)*n+(j+2)] * 0.013888888888888888 - +in[(i+1)*n+(j+3)] * 0.005555555555555556 - +in[(i+2)*n+(j+-3)] * -0.005555555555555556 - +in[(i+2)*n+(j+-1)] * 0.013888888888888888 - +in[(i+2)*n+(j+0)] * 0.013888888888888888 - +in[(i+2)*n+(j+1)] * 0.013888888888888888 - +in[(i+2)*n+(j+2)] * 0.041666666666666664 - +in[(i+2)*n+(j+3)] * 0.005555555555555556 - +in[(i+3)*n+(j+-2)] * 0.005555555555555556 - +in[(i+3)*n+(j+-1)] * 0.005555555555555556 - +in[(i+3)*n+(j+0)] * 0.005555555555555556 - +in[(i+3)*n+(j+1)] * 0.005555555555555556 - +in[(i+3)*n+(j+2)] * 0.005555555555555556 - +in[(i+3)*n+(j+3)] * 0.027777777777777776 + out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778 + +in[(i+-3)*n+(j+-2)] * -0.00555555555556 + +in[(i+-3)*n+(j+-1)] * -0.00555555555556 + +in[(i+-3)*n+(j+0)] * -0.00555555555556 + +in[(i+-3)*n+(j+1)] * -0.00555555555556 + +in[(i+-3)*n+(j+2)] * -0.00555555555556 + +in[(i+-2)*n+(j+-3)] * -0.00555555555556 + +in[(i+-2)*n+(j+-2)] * -0.0416666666667 + +in[(i+-2)*n+(j+-1)] * -0.0138888888889 + +in[(i+-2)*n+(j+0)] * -0.0138888888889 + +in[(i+-2)*n+(j+1)] * -0.0138888888889 + +in[(i+-2)*n+(j+3)] * 0.00555555555556 + +in[(i+-1)*n+(j+-3)] * -0.00555555555556 + +in[(i+-1)*n+(j+-2)] * -0.0138888888889 + +in[(i+-1)*n+(j+-1)] * -0.0833333333333 + +in[(i+-1)*n+(j+0)] * -0.0833333333333 + +in[(i+-1)*n+(j+2)] * 0.0138888888889 + +in[(i+-1)*n+(j+3)] * 0.00555555555556 + +in[(i+0)*n+(j+-3)] * -0.00555555555556 + +in[(i+0)*n+(j+-2)] * -0.0138888888889 + +in[(i+0)*n+(j+-1)] * -0.0833333333333 + +in[(i+0)*n+(j+1)] * 0.0833333333333 + +in[(i+0)*n+(j+2)] * 0.0138888888889 + +in[(i+0)*n+(j+3)] * 0.00555555555556 + +in[(i+1)*n+(j+-3)] * -0.00555555555556 + +in[(i+1)*n+(j+-2)] * -0.0138888888889 + +in[(i+1)*n+(j+0)] * 0.0833333333333 + +in[(i+1)*n+(j+1)] * 0.0833333333333 + +in[(i+1)*n+(j+2)] * 0.0138888888889 + +in[(i+1)*n+(j+3)] * 0.00555555555556 + +in[(i+2)*n+(j+-3)] * -0.00555555555556 + +in[(i+2)*n+(j+-1)] * 0.0138888888889 + 
+in[(i+2)*n+(j+0)] * 0.0138888888889 + +in[(i+2)*n+(j+1)] * 0.0138888888889 + +in[(i+2)*n+(j+2)] * 0.0416666666667 + +in[(i+2)*n+(j+3)] * 0.00555555555556 + +in[(i+3)*n+(j+-2)] * 0.00555555555556 + +in[(i+3)*n+(j+-1)] * 0.00555555555556 + +in[(i+3)*n+(j+0)] * 0.00555555555556 + +in[(i+3)*n+(j+1)] * 0.00555555555556 + +in[(i+3)*n+(j+2)] * 0.00555555555556 + +in[(i+3)*n+(j+3)] * 0.0277777777778 ; }); }); @@ -189,76 +189,76 @@ void grid4(const int n, const int t, std::vector & in, std::vector(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) { out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625 - +in[(i+-4)*n+(j+-3)] * -0.002232142857142857 - +in[(i+-4)*n+(j+-2)] * -0.002232142857142857 - +in[(i+-4)*n+(j+-1)] * -0.002232142857142857 - +in[(i+-4)*n+(j+0)] * -0.002232142857142857 - +in[(i+-4)*n+(j+1)] * -0.002232142857142857 - +in[(i+-4)*n+(j+2)] * -0.002232142857142857 - +in[(i+-4)*n+(j+3)] * -0.002232142857142857 - +in[(i+-3)*n+(j+-4)] * -0.002232142857142857 - +in[(i+-3)*n+(j+-3)] * -0.020833333333333332 - +in[(i+-3)*n+(j+-2)] * -0.004166666666666667 - +in[(i+-3)*n+(j+-1)] * -0.004166666666666667 - +in[(i+-3)*n+(j+0)] * -0.004166666666666667 - +in[(i+-3)*n+(j+1)] * -0.004166666666666667 - +in[(i+-3)*n+(j+2)] * -0.004166666666666667 - +in[(i+-3)*n+(j+4)] * 0.002232142857142857 - +in[(i+-2)*n+(j+-4)] * -0.002232142857142857 - +in[(i+-2)*n+(j+-3)] * -0.004166666666666667 + +in[(i+-4)*n+(j+-3)] * -0.00223214285714 + +in[(i+-4)*n+(j+-2)] * -0.00223214285714 + +in[(i+-4)*n+(j+-1)] * -0.00223214285714 + +in[(i+-4)*n+(j+0)] * -0.00223214285714 + +in[(i+-4)*n+(j+1)] * -0.00223214285714 + +in[(i+-4)*n+(j+2)] * -0.00223214285714 + +in[(i+-4)*n+(j+3)] * -0.00223214285714 + +in[(i+-3)*n+(j+-4)] * -0.00223214285714 + +in[(i+-3)*n+(j+-3)] * -0.0208333333333 + +in[(i+-3)*n+(j+-2)] * -0.00416666666667 + +in[(i+-3)*n+(j+-1)] * -0.00416666666667 + +in[(i+-3)*n+(j+0)] * -0.00416666666667 + 
+in[(i+-3)*n+(j+1)] * -0.00416666666667 + +in[(i+-3)*n+(j+2)] * -0.00416666666667 + +in[(i+-3)*n+(j+4)] * 0.00223214285714 + +in[(i+-2)*n+(j+-4)] * -0.00223214285714 + +in[(i+-2)*n+(j+-3)] * -0.00416666666667 +in[(i+-2)*n+(j+-2)] * -0.03125 - +in[(i+-2)*n+(j+-1)] * -0.010416666666666666 - +in[(i+-2)*n+(j+0)] * -0.010416666666666666 - +in[(i+-2)*n+(j+1)] * -0.010416666666666666 - +in[(i+-2)*n+(j+3)] * 0.004166666666666667 - +in[(i+-2)*n+(j+4)] * 0.002232142857142857 - +in[(i+-1)*n+(j+-4)] * -0.002232142857142857 - +in[(i+-1)*n+(j+-3)] * -0.004166666666666667 - +in[(i+-1)*n+(j+-2)] * -0.010416666666666666 + +in[(i+-2)*n+(j+-1)] * -0.0104166666667 + +in[(i+-2)*n+(j+0)] * -0.0104166666667 + +in[(i+-2)*n+(j+1)] * -0.0104166666667 + +in[(i+-2)*n+(j+3)] * 0.00416666666667 + +in[(i+-2)*n+(j+4)] * 0.00223214285714 + +in[(i+-1)*n+(j+-4)] * -0.00223214285714 + +in[(i+-1)*n+(j+-3)] * -0.00416666666667 + +in[(i+-1)*n+(j+-2)] * -0.0104166666667 +in[(i+-1)*n+(j+-1)] * -0.0625 +in[(i+-1)*n+(j+0)] * -0.0625 - +in[(i+-1)*n+(j+2)] * 0.010416666666666666 - +in[(i+-1)*n+(j+3)] * 0.004166666666666667 - +in[(i+-1)*n+(j+4)] * 0.002232142857142857 - +in[(i+0)*n+(j+-4)] * -0.002232142857142857 - +in[(i+0)*n+(j+-3)] * -0.004166666666666667 - +in[(i+0)*n+(j+-2)] * -0.010416666666666666 + +in[(i+-1)*n+(j+2)] * 0.0104166666667 + +in[(i+-1)*n+(j+3)] * 0.00416666666667 + +in[(i+-1)*n+(j+4)] * 0.00223214285714 + +in[(i+0)*n+(j+-4)] * -0.00223214285714 + +in[(i+0)*n+(j+-3)] * -0.00416666666667 + +in[(i+0)*n+(j+-2)] * -0.0104166666667 +in[(i+0)*n+(j+-1)] * -0.0625 +in[(i+0)*n+(j+1)] * 0.0625 - +in[(i+0)*n+(j+2)] * 0.010416666666666666 - +in[(i+0)*n+(j+3)] * 0.004166666666666667 - +in[(i+0)*n+(j+4)] * 0.002232142857142857 - +in[(i+1)*n+(j+-4)] * -0.002232142857142857 - +in[(i+1)*n+(j+-3)] * -0.004166666666666667 - +in[(i+1)*n+(j+-2)] * -0.010416666666666666 + +in[(i+0)*n+(j+2)] * 0.0104166666667 + +in[(i+0)*n+(j+3)] * 0.00416666666667 + +in[(i+0)*n+(j+4)] * 0.00223214285714 + +in[(i+1)*n+(j+-4)] * 
-0.00223214285714 + +in[(i+1)*n+(j+-3)] * -0.00416666666667 + +in[(i+1)*n+(j+-2)] * -0.0104166666667 +in[(i+1)*n+(j+0)] * 0.0625 +in[(i+1)*n+(j+1)] * 0.0625 - +in[(i+1)*n+(j+2)] * 0.010416666666666666 - +in[(i+1)*n+(j+3)] * 0.004166666666666667 - +in[(i+1)*n+(j+4)] * 0.002232142857142857 - +in[(i+2)*n+(j+-4)] * -0.002232142857142857 - +in[(i+2)*n+(j+-3)] * -0.004166666666666667 - +in[(i+2)*n+(j+-1)] * 0.010416666666666666 - +in[(i+2)*n+(j+0)] * 0.010416666666666666 - +in[(i+2)*n+(j+1)] * 0.010416666666666666 + +in[(i+1)*n+(j+2)] * 0.0104166666667 + +in[(i+1)*n+(j+3)] * 0.00416666666667 + +in[(i+1)*n+(j+4)] * 0.00223214285714 + +in[(i+2)*n+(j+-4)] * -0.00223214285714 + +in[(i+2)*n+(j+-3)] * -0.00416666666667 + +in[(i+2)*n+(j+-1)] * 0.0104166666667 + +in[(i+2)*n+(j+0)] * 0.0104166666667 + +in[(i+2)*n+(j+1)] * 0.0104166666667 +in[(i+2)*n+(j+2)] * 0.03125 - +in[(i+2)*n+(j+3)] * 0.004166666666666667 - +in[(i+2)*n+(j+4)] * 0.002232142857142857 - +in[(i+3)*n+(j+-4)] * -0.002232142857142857 - +in[(i+3)*n+(j+-2)] * 0.004166666666666667 - +in[(i+3)*n+(j+-1)] * 0.004166666666666667 - +in[(i+3)*n+(j+0)] * 0.004166666666666667 - +in[(i+3)*n+(j+1)] * 0.004166666666666667 - +in[(i+3)*n+(j+2)] * 0.004166666666666667 - +in[(i+3)*n+(j+3)] * 0.020833333333333332 - +in[(i+3)*n+(j+4)] * 0.002232142857142857 - +in[(i+4)*n+(j+-3)] * 0.002232142857142857 - +in[(i+4)*n+(j+-2)] * 0.002232142857142857 - +in[(i+4)*n+(j+-1)] * 0.002232142857142857 - +in[(i+4)*n+(j+0)] * 0.002232142857142857 - +in[(i+4)*n+(j+1)] * 0.002232142857142857 - +in[(i+4)*n+(j+2)] * 0.002232142857142857 - +in[(i+4)*n+(j+3)] * 0.002232142857142857 + +in[(i+2)*n+(j+3)] * 0.00416666666667 + +in[(i+2)*n+(j+4)] * 0.00223214285714 + +in[(i+3)*n+(j+-4)] * -0.00223214285714 + +in[(i+3)*n+(j+-2)] * 0.00416666666667 + +in[(i+3)*n+(j+-1)] * 0.00416666666667 + +in[(i+3)*n+(j+0)] * 0.00416666666667 + +in[(i+3)*n+(j+1)] * 0.00416666666667 + +in[(i+3)*n+(j+2)] * 0.00416666666667 + +in[(i+3)*n+(j+3)] * 0.0208333333333 + 
+in[(i+3)*n+(j+4)] * 0.00223214285714 + +in[(i+4)*n+(j+-3)] * 0.00223214285714 + +in[(i+4)*n+(j+-2)] * 0.00223214285714 + +in[(i+4)*n+(j+-1)] * 0.00223214285714 + +in[(i+4)*n+(j+0)] * 0.00223214285714 + +in[(i+4)*n+(j+1)] * 0.00223214285714 + +in[(i+4)*n+(j+2)] * 0.00223214285714 + +in[(i+4)*n+(j+3)] * 0.00223214285714 +in[(i+4)*n+(j+4)] * 0.015625 ; }); @@ -269,114 +269,114 @@ void grid5(const int n, const int t, std::vector & in, std::vector(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) { out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01 - +in[(i+-5)*n+(j+-4)] * -0.0011111111111111111 - +in[(i+-5)*n+(j+-3)] * -0.0011111111111111111 - +in[(i+-5)*n+(j+-2)] * -0.0011111111111111111 - +in[(i+-5)*n+(j+-1)] * -0.0011111111111111111 - +in[(i+-5)*n+(j+0)] * -0.0011111111111111111 - +in[(i+-5)*n+(j+1)] * -0.0011111111111111111 - +in[(i+-5)*n+(j+2)] * -0.0011111111111111111 - +in[(i+-5)*n+(j+3)] * -0.0011111111111111111 - +in[(i+-5)*n+(j+4)] * -0.0011111111111111111 - +in[(i+-4)*n+(j+-5)] * -0.0011111111111111111 + +in[(i+-5)*n+(j+-4)] * -0.00111111111111 + +in[(i+-5)*n+(j+-3)] * -0.00111111111111 + +in[(i+-5)*n+(j+-2)] * -0.00111111111111 + +in[(i+-5)*n+(j+-1)] * -0.00111111111111 + +in[(i+-5)*n+(j+0)] * -0.00111111111111 + +in[(i+-5)*n+(j+1)] * -0.00111111111111 + +in[(i+-5)*n+(j+2)] * -0.00111111111111 + +in[(i+-5)*n+(j+3)] * -0.00111111111111 + +in[(i+-5)*n+(j+4)] * -0.00111111111111 + +in[(i+-4)*n+(j+-5)] * -0.00111111111111 +in[(i+-4)*n+(j+-4)] * -0.0125 - +in[(i+-4)*n+(j+-3)] * -0.0017857142857142857 - +in[(i+-4)*n+(j+-2)] * -0.0017857142857142857 - +in[(i+-4)*n+(j+-1)] * -0.0017857142857142857 - +in[(i+-4)*n+(j+0)] * -0.0017857142857142857 - +in[(i+-4)*n+(j+1)] * -0.0017857142857142857 - +in[(i+-4)*n+(j+2)] * -0.0017857142857142857 - +in[(i+-4)*n+(j+3)] * -0.0017857142857142857 - +in[(i+-4)*n+(j+5)] * 0.0011111111111111111 - +in[(i+-3)*n+(j+-5)] * 
-0.0011111111111111111 - +in[(i+-3)*n+(j+-4)] * -0.0017857142857142857 - +in[(i+-3)*n+(j+-3)] * -0.016666666666666666 - +in[(i+-3)*n+(j+-2)] * -0.0033333333333333335 - +in[(i+-3)*n+(j+-1)] * -0.0033333333333333335 - +in[(i+-3)*n+(j+0)] * -0.0033333333333333335 - +in[(i+-3)*n+(j+1)] * -0.0033333333333333335 - +in[(i+-3)*n+(j+2)] * -0.0033333333333333335 - +in[(i+-3)*n+(j+4)] * 0.0017857142857142857 - +in[(i+-3)*n+(j+5)] * 0.0011111111111111111 - +in[(i+-2)*n+(j+-5)] * -0.0011111111111111111 - +in[(i+-2)*n+(j+-4)] * -0.0017857142857142857 - +in[(i+-2)*n+(j+-3)] * -0.0033333333333333335 + +in[(i+-4)*n+(j+-3)] * -0.00178571428571 + +in[(i+-4)*n+(j+-2)] * -0.00178571428571 + +in[(i+-4)*n+(j+-1)] * -0.00178571428571 + +in[(i+-4)*n+(j+0)] * -0.00178571428571 + +in[(i+-4)*n+(j+1)] * -0.00178571428571 + +in[(i+-4)*n+(j+2)] * -0.00178571428571 + +in[(i+-4)*n+(j+3)] * -0.00178571428571 + +in[(i+-4)*n+(j+5)] * 0.00111111111111 + +in[(i+-3)*n+(j+-5)] * -0.00111111111111 + +in[(i+-3)*n+(j+-4)] * -0.00178571428571 + +in[(i+-3)*n+(j+-3)] * -0.0166666666667 + +in[(i+-3)*n+(j+-2)] * -0.00333333333333 + +in[(i+-3)*n+(j+-1)] * -0.00333333333333 + +in[(i+-3)*n+(j+0)] * -0.00333333333333 + +in[(i+-3)*n+(j+1)] * -0.00333333333333 + +in[(i+-3)*n+(j+2)] * -0.00333333333333 + +in[(i+-3)*n+(j+4)] * 0.00178571428571 + +in[(i+-3)*n+(j+5)] * 0.00111111111111 + +in[(i+-2)*n+(j+-5)] * -0.00111111111111 + +in[(i+-2)*n+(j+-4)] * -0.00178571428571 + +in[(i+-2)*n+(j+-3)] * -0.00333333333333 +in[(i+-2)*n+(j+-2)] * -0.025 - +in[(i+-2)*n+(j+-1)] * -0.008333333333333333 - +in[(i+-2)*n+(j+0)] * -0.008333333333333333 - +in[(i+-2)*n+(j+1)] * -0.008333333333333333 - +in[(i+-2)*n+(j+3)] * 0.0033333333333333335 - +in[(i+-2)*n+(j+4)] * 0.0017857142857142857 - +in[(i+-2)*n+(j+5)] * 0.0011111111111111111 - +in[(i+-1)*n+(j+-5)] * -0.0011111111111111111 - +in[(i+-1)*n+(j+-4)] * -0.0017857142857142857 - +in[(i+-1)*n+(j+-3)] * -0.0033333333333333335 - +in[(i+-1)*n+(j+-2)] * -0.008333333333333333 + 
+in[(i+-2)*n+(j+-1)] * -0.00833333333333 + +in[(i+-2)*n+(j+0)] * -0.00833333333333 + +in[(i+-2)*n+(j+1)] * -0.00833333333333 + +in[(i+-2)*n+(j+3)] * 0.00333333333333 + +in[(i+-2)*n+(j+4)] * 0.00178571428571 + +in[(i+-2)*n+(j+5)] * 0.00111111111111 + +in[(i+-1)*n+(j+-5)] * -0.00111111111111 + +in[(i+-1)*n+(j+-4)] * -0.00178571428571 + +in[(i+-1)*n+(j+-3)] * -0.00333333333333 + +in[(i+-1)*n+(j+-2)] * -0.00833333333333 +in[(i+-1)*n+(j+-1)] * -0.05 +in[(i+-1)*n+(j+0)] * -0.05 - +in[(i+-1)*n+(j+2)] * 0.008333333333333333 - +in[(i+-1)*n+(j+3)] * 0.0033333333333333335 - +in[(i+-1)*n+(j+4)] * 0.0017857142857142857 - +in[(i+-1)*n+(j+5)] * 0.0011111111111111111 - +in[(i+0)*n+(j+-5)] * -0.0011111111111111111 - +in[(i+0)*n+(j+-4)] * -0.0017857142857142857 - +in[(i+0)*n+(j+-3)] * -0.0033333333333333335 - +in[(i+0)*n+(j+-2)] * -0.008333333333333333 + +in[(i+-1)*n+(j+2)] * 0.00833333333333 + +in[(i+-1)*n+(j+3)] * 0.00333333333333 + +in[(i+-1)*n+(j+4)] * 0.00178571428571 + +in[(i+-1)*n+(j+5)] * 0.00111111111111 + +in[(i+0)*n+(j+-5)] * -0.00111111111111 + +in[(i+0)*n+(j+-4)] * -0.00178571428571 + +in[(i+0)*n+(j+-3)] * -0.00333333333333 + +in[(i+0)*n+(j+-2)] * -0.00833333333333 +in[(i+0)*n+(j+-1)] * -0.05 +in[(i+0)*n+(j+1)] * 0.05 - +in[(i+0)*n+(j+2)] * 0.008333333333333333 - +in[(i+0)*n+(j+3)] * 0.0033333333333333335 - +in[(i+0)*n+(j+4)] * 0.0017857142857142857 - +in[(i+0)*n+(j+5)] * 0.0011111111111111111 - +in[(i+1)*n+(j+-5)] * -0.0011111111111111111 - +in[(i+1)*n+(j+-4)] * -0.0017857142857142857 - +in[(i+1)*n+(j+-3)] * -0.0033333333333333335 - +in[(i+1)*n+(j+-2)] * -0.008333333333333333 + +in[(i+0)*n+(j+2)] * 0.00833333333333 + +in[(i+0)*n+(j+3)] * 0.00333333333333 + +in[(i+0)*n+(j+4)] * 0.00178571428571 + +in[(i+0)*n+(j+5)] * 0.00111111111111 + +in[(i+1)*n+(j+-5)] * -0.00111111111111 + +in[(i+1)*n+(j+-4)] * -0.00178571428571 + +in[(i+1)*n+(j+-3)] * -0.00333333333333 + +in[(i+1)*n+(j+-2)] * -0.00833333333333 +in[(i+1)*n+(j+0)] * 0.05 +in[(i+1)*n+(j+1)] * 0.05 - +in[(i+1)*n+(j+2)] 
* 0.008333333333333333 - +in[(i+1)*n+(j+3)] * 0.0033333333333333335 - +in[(i+1)*n+(j+4)] * 0.0017857142857142857 - +in[(i+1)*n+(j+5)] * 0.0011111111111111111 - +in[(i+2)*n+(j+-5)] * -0.0011111111111111111 - +in[(i+2)*n+(j+-4)] * -0.0017857142857142857 - +in[(i+2)*n+(j+-3)] * -0.0033333333333333335 - +in[(i+2)*n+(j+-1)] * 0.008333333333333333 - +in[(i+2)*n+(j+0)] * 0.008333333333333333 - +in[(i+2)*n+(j+1)] * 0.008333333333333333 + +in[(i+1)*n+(j+2)] * 0.00833333333333 + +in[(i+1)*n+(j+3)] * 0.00333333333333 + +in[(i+1)*n+(j+4)] * 0.00178571428571 + +in[(i+1)*n+(j+5)] * 0.00111111111111 + +in[(i+2)*n+(j+-5)] * -0.00111111111111 + +in[(i+2)*n+(j+-4)] * -0.00178571428571 + +in[(i+2)*n+(j+-3)] * -0.00333333333333 + +in[(i+2)*n+(j+-1)] * 0.00833333333333 + +in[(i+2)*n+(j+0)] * 0.00833333333333 + +in[(i+2)*n+(j+1)] * 0.00833333333333 +in[(i+2)*n+(j+2)] * 0.025 - +in[(i+2)*n+(j+3)] * 0.0033333333333333335 - +in[(i+2)*n+(j+4)] * 0.0017857142857142857 - +in[(i+2)*n+(j+5)] * 0.0011111111111111111 - +in[(i+3)*n+(j+-5)] * -0.0011111111111111111 - +in[(i+3)*n+(j+-4)] * -0.0017857142857142857 - +in[(i+3)*n+(j+-2)] * 0.0033333333333333335 - +in[(i+3)*n+(j+-1)] * 0.0033333333333333335 - +in[(i+3)*n+(j+0)] * 0.0033333333333333335 - +in[(i+3)*n+(j+1)] * 0.0033333333333333335 - +in[(i+3)*n+(j+2)] * 0.0033333333333333335 - +in[(i+3)*n+(j+3)] * 0.016666666666666666 - +in[(i+3)*n+(j+4)] * 0.0017857142857142857 - +in[(i+3)*n+(j+5)] * 0.0011111111111111111 - +in[(i+4)*n+(j+-5)] * -0.0011111111111111111 - +in[(i+4)*n+(j+-3)] * 0.0017857142857142857 - +in[(i+4)*n+(j+-2)] * 0.0017857142857142857 - +in[(i+4)*n+(j+-1)] * 0.0017857142857142857 - +in[(i+4)*n+(j+0)] * 0.0017857142857142857 - +in[(i+4)*n+(j+1)] * 0.0017857142857142857 - +in[(i+4)*n+(j+2)] * 0.0017857142857142857 - +in[(i+4)*n+(j+3)] * 0.0017857142857142857 + +in[(i+2)*n+(j+3)] * 0.00333333333333 + +in[(i+2)*n+(j+4)] * 0.00178571428571 + +in[(i+2)*n+(j+5)] * 0.00111111111111 + +in[(i+3)*n+(j+-5)] * -0.00111111111111 + 
+in[(i+3)*n+(j+-4)] * -0.00178571428571 + +in[(i+3)*n+(j+-2)] * 0.00333333333333 + +in[(i+3)*n+(j+-1)] * 0.00333333333333 + +in[(i+3)*n+(j+0)] * 0.00333333333333 + +in[(i+3)*n+(j+1)] * 0.00333333333333 + +in[(i+3)*n+(j+2)] * 0.00333333333333 + +in[(i+3)*n+(j+3)] * 0.0166666666667 + +in[(i+3)*n+(j+4)] * 0.00178571428571 + +in[(i+3)*n+(j+5)] * 0.00111111111111 + +in[(i+4)*n+(j+-5)] * -0.00111111111111 + +in[(i+4)*n+(j+-3)] * 0.00178571428571 + +in[(i+4)*n+(j+-2)] * 0.00178571428571 + +in[(i+4)*n+(j+-1)] * 0.00178571428571 + +in[(i+4)*n+(j+0)] * 0.00178571428571 + +in[(i+4)*n+(j+1)] * 0.00178571428571 + +in[(i+4)*n+(j+2)] * 0.00178571428571 + +in[(i+4)*n+(j+3)] * 0.00178571428571 +in[(i+4)*n+(j+4)] * 0.0125 - +in[(i+4)*n+(j+5)] * 0.0011111111111111111 - +in[(i+5)*n+(j+-4)] * 0.0011111111111111111 - +in[(i+5)*n+(j+-3)] * 0.0011111111111111111 - +in[(i+5)*n+(j+-2)] * 0.0011111111111111111 - +in[(i+5)*n+(j+-1)] * 0.0011111111111111111 - +in[(i+5)*n+(j+0)] * 0.0011111111111111111 - +in[(i+5)*n+(j+1)] * 0.0011111111111111111 - +in[(i+5)*n+(j+2)] * 0.0011111111111111111 - +in[(i+5)*n+(j+3)] * 0.0011111111111111111 - +in[(i+5)*n+(j+4)] * 0.0011111111111111111 + +in[(i+4)*n+(j+5)] * 0.00111111111111 + +in[(i+5)*n+(j+-4)] * 0.00111111111111 + +in[(i+5)*n+(j+-3)] * 0.00111111111111 + +in[(i+5)*n+(j+-2)] * 0.00111111111111 + +in[(i+5)*n+(j+-1)] * 0.00111111111111 + +in[(i+5)*n+(j+0)] * 0.00111111111111 + +in[(i+5)*n+(j+1)] * 0.00111111111111 + +in[(i+5)*n+(j+2)] * 0.00111111111111 + +in[(i+5)*n+(j+3)] * 0.00111111111111 + +in[(i+5)*n+(j+4)] * 0.00111111111111 +in[(i+5)*n+(j+5)] * 0.01 ; }); diff --git a/Cxx11/stencil_rangefor.hpp b/Cxx11/stencil_rangefor.hpp index e9580e1fa..b6bf57581 100644 --- a/Cxx11/stencil_rangefor.hpp +++ b/Cxx11/stencil_rangefor.hpp @@ -37,18 +37,18 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, 
std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector matrix; + // column-major 2D array + //typedef Kokkos::View matrix; + // default 2D array + //typedef Kokkos::View matrix; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + int tile_size; + bool permute = false; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = (argc>3) ? std::atoi(argv[3]) : 32; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + + auto permute_input = (argc>4) ? 
std::atoi(argv[4]) : 0; + if (permute_input != 0 && permute_input != 1) { + throw "ERROR: permute must be 0 (no) or 1 (yes)"; + } + permute = (permute_input == 1); + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } - typedef Kokkos::TeamPolicy<> team_policy ; - typedef Kokkos::TeamPolicy<>::member_type member_type ; - - // row-major 2D array - typedef Kokkos::View matrix; - // column-major 2D array - //typedef Kokkos::View matrix; - // default 2D array - //typedef Kokkos::View matrix; - - ////////////////////////////////////////////////////////////////////// - /// Read and test input parameters - ////////////////////////////////////////////////////////////////////// - - int iterations; - int order; - try { - if (argc < 3) { - throw "Usage: <# iterations> "; - } - - // number of times to do the transpose - iterations = std::atoi(argv[1]); - if (iterations < 1) { - throw "ERROR: iterations must be >= 1"; - } + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Permute loops = " << (permute ? 
"yes" : "no") << std::endl; + std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl; - // order of a the matrix - order = std::atoi(argv[2]); - if (order <= 0) { - throw "ERROR: Matrix Order must be greater than 0"; - } else if (order > std::floor(std::sqrt(INT_MAX))) { - throw "ERROR: matrix dimension too large - overflow risk"; - } - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Matrix order = " << order << std::endl; - std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl; + matrix A("A", order, order); + matrix B("B", order, order); - ////////////////////////////////////////////////////////////////////// - /// Allocate space for the input and transpose matrix - ////////////////////////////////////////////////////////////////////// + auto order2 = {order,order}; + auto tile2 = {tile_size,tile_size}; - matrix A("A", order, order); - matrix B("B", order, order); + auto policy = Kokkos::MDRangePolicy>({0,0},order2,tile2); + typedef Kokkos::Rank<2,Kokkos::Iterate::Right,Kokkos::Iterate::Left > rl; + typedef Kokkos::Rank<2,Kokkos::Iterate::Left, Kokkos::Iterate::Right> lr; + auto policy_lr = Kokkos::MDRangePolicy({0,0},order2,tile2); + auto policy_rl = Kokkos::MDRangePolicy({0,0},order2,tile2); -#if 0 - Kokkos::parallel_for ( order, KOKKOS_LAMBDA(const int i) { - for (auto j=0; j(i*order+j); B(i,j) = 0.0; - } - }); -#else - Kokkos::parallel_for( team_policy(order, Kokkos::AUTO), KOKKOS_LAMBDA(const member_type& teamMember) { - const int i = teamMember.league_rank(); - Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, order), [&](const int j) { - A(i,j) = 
static_cast(i*order+j); - B(i,j) = 0.0; }); - }); -#endif - auto trans_time = 0.0; + double trans_time(0); - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; ++iter) { - if (iter==1) trans_time = prk::wtime(); + if (iter==1) trans_time = prk::wtime(); -#if 0 - Kokkos::parallel_for ( order, KOKKOS_LAMBDA(const int i) { - for (auto j=0; j(ij)*(1.+iterations)+addit; - inner += std::fabs(B(j,i) - reference); - }, temp); - Kokkos::single( Kokkos::PerTeam( teamMember ), [&] () { - update += temp; - }); - }, abserr); + trans_time = prk::wtime() - trans_time; + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double const addit = (iterations+1.) * (0.5*iterations); + double abserr(0); + Kokkos::parallel_reduce(policy, KOKKOS_LAMBDA(int i, int j, double & update) { + size_t const ij = i*order+j; + double const reference = static_cast(ij)*(1.+iterations)+addit; + update += std::fabs(B(j,i) - reference); + }, abserr); #ifdef VERBOSE - std::cout << "Sum of absolute differences: " << abserr << std::endl; + std::cout << "Sum of absolute differences: " << abserr << std::endl; #endif - const auto epsilon = 1.0e-8; - if (abserr < epsilon) { - std::cout << "Solution validates" << std::endl; - auto avgtime = trans_time/iterations; - auto bytes = (size_t)order * (size_t)order * sizeof(double); - std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime - << " Avg time (s): " << avgtime << std::endl; - } else { - std::cout << "ERROR: Aggregate squared error " << abserr - << " exceeds threshold " << epsilon << std::endl; - return 1; - } + double epsilon(1.0e-8); + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + auto bytes = (size_t)order * (size_t)order * sizeof(double); + std::cout << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime + << " Avg time 
(s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + } Kokkos::finalize(); return 0; From 3dde8a27e10f554a1c56ccf197dcb4f66a1b4b49 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 25 Apr 2018 19:49:37 -0400 Subject: [PATCH 079/245] Sycl remove boost (#335) * remove boost dependency from sycl nstream * remove boost dependency from sycl transpose * cleanup sycl like kokkos * bug fix range replacement * initialize simply to avoid issues --- Cxx11/Makefile | 2 +- Cxx11/nstream-sycl.cc | 24 ++++++++---------------- Cxx11/stencil-sycl.cc | 2 -- Cxx11/transpose-sycl.cc | 26 +++++++++++--------------- travis/install-raja.sh | 3 ++- 5 files changed, 22 insertions(+), 35 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 5eb4b1526..c4b9b6ba8 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -35,7 +35,7 @@ TARGETFLAGS = $(OFFLOADFLAG) OPENCLFLAGS = $(OPENCLFLAG) # We do not yet handle all possible exceptions... 
#OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS -SYCLFLAGS = $(SYCLFLAG) $(BOOSTFLAG) +SYCLFLAGS = $(SYCLFLAG) -DUSE_2D_INDEXING=0 ORNLACCFLAGS = $(ORNLACCFLAG) TBBFLAGS = $(TBBFLAG) -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER) CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index b21c73593..c5d390341 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -111,21 +111,13 @@ int main(int argc, char * argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - auto nstream_time = 0.0; + double nstream_time(0); - std::vector h_A(length); - std::vector h_B(length); - std::vector h_C(length); + std::vector h_A(length,0); + std::vector h_B(length,2); + std::vector h_C(length,2); - auto range = boost::irange(static_cast(0), length); - - const double scalar(3); - - std::for_each( std::begin(range), std::end(range), [&] (size_t i) { - h_A[i] = 0; - h_B[i] = 2; - h_C[i] = 2; - }); + double const scalar(3); { // initialize device buffers from host buffers @@ -133,7 +125,7 @@ int main(int argc, char * argv[]) cl::sycl::buffer d_B { h_B.data(), h_B.size() }; cl::sycl::buffer d_C { h_C.data(), h_C.size() }; - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; ++iter) { if (iter==1) nstream_time = prk::wtime(); @@ -164,14 +156,14 @@ int main(int argc, char * argv[]) double ar(0); double br(2); double cr(2); - for (auto i=0; i<=iterations; i++) { + for (int i=0; i<=iterations; ++i) { ar += br + scalar * cr; } ar *= length; double asum(0); - for (size_t i=0; i h_A(order*order); std::vector h_B(order*order,0.0); @@ -115,7 +113,7 @@ int main(int argc, char * argv[]) cl::sycl::buffer d_B { h_B.data(), h_B.size() }; #endif - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; ++iter) { if (iter==1) trans_time = prk::wtime(); @@ -151,16 +149,14 @@ int main(int argc, char * argv[]) /// Analyze and 
output results ////////////////////////////////////////////////////////////////////// - auto range = boost::irange(static_cast(0),order); - // TODO: replace with std::generate, std::accumulate, or similar - const auto addit = (iterations+1.) * (iterations/2.); - auto abserr = 0.0; - for (auto i : range) { - for (auto j : range) { - const int ij = i*order+j; - const int ji = j*order+i; - const double reference = static_cast(ij)*(1.+iterations)+addit; + double const addit = (iterations+1.) * (iterations/2.); + double abserr(0); + for (size_t i=0; i(ij)*(1.+iterations)+addit; abserr += std::fabs(h_B[ji] - reference); } } @@ -169,12 +165,12 @@ int main(int argc, char * argv[]) std::cout << "Sum of absolute differences: " << abserr << std::endl; #endif - const auto epsilon = 1.0e-8; + double const epsilon(1.0e-8); if (abserr < epsilon) { std::cout << "Solution validates" << std::endl; auto avgtime = trans_time/iterations; auto bytes = (size_t)order * (size_t)order * sizeof(double); - std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + std::cout << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime << " Avg time (s): " << avgtime << std::endl; } else { std::cout << "ERROR: Aggregate squared error " << abserr diff --git a/travis/install-raja.sh b/travis/install-raja.sh index 114b9f2a5..fe633f5aa 100644 --- a/travis/install-raja.sh +++ b/travis/install-raja.sh @@ -40,7 +40,8 @@ esac ${PRK_CXX} -v if [ ! 
-d "$TRAVIS_ROOT/raja" ]; then - BRANCH=develop + #BRANCH=develop # forallN deprecated + BRANCH=master git clone --recursive --depth 1 -b ${BRANCH} https://github.com/LLNL/RAJA.git cd RAJA mkdir build From bb3afd897e704be1277b988f5c4bf6e952c21af3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 25 Apr 2018 19:49:53 -0400 Subject: [PATCH 080/245] use RAJA master branch (#336) - they have deprecated forallN in develop branch and i don't want to see the warnings about it - somebody committed a "here 2" debug message to develop branch that leads to excessive output in Travis From 9d5780cd224398f639013c4df9d9dacd71047480 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 27 Apr 2018 12:15:56 -0700 Subject: [PATCH 081/245] Use ranges ts (#337) * add Travis installer for Ranges TS * update makefile examples for ranges TS change * hide difference between Boost and TS ranges with prk::range in prk_util.h * finish ranges update: - replace boost::irange with prk::range everywhere - prk::range supports both contiguous and strided --- Cxx11/Makefile | 10 +++-- Cxx11/generate-cxx-stencil.py | 8 ++-- Cxx11/nstream-vector-pstl.cc | 2 +- Cxx11/nstream-vector-rangefor.cc | 2 +- Cxx11/p2p-hyperplane-vector-pstl.cc | 4 +- Cxx11/prk_util.h | 37 ++++++++++++++-- Cxx11/stencil-vector-pstl.cc | 4 +- Cxx11/stencil-vector-rangefor.cc | 4 +- Cxx11/stencil_pgnu.hpp | 20 ++++----- Cxx11/stencil_pstl.hpp | 20 ++++----- Cxx11/stencil_rangefor.hpp | 20 ++++----- Cxx11/stencil_stl.hpp | 20 ++++----- Cxx11/transpose-vector-pstl.cc | 2 +- Cxx11/transpose-vector-rangefor.cc | 16 +++---- common/make.defs.gcc | 20 +++++---- common/make.defs.intel | 14 ++++--- common/make.defs.llvm | 13 +++--- travis/build-run-prk.sh | 65 ++++++++++++++--------------- travis/install-deps.sh | 1 + travis/install-ranges.sh | 8 ++++ 20 files changed, 169 insertions(+), 121 deletions(-) create mode 100644 travis/install-ranges.sh diff --git a/Cxx11/Makefile b/Cxx11/Makefile index c4b9b6ba8..3a26ead0b 100644 --- 
a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -28,7 +28,8 @@ ifdef USE_PRK_KOKKOS_BACKEND KOKKOS_BACKEND_FLAG = -DPRK_KOKKOS_BACKEND=$(USE_PRK_KOKKOS_BACKEND) endif -ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm +#ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm +ASMFLAGS = -fverbose-asm OMPFLAGS = $(OPENMPFLAG) TARGETFLAGS = $(OFFLOADFLAG) @@ -40,8 +41,9 @@ ORNLACCFLAGS = $(ORNLACCFLAG) TBBFLAGS = $(TBBFLAG) -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER) CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG) BOOSTFLAGS = $(BOOSTFLAG) -STLFLAGS = $(STLFLAG) $(BOOSTFLAGS) -PSTLFLAGS = $(PSTLFLAG) $(BOOSTFLAGS) +RANGEFLAGS = -DUSE_RANGES $(RANGEFLAG) +STLFLAGS = $(STLFLAG) $(RANGEFLAGS) +PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) RAJAFLAGS = $(RAJAFLAG) KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) ORNLACCFLAGS = $(ORNLACCFLAG) @@ -159,7 +161,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(PSTLFLAGS) -o $@ %-rangefor: %-rangefor.cc prk_util.h - $(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -o $@ + $(CXX) $(CXXFLAGS) $< $(RANGEFLAGS) -o $@ %-boost-compute: %-boost-compute.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -DUSE_BOOST_COMPUTE $(OPENCLFLAGS) -o $@ diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index 39e66459a..b3b573887 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -29,24 +29,24 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' for (auto j='+str(radius)+'; j & in, std::vector & out) {\n') - src.write(' auto inside = boost::irange('+str(radius)+',n-'+str(radius)+');\n') + src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' for (auto i : inside) {\n') src.write(' PRAGMA_SIMD\n') src.write(' for (auto j : inside) {\n') elif (model=='stl'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') - src.write(' auto inside = 
boost::irange('+str(radius)+',n-'+str(radius)+');\n') + src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' std::for_each( std::begin(inside), std::end(inside), [&] (int i) {\n') #src.write(' PRAGMA_SIMD\n') src.write(' std::for_each( std::begin(inside), std::end(inside), [&] (int j) {\n') elif (model=='pgnu'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') - src.write(' auto inside = boost::irange('+str(radius)+',n-'+str(radius)+');\n') + src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {\n') src.write(' std::for_each( std::begin(inside), std::end(inside), [&] (int j) {\n') elif (model=='pstl'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') - src.write(' auto inside = boost::irange('+str(radius)+',n-'+str(radius)+');\n') + src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {\n') src.write(' std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {\n') elif (model=='raja'): diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc index dfe053ebf..dbc52aaf4 100644 --- a/Cxx11/nstream-vector-pstl.cc +++ b/Cxx11/nstream-vector-pstl.cc @@ -120,7 +120,7 @@ int main(int argc, char * argv[]) std::vector B(length); std::vector C(length); - auto range = boost::irange(static_cast(0), length); + auto range = prk::range(static_cast(0), length); double scalar(3); diff --git a/Cxx11/nstream-vector-rangefor.cc b/Cxx11/nstream-vector-rangefor.cc index 54bad9274..2bdadea3d 100644 --- a/Cxx11/nstream-vector-rangefor.cc +++ b/Cxx11/nstream-vector-rangefor.cc @@ -116,7 +116,7 @@ int main(int argc, char * argv[]) std::vector B(length,2.0); 
std::vector C(length,2.0); - auto range = boost::irange(static_cast(0), length); + auto range = prk::range(0,length); double scalar(3); diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc index 81b58d50c..91b0392f0 100644 --- a/Cxx11/p2p-hyperplane-vector-pstl.cc +++ b/Cxx11/p2p-hyperplane-vector-pstl.cc @@ -161,7 +161,7 @@ int main(int argc, char* argv[]) for (auto i=2; i<=2*n-2; i++) { const auto begin = std::max(2,i-n+2); const auto end = std::min(i,n)+1; - auto range = boost::irange(begin,end); + auto range = prk::range(begin,end); #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ @@ -179,7 +179,7 @@ int main(int argc, char* argv[]) for (int i=2; i<=2*(nb+1)-2; i++) { const auto begin = std::max(2,i-(nb+1)+2); const auto end = std::min(i,nb+1)+1; - auto range = boost::irange(begin,end); + auto range = prk::range(begin,end); #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 8bb718fe0..e1576e3a1 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -1,5 +1,5 @@ /// -/// Copyright (c) 2013, Intel Corporation +/// Copyright (c) 2018, Intel Corporation /// /// Redistribution and use in source and binary forms, with or without /// modification, are permitted provided that the following conditions @@ -180,8 +180,16 @@ const T prk_reduce(I first, I last, T init) { # endif #endif -#if defined(USE_BOOST) -# include "boost/range/irange.hpp" +#if defined(USE_RANGES) +# if defined(USE_BOOST_IRANGE) +# include "boost/range/irange.hpp" +# elif defined(USE_RANGES_TS) +# include "range/v3/view/iota.hpp" +# include "range/v3/view/slice.hpp" +# include 
"range/v3/view/stride.hpp" +# else +# error You have not provided a version of ranges to use. +# endif #endif #if defined(USE_BOOST_COMPUTE) @@ -248,6 +256,29 @@ namespace prk { return ( numerator / denominator + (numerator % denominator > 0) ); } + template + auto range(S start, E end) { +#if defined(USE_BOOST_IRANGE) + return boost::irange(static_cast(start), end); +#elif defined(USE_RANGES_TS) + return ranges::view::iota(static_cast(start), end); +#endif + } + + template + auto range(S start, E end, B blocking) { +#if defined(USE_BOOST_IRANGE) + return boost::irange(static_cast(start), end, decltype(end)>(blocking) ); +#elif defined(USE_RANGES_TS) + // NOTE: + // iota(s) | slice(s,e) | stride(b) is faster than + // iota(s,e) | stride(b) for some reason. + return ranges::view::iota(static_cast(start)) | + ranges::view::slice(static_cast(start), end) | + ranges::view::stride(static_cast(blocking)); +#endif + } + } // namespace prk #endif /* PRK_UTIL_H */ diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc index 863a50df5..8495032ca 100644 --- a/Cxx11/stencil-vector-pstl.cc +++ b/Cxx11/stencil-vector-pstl.cc @@ -180,7 +180,7 @@ int main(int argc, char* argv[]) std::vector out(n*n); // initialize the input and output arrays - auto range = boost::irange(0,n); + auto range = prk::range(0,n); #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (int i) { std::for_each( pstl::execution::unseq, std::begin(range), std::end(range), [&] (int j) { @@ -240,7 +240,7 @@ int main(int argc, char* argv[]) // compute L1 norm in parallel double norm = 0.0; - auto inside = boost::irange(radius,n-radius); + auto inside = prk::range(radius,n-radius); for (auto i : inside) { for (auto j : inside) { norm += std::fabs(out[i*n+j]); diff --git a/Cxx11/stencil-vector-rangefor.cc b/Cxx11/stencil-vector-rangefor.cc index aef3a3880..040bde745 100644 --- a/Cxx11/stencil-vector-rangefor.cc +++ 
b/Cxx11/stencil-vector-rangefor.cc @@ -168,7 +168,7 @@ int main(int argc, char* argv[]) std::vector out(n*n); // initialize the input and output arrays - auto range = boost::irange(0,n); + auto range = prk::range(0,n); for (auto i : range) { for (auto j : range) { in[i*n+j] = static_cast(i+j); @@ -200,7 +200,7 @@ int main(int argc, char* argv[]) // compute L1 norm in parallel double norm = 0.0; - auto inside = boost::irange(radius,n-radius); + auto inside = prk::range(radius,n-radius); for (auto i : inside) { for (auto j : inside) { norm += std::fabs(out[i*n+j]); diff --git a/Cxx11/stencil_pgnu.hpp b/Cxx11/stencil_pgnu.hpp index c1236b120..d6c1ee3eb 100644 --- a/Cxx11/stencil_pgnu.hpp +++ b/Cxx11/stencil_pgnu.hpp @@ -1,5 +1,5 @@ void star1(const int n, const int t, std::vector & in, std::vector & out) { - auto inside = boost::irange(1,n-1); + auto inside = prk::range(1,n-1); __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5 @@ -11,7 +11,7 @@ void star1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(2,n-2); + auto inside = prk::range(2,n-2); __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125 @@ -27,7 +27,7 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(3,n-3); + auto inside = prk::range(3,n-3); __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556 @@ -47,7 +47,7 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(4,n-4); + auto inside = 
prk::range(4,n-4); __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125 @@ -71,7 +71,7 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(5,n-5); + auto inside = prk::range(5,n-5); __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02 @@ -99,7 +99,7 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(1,n-1); + auto inside = prk::range(1,n-1); __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25 @@ -114,7 +114,7 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(2,n-2); + auto inside = prk::range(2,n-2); __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625 @@ -143,7 +143,7 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(3,n-3); + auto inside = prk::range(3,n-3); __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778 @@ -194,7 +194,7 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(4,n-4); + auto inside = prk::range(4,n-4); __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( 
std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625 @@ -275,7 +275,7 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(5,n-5); + auto inside = prk::range(5,n-5); __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01 diff --git a/Cxx11/stencil_pstl.hpp b/Cxx11/stencil_pstl.hpp index 70ad6cf66..8713da4d8 100644 --- a/Cxx11/stencil_pstl.hpp +++ b/Cxx11/stencil_pstl.hpp @@ -1,5 +1,5 @@ void star1(const int n, const int t, std::vector & in, std::vector & out) { - auto inside = boost::irange(1,n-1); + auto inside = prk::range(1,n-1); std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5 @@ -11,7 +11,7 @@ void star1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(2,n-2); + auto inside = prk::range(2,n-2); std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125 @@ -27,7 +27,7 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(3,n-3); + auto inside = prk::range(3,n-3); std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556 @@ -47,7 +47,7 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(4,n-4); + auto inside = 
prk::range(4,n-4); std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125 @@ -71,7 +71,7 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(5,n-5); + auto inside = prk::range(5,n-5); std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02 @@ -99,7 +99,7 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(1,n-1); + auto inside = prk::range(1,n-1); std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25 @@ -114,7 +114,7 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(2,n-2); + auto inside = prk::range(2,n-2); std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625 @@ -143,7 +143,7 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(3,n-3); + auto inside = prk::range(3,n-3); std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778 @@ -194,7 +194,7 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - 
auto inside = boost::irange(4,n-4); + auto inside = prk::range(4,n-4); std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625 @@ -275,7 +275,7 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(5,n-5); + auto inside = prk::range(5,n-5); std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01 diff --git a/Cxx11/stencil_rangefor.hpp b/Cxx11/stencil_rangefor.hpp index b6bf57581..c85964181 100644 --- a/Cxx11/stencil_rangefor.hpp +++ b/Cxx11/stencil_rangefor.hpp @@ -1,5 +1,5 @@ void star1(const int n, const int t, std::vector & in, std::vector & out) { - auto inside = boost::irange(1,n-1); + auto inside = prk::range(1,n-1); for (auto i : inside) { PRAGMA_SIMD for (auto j : inside) { @@ -14,7 +14,7 @@ void star1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(2,n-2); + auto inside = prk::range(2,n-2); for (auto i : inside) { PRAGMA_SIMD for (auto j : inside) { @@ -33,7 +33,7 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(3,n-3); + auto inside = prk::range(3,n-3); for (auto i : inside) { PRAGMA_SIMD for (auto j : inside) { @@ -56,7 +56,7 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(4,n-4); + auto inside = prk::range(4,n-4); for (auto i : inside) { PRAGMA_SIMD for (auto j : inside) { @@ -83,7 +83,7 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(5,n-5); + auto inside = 
prk::range(5,n-5); for (auto i : inside) { PRAGMA_SIMD for (auto j : inside) { @@ -114,7 +114,7 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(1,n-1); + auto inside = prk::range(1,n-1); for (auto i : inside) { PRAGMA_SIMD for (auto j : inside) { @@ -132,7 +132,7 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(2,n-2); + auto inside = prk::range(2,n-2); for (auto i : inside) { PRAGMA_SIMD for (auto j : inside) { @@ -164,7 +164,7 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(3,n-3); + auto inside = prk::range(3,n-3); for (auto i : inside) { PRAGMA_SIMD for (auto j : inside) { @@ -218,7 +218,7 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(4,n-4); + auto inside = prk::range(4,n-4); for (auto i : inside) { PRAGMA_SIMD for (auto j : inside) { @@ -302,7 +302,7 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(5,n-5); + auto inside = prk::range(5,n-5); for (auto i : inside) { PRAGMA_SIMD for (auto j : inside) { diff --git a/Cxx11/stencil_stl.hpp b/Cxx11/stencil_stl.hpp index ecde3e1ce..4dcdde467 100644 --- a/Cxx11/stencil_stl.hpp +++ b/Cxx11/stencil_stl.hpp @@ -1,5 +1,5 @@ void star1(const int n, const int t, std::vector & in, std::vector & out) { - auto inside = boost::irange(1,n-1); + auto inside = prk::range(1,n-1); std::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5 @@ -11,7 +11,7 @@ void star1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(2,n-2); + auto inside = prk::range(2,n-2); 
std::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125 @@ -27,7 +27,7 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(3,n-3); + auto inside = prk::range(3,n-3); std::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556 @@ -47,7 +47,7 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(4,n-4); + auto inside = prk::range(4,n-4); std::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125 @@ -71,7 +71,7 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(5,n-5); + auto inside = prk::range(5,n-5); std::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02 @@ -99,7 +99,7 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(1,n-1); + auto inside = prk::range(1,n-1); std::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25 @@ -114,7 +114,7 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(2,n-2); + auto inside = prk::range(2,n-2); std::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-2)*n+(j+-2)] * 
-0.0625 @@ -143,7 +143,7 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(3,n-3); + auto inside = prk::range(3,n-3); std::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778 @@ -194,7 +194,7 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(4,n-4); + auto inside = prk::range(4,n-4); std::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625 @@ -275,7 +275,7 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - auto inside = boost::irange(5,n-5); + auto inside = prk::range(5,n-5); std::for_each( std::begin(inside), std::end(inside), [&] (int i) { std::for_each( std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01 diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc index 8b9734200..222322bd8 100644 --- a/Cxx11/transpose-vector-pstl.cc +++ b/Cxx11/transpose-vector-pstl.cc @@ -105,7 +105,7 @@ int main(int argc, char * argv[]) // fill A with the sequence 0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); - auto range = boost::irange(0,order); + auto range = prk::range(0,order); auto trans_time = 0.0; diff --git a/Cxx11/transpose-vector-rangefor.cc b/Cxx11/transpose-vector-rangefor.cc index e02047a6d..3d2e4f9f1 100644 --- a/Cxx11/transpose-vector-rangefor.cc +++ b/Cxx11/transpose-vector-rangefor.cc @@ -109,17 +109,17 @@ int main(int argc, char * argv[]) // fill A with the sequence 0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); - auto itrange = boost::irange(0,order,tile_size); - auto jtrange = boost::irange(0,order,tile_size); + auto 
itrange = prk::range(0,order,tile_size); + auto jtrange = prk::range(0,order,tile_size); for (auto iter = 0; iter<=iterations; iter++) { if (iter==1) trans_time = prk::wtime(); for (auto it : itrange) { - auto irange = boost::irange(it,std::min(order,it+tile_size)); + auto irange = prk::range(it,std::min(order,it+tile_size)); for (auto jt : jtrange) { - auto jrange = boost::irange(jt,std::min(order,jt+tile_size)); + auto jrange = prk::range(jt,std::min(order,jt+tile_size)); for (auto i : irange) { for (auto j : jrange) { B[i*order+j] += A[j*order+i]; @@ -136,10 +136,10 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// // TODO: replace with std::generate, std::accumulate, or similar - const auto addit = (iterations+1.) * (iterations/2.); - auto abserr = 0.0; - auto irange = boost::irange(0,order); - auto jrange = boost::irange(0,order); + auto const addit = (iterations+1.) * (iterations/2.); + double abserr(0); + auto irange = prk::range(0,order); + auto jrange = prk::range(0,order); for (auto i : irange) { for (auto j : jrange) { const int ij = i*order+j; diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 3b3e6413a..d1a56e5bd 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -46,14 +46,14 @@ OPENCLFLAG=-framework OpenCL # # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-#SYCLDIR=./triSYCL -#SYCLCXX=${CXX} ${OPENMPFLAG} -#SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include +SYCLDIR=./triSYCL +SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG} +SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include # ProGTX # https://github.com/ProGTX/sycl-gtx -SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx -SYCLCXX=${CXX} ${OPENMPFLAG} -SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx +#SYCLCXX=${CXX} ${OPENMPFLAG} +#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} # # OCCA # @@ -65,13 +65,15 @@ CILKFLAG=-fcilkplus # # TBB # -TBBDIR=/usr/local/Cellar/tbb/2018_U2 +TBBDIR=/usr/local/Cellar/tbb/2018_U3_1 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. # -BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include -PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include +BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include +RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/gcc KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} RAJADIR=/opt/raja/gcc diff --git a/common/make.defs.intel b/common/make.defs.intel index 7ecd87ead..0dea4bb44 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -45,12 +45,12 @@ OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
SYCLDIR=./triSYCL SYCLCXX=${CXX} ${OPENMPFLAG} -SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include +SYCLFLAG=-DUSE_SYCL -std=gnu++14 -I$(SYCLDIR)/include # ProGTX # https://github.com/ProGTX/sycl-gtx -SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx -SYCLCXX=${CXX} ${OPENMPFLAG} -SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx +#SYCLCXX=${CXX} ${OPENMPFLAG} +#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} # # OCCA # @@ -66,8 +66,10 @@ TBBFLAG=-DUSE_TBB -tbb # # Parallel STL, Boost, etc. # -BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include -PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} +BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include +RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/intel KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl RAJADIR=/opt/raja/intel diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 0ae50e78b..cdbd6510a 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -75,8 +75,8 @@ SYCLFLAG+=-std=c++14 # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
SYCLDIR=./triSYCL -SYCLCXX=${CXX} ${OPENMPFLAG} -SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include +SYCLCXX=${CXX} -std=gnu++14 ${OPENMPFLAG} +SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include $(DEFAULT_OPT_FLAGS) # ProGTX # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx @@ -84,17 +84,20 @@ SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include #SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} # # OCCA +# OCCADIR=${HOME}/prk-repo/Cxx11/occa # # TBB # -TBBDIR=/usr/local/Cellar/tbb/2018_U3 +TBBDIR=/usr/local/Cellar/tbb/2018_U3_1 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. # -BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I/usr/local/Cellar/boost/1.65.1/include -PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include +BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include +RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl RAJADIR=/opt/raja/clang diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 73883df11..6c1e1a1ca 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -18,7 +18,7 @@ case "$os" in export MPI_ROOT=/usr/local ;; Linux) - export MPI_ROOT=$TRAVIS_ROOT + export MPI_ROOT=${TRAVIS_ROOT} ;; esac @@ -68,7 +68,7 @@ case "$PRK_TARGET" in export JULIA_PATH=/usr/local/bin/ ;; Linux) - export JULIA_PATH=$TRAVIS_ROOT/julia/bin/ + export JULIA_PATH=${TRAVIS_ROOT}/julia/bin/ ;; esac ${JULIA_PATH}julia --version @@ -442,11 +442,10 @@ case "$PRK_TARGET" in esac # Boost.Compute found after OpenCL, and only available in Travis with MacOS. 
- if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then - echo "BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE" >> common/make.defs - else - echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs - fi + echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs + + #echo "RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}" >> common/make.defs + echo "RANGEFLAG=-DUSE_RANGES_TS -I${TRAVIS_ROOT}/range-v3/include" >> common/make.defs # C++11 with rangefor and Boost.Ranges make -C $PRK_TARGET_PATH rangefor @@ -511,9 +510,9 @@ case "$PRK_TARGET" in if [ "${CC}" = "gcc" ] || [ "${TRAVIS_OS_NAME}" = "osx" ] ; then if [ "${CC}" = "clang" ] ; then # omp.h not found with clang-3.9 - just work around instead of fixing. - echo "PSTLFLAG=-DUSE_PSTL ${TBBFLAG} -DUSE_INTEL_PSTL -I$TRAVIS_ROOT/pstl/include" >> common/make.defs + echo "PSTLFLAG=-DUSE_PSTL ${TBBFLAG} -DUSE_INTEL_PSTL -I${TRAVIS_ROOT}/pstl/include ${RANGEFLAG}" >> common/make.defs else - echo "PSTLFLAG=-DUSE_PSTL -fopenmp ${TBBFLAG} -DUSE_INTEL_PSTL -I$TRAVIS_ROOT/pstl/include" >> common/make.defs + echo "PSTLFLAG=-DUSE_PSTL -fopenmp ${TBBFLAG} -DUSE_INTEL_PSTL -I${TRAVIS_ROOT}/pstl/include ${RANGEFLAG}" >> common/make.defs fi make -C $PRK_TARGET_PATH p2p-hyperplane-vector-pstl stencil-vector-pstl transpose-vector-pstl nstream-vector-pstl $PRK_TARGET_PATH/p2p-hyperplane-vector-pstl 10 1024 1 @@ -666,7 +665,7 @@ case "$PRK_TARGET" in # Homebrew installs a symlink in /usr/local/bin export PRK_CAFC=/usr/local/bin/caf elif [ "${TRAVIS_OS_NAME}" = "linux" ] ; then - export PRK_CAFC=$TRAVIS_ROOT/opencoarrays/bin/caf + export PRK_CAFC=${TRAVIS_ROOT}/opencoarrays/bin/caf fi echo "CAFC=$PRK_CAFC -std=f2008 -cpp" >> common/make.defs echo "COARRAYFLAG=-fcoarray=single" >> common/make.defs @@ -745,7 +744,7 @@ case "$PRK_TARGET" in export PRK_OVERSUBSCRIBE="--oversubscribe" export TMPDIR=/tmp elif [ "${TRAVIS_OS_NAME}" = "linux" ] ; then - export PRK_LAUNCHER=$TRAVIS_ROOT/opencoarrays/bin/cafrun + export PRK_LAUNCHER=${TRAVIS_ROOT}/opencoarrays/bin/cafrun fi $PRK_LAUNCHER 
-n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-} $PRK_TARGET_PATH/p2p-coarray 10 1024 1024 $PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-} $PRK_TARGET_PATH/stencil-coarray 10 1000 @@ -901,13 +900,13 @@ case "$PRK_TARGET" in allshmem) echo "SHMEM" # This should be fixed by rpath (https://github.com/regrant/sandia-shmem/issues/83) - export LD_LIBRARY_PATH=$TRAVIS_ROOT/sandia-openshmem/lib:$TRAVIS_ROOT/libfabric/lib:$LD_LIBRARY_PATH - export SHMEM_ROOT=$TRAVIS_ROOT/sandia-openshmem + export LD_LIBRARY_PATH=${TRAVIS_ROOT}/sandia-openshmem/lib:${TRAVIS_ROOT}/libfabric/lib:$LD_LIBRARY_PATH + export SHMEM_ROOT=${TRAVIS_ROOT}/sandia-openshmem echo "SHMEMTOP=$SHMEM_ROOT\nSHMEMCC=$SHMEM_ROOT/bin/oshcc" >> common/make.defs make $PRK_TARGET export PRK_TARGET_PATH=SHMEM export PRK_SHMEM_PROCS=4 - export OSHRUN_LAUNCHER=$TRAVIS_ROOT/hydra/bin/mpirun + export OSHRUN_LAUNCHER=${TRAVIS_ROOT}/hydra/bin/mpirun export PRK_LAUNCHER=$SHMEM_ROOT/bin/oshrun $PRK_LAUNCHER -n $PRK_SHMEM_PROCS $PRK_TARGET_PATH/Synch_p2p/p2p 10 1024 1024 $PRK_LAUNCHER -n $PRK_SHMEM_PROCS $PRK_TARGET_PATH/Stencil/stencil 10 1000 @@ -921,14 +920,14 @@ case "$PRK_TARGET" in case "$CC" in gcc) # If building from source (impossible) - #export UPC_ROOT=$TRAVIS_ROOT/gupc + #export UPC_ROOT=${TRAVIS_ROOT}/gupc # If installing deb file - export UPC_ROOT=$TRAVIS_ROOT/gupc/usr/local/gupc + export UPC_ROOT=${TRAVIS_ROOT}/gupc/usr/local/gupc ;; clang) echo "Clang UPC is not supported." 
exit 9 - export UPC_ROOT=$TRAVIS_ROOT/clupc + export UPC_ROOT=${TRAVIS_ROOT}/clupc ;; esac echo "UPCC=$UPC_ROOT/bin/upc" >> common/make.defs @@ -937,7 +936,7 @@ case "$PRK_TARGET" in make $PRK_TARGET ;; bupc) - export UPC_ROOT=$TRAVIS_ROOT/bupc-$CC + export UPC_ROOT=${TRAVIS_ROOT}/bupc-$CC echo "UPCC=$UPC_ROOT/bin/upcc" >> common/make.defs # -N $nodes -n UPC threads -c $cores_per_node # -localhost is only for UDP @@ -947,7 +946,7 @@ case "$PRK_TARGET" in ;; ofi) export GASNET_SSH_SERVERS="localhost" - export LD_LIBRARY_PATH="$TRAVIS_ROOT/libfabric/lib:$LD_LIBRARY_PATH" + export LD_LIBRARY_PATH="${TRAVIS_ROOT}/libfabric/lib:$LD_LIBRARY_PATH" export PRK_LAUNCHER="$UPC_ROOT/bin/upcrun -v -N 1 -n $PRK_UPC_PROCS -c $PRK_UPC_PROCS" ;; mpi) @@ -978,12 +977,12 @@ case "$PRK_TARGET" in os=`uname` case "$os" in Darwin) - export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-darwin-x86_64-smp + export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-darwin-x86_64-smp ;; Linux) - #export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-linux-x86_64 - export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-linux-x86_64-smp - #export CHARM_ROOT=$TRAVIS_ROOT/charm/multicore-linux64 + #export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64 + export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64-smp + #export CHARM_ROOT=${TRAVIS_ROOT}/charm/multicore-linux64 ;; esac echo "CHARMTOP=$CHARM_ROOT" >> common/make.defs @@ -1002,12 +1001,12 @@ case "$PRK_TARGET" in os=`uname` case "$os" in Darwin) - export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-darwin-x86_64-smp + export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-darwin-x86_64-smp ;; Linux) - #export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-linux-x86_64 - export CHARM_ROOT=$TRAVIS_ROOT/charm/netlrts-linux-x86_64-smp - #export CHARM_ROOT=$TRAVIS_ROOT/charm/multicore-linux64 + #export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64 + export CHARM_ROOT=${TRAVIS_ROOT}/charm/netlrts-linux-x86_64-smp + #export CHARM_ROOT=${TRAVIS_ROOT}/charm/multicore-linux64 ;; esac echo 
"CHARMTOP=$CHARM_ROOT" >> common/make.defs @@ -1036,7 +1035,7 @@ case "$PRK_TARGET" in ;; allfgmpi) echo "Fine-Grain MPI (FG-MPI)" - export FGMPI_ROOT=$TRAVIS_ROOT/fgmpi + export FGMPI_ROOT=${TRAVIS_ROOT}/fgmpi echo "FGMPITOP=$FGMPI_ROOT\nFGMPICC=$FGMPI_ROOT/bin/mpicc -std=c99" >> common/make.defs make $PRK_TARGET export PRK_TARGET_PATH=FG_MPI @@ -1060,11 +1059,11 @@ case "$PRK_TARGET" in allgrappa) echo "Grappa" ######################## - #. $TRAVIS_ROOT/grappa/bin/settings.sh - export GRAPPA_PREFIX=$TRAVIS_ROOT/grappa - export SCRIPT_PATH=$TRAVIS_ROOT/grappa/bin + #. ${TRAVIS_ROOT}/grappa/bin/settings.sh + export GRAPPA_PREFIX=${TRAVIS_ROOT}/grappa + export SCRIPT_PATH=${TRAVIS_ROOT}/grappa/bin ######################## - echo "GRAPPATOP=$TRAVIS_ROOT/grappa" >> common/make.defs + echo "GRAPPATOP=${TRAVIS_ROOT}/grappa" >> common/make.defs make $PRK_TARGET export PRK_TARGET_PATH=GRAPPA export PRK_MPI_PROCS=2 @@ -1087,7 +1086,7 @@ case "$PRK_TARGET" in ;; alllegion) echo "Legion" - echo "LEGIONTOP=$TRAVIS_ROOT/legion" > common/make.defs + echo "LEGIONTOP=${TRAVIS_ROOT}/legion" > common/make.defs make $PRK_TARGET -k ;; esac diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 3917e2fec..a82df34cc 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -61,6 +61,7 @@ case "$PRK_TARGET" in fi sh ./travis/install-tbb.sh $TRAVIS_ROOT sh ./travis/install-pstl.sh $TRAVIS_ROOT + sh ./travis/install-ranges.sh $TRAVIS_ROOT # Boost is whitelisted and obtained from package manager if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then sh ./travis/install-boost.sh $TRAVIS_ROOT diff --git a/travis/install-ranges.sh b/travis/install-ranges.sh new file mode 100644 index 000000000..fda3e48aa --- /dev/null +++ b/travis/install-ranges.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +set -e +set -x + +TRAVIS_ROOT="$1" + +git clone --depth 1 https://github.com/ericniebler/range-v3.git $TRAVIS_ROOT/range-v3 From e2e2c94d591ec9533bec5a4ccfd8ddc92abd794d Mon Sep 17 00:00:00 2001 From: Jeff 
Hammond Date: Tue, 1 May 2018 11:51:16 -0700 Subject: [PATCH 082/245] make ranges TS the default for GCC and Clang. (#339) ICC 18.0.1 bug report has been filed for inability to compile the ranges TS implementation. we leave Boost as the default there. fixed a bug identified by ICC in prk_util.h --- Cxx11/prk_util.h | 2 +- common/make.defs.gcc | 4 ++-- common/make.defs.intel | 4 ++-- common/make.defs.llvm | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index e1576e3a1..868f8b8c9 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -268,7 +268,7 @@ namespace prk { template auto range(S start, E end, B blocking) { #if defined(USE_BOOST_IRANGE) - return boost::irange(static_cast(start), end, decltype(end)>(blocking) ); + return boost::irange(static_cast(start), end, static_cast(blocking) ); #elif defined(USE_RANGES_TS) // NOTE: // iota(s) | slice(s,e) | stride(b) is faster than diff --git a/common/make.defs.gcc b/common/make.defs.gcc index d1a56e5bd..5477603e0 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -71,8 +71,8 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. # BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include -RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/gcc KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} diff --git a/common/make.defs.intel b/common/make.defs.intel index 0dea4bb44..a4d515ba0 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -66,9 +66,9 @@ TBBFLAG=-DUSE_TBB -tbb # # Parallel STL, Boost, etc.
# -BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include +#BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/intel KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl diff --git a/common/make.defs.llvm b/common/make.defs.llvm index cdbd6510a..481a624ea 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -95,8 +95,8 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # Parallel STL, Boost, etc. # BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include -RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl From b123d147652d056f83f3ffbcadb4244fc0ac6e73 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 1 May 2018 11:53:02 -0700 Subject: [PATCH 083/245] p2p openmp tasks improved (#338) * improved p2p w/ OpenMP Tasks (C, C++, Fortran) instead of an N:1 and 1:N dependency on grid[0], just do taskwait at the end of the wavefront sweep. also remove the unnecessary diagonal dependency, which is implied by the horizontal and vertical dependencies. 
Author: Vishakha Agrawal of Intel * interchange loops to match Fortran order * add restrict to sweep_tile in C and C++ * add restrict pointer to grid.data() in C++ * replace auto with int in C++ --- C1z/p2p-simd-openmp.c | 2 +- C1z/p2p-sse.c | 2 +- C1z/p2p-tasks-openmp.c | 10 +++------- C1z/p2p.c | 2 +- Cxx11/p2p-tasks-openmp.cc | 30 +++++++++++++----------------- Cxx11/p2p-vector.cc | 30 ++++++++++++++++-------------- FORTRAN/p2p-tasks-openmp.f90 | 15 +++++---------- 7 files changed, 40 insertions(+), 51 deletions(-) diff --git a/C1z/p2p-simd-openmp.c b/C1z/p2p-simd-openmp.c index 6a02b96a1..a9444d02f 100644 --- a/C1z/p2p-simd-openmp.c +++ b/C1z/p2p-simd-openmp.c @@ -63,7 +63,7 @@ static inline void sweep_tile(int startm, int endm, int startn, int endn, - int n, double grid[]) + int n, double grid[restrict]) { for (int i=startm; i(j); } - for (auto i=0; i(i); } - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if (iter==1) pipeline_time = prk::wtime(); - for (auto i=1; i & grid) + int n, double * RESTRICT grid) { - for (auto i=startm; i(j); } - for (auto i=0; i(i); } - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if (iter==1) pipeline_time = prk::wtime(); + double * RESTRICT pgrid = grid.data(); + if (mc==m && nc==n) { - for (auto i=1; i Date: Thu, 3 May 2018 19:05:41 -0500 Subject: [PATCH 084/245] TBB Flow Graph version of p2p (#340) * Add TBB Flow Graph version of p2p Pipeline kernel * add target to Makefile * Add TBB flags for Flow Graph Analyzer tracing support * replace block_node_body with lambda expression --- Cxx11/Makefile | 4 +- Cxx11/p2p-tasks-tbb.cc | 271 +++++++++++++++++++++++++++++++++++++++++ common/make.defs.intel | 1 + 3 files changed, 274 insertions(+), 2 deletions(-) create mode 100644 Cxx11/p2p-tasks-tbb.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 3a26ead0b..484a232b4 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -69,7 +69,7 @@ 
all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sy p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ - p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc + p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc p2p-tasks-tbb stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \ stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \ @@ -102,7 +102,7 @@ opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \ - p2p-hyperplane-vector-tbb + p2p-hyperplane-vector-tbb p2p-tasks-tbb stl: stencil-vector-stl transpose-vector-stl nstream-vector-stl diff --git a/Cxx11/p2p-tasks-tbb.cc b/Cxx11/p2p-tasks-tbb.cc new file mode 100644 index 000000000..3142a1e42 --- /dev/null +++ b/Cxx11/p2p-tasks-tbb.cc @@ -0,0 +1,271 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an m*n grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// C99-ification by Jeff Hammond, February 2016. +/// C++11-ification by Jeff Hammond, May 2017. +/// TBB implementation by Pablo Reble, April 2018. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#include "tbb/flow_graph.h" +#include "tbb/parallel_for.h" + +inline void sweep_tile(int startm, int endm, + int startn, int endn, + int n, double grid[]) +{ + for (auto i=startm; i [ ]"; + } + + // number of times to run the pipeline algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // grid dimensions + m = std::atoi(argv[2]); + n = std::atoi(argv[3]); + if (m < 1 || n < 1) { + throw "ERROR: grid dimensions must be positive"; + } else if ( static_cast(m)*static_cast(n) > INT_MAX) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // grid chunk dimensions + mc = (argc > 4) ? std::atoi(argv[4]) : m; + nc = (argc > 5) ? std::atoi(argv[5]) : n; + if (mc < 1 || mc > m || nc < 1 || nc > n) { + std::cout << "WARNING: grid chunk dimensions invalid: " << mc << nc << " (ignoring)" << std::endl; + mc = m; + nc = n; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + const char* envvar = std::getenv("TBB_NUM_THREADS"); + int num_threads = (envvar!=NULL) ? std::atoi(envvar) : tbb::task_scheduler_init::default_num_threads(); + tbb::task_scheduler_init init(num_threads); + + std::cout << "Number of threads = " << num_threads << std::endl; + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid sizes = " << m << ", " << n << std::endl; + std::cout << "Grid chunk sizes = " << mc << ", " << nc << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Create Grid and allocate space + ////////////////////////////////////////////////////////////////////// + // calculate number of tiles in n and m direction to create grid. 
+ int num_blocks_n = (n / nc); + if(n%nc != 0) num_blocks_n++; + int num_blocks_m = (m / mc); + if(m%mc != 0) num_blocks_m++; + + auto pipeline_time = 0.0; // silence compiler warning + + double * grid = new double[m*n]; + + typedef tbb::flow::continue_node< tbb::flow::continue_msg > block_node_t; + + graph g; + block_node_t *nodes[ num_blocks_n * num_blocks_m ]; + // To enable tracing support for Flow Graph Analyzer + // set following MACRO and link against TBB preview library (-ltbb_preview) +#if TBB_PREVIEW_FLOW_GRAPH_TRACE + char buffer[1024]; + g.set_name("Pipeline"); +#endif + + bool first_iter=true; + block_node_t b(g, [&](const tbb::flow::continue_msg &){ + grid[0*n+0] = -grid[(m-1)*n+(n-1)]; + if(first_iter) pipeline_time = prk::wtime(); + first_iter = false; + }); + for (int i=0; iset_name( buffer ); +#endif + nodes[i*num_blocks_n + j] = tmp; + if (i>0) + make_edge(*nodes[(i-1)*num_blocks_n + j ], *tmp ); + if (j>0) + make_edge(*nodes[ i *num_blocks_n + j-1], *tmp ); + // Transitive dependencies from OpenMP task version: + //make_edge( *tmp, b ); + //if (i>0 && j>0) + // make_edge(*nodes[(i-1)*num_blocks_n + j-1], *tmp ); + } + } + auto start = true; + source_node s(g, [&](continue_msg &v) -> bool { + if(start) { + v = continue_msg(); + start = false; + return true; + } + return false; + }, false); + + limiter_node l(g, iterations+1, 1); + + make_edge( s, l ); + make_edge( l, *nodes[0] ); + make_edge( *nodes[(num_blocks_n * num_blocks_m) - 1], b); + make_edge( b, l ); + +#if TBB_PREVIEW_FLOW_GRAPH_TRACE + s.set_name("Source"); + b.set_name("Iteration Barrier"); + l.set_name("Limiter"); +#endif + + ////////////////////////////////////////////////////////////////////// + // Perform the computation + ////////////////////////////////////////////////////////////////////// + + { + + tbb::blocked_range2d range(0, m, mc, 0, n, nc); + tbb::parallel_for( range, [&](decltype(range)& r) { + for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { + for (auto 
j=r.cols().begin(); j!=r.cols().end(); ++j ) { + grid[i*n+j] = 0.0; + } + } + }, tbb_partitioner); + for (auto j=0; j(j); + } + for (auto i=0; i(i); + } + + s.activate(); + g.wait_for_all(); + + pipeline_time = prk::wtime() - pipeline_time; + + } + + ////////////////////////////////////////////////////////////////////// + // Cleanup Flow Graph + ////////////////////////////////////////////////////////////////////// + + for (int i=0; i epsilon) { + std::cout << "ERROR: checksum " << grid[(m-1)*n+(n-1)] + << " does not match verification value " << corner_val << std::endl; + return 1; + } + +#ifdef VERBOSE + std::cout << "Solution validates; verification value = " << corner_val << std::endl; +#else + std::cout << "Solution validates" << std::endl; +#endif + auto avgtime = pipeline_time/iterations; + std::cout << "Rate (MFlops/s): " + << 2.0e-6 * ( (m-1.)*(n-1.) )/avgtime + << " Avg time (s): " << avgtime << std::endl; + + return 0; +} diff --git a/common/make.defs.intel b/common/make.defs.intel index a4d515ba0..556d940b6 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -63,6 +63,7 @@ CILKFLAG=-intel-extensions # default # TBB # TBBFLAG=-DUSE_TBB -tbb +#TBBFLAG=-DUSE_TBB -tbb_preview -DTBB_PREVIEW_FLOW_GRAPH_TRACE # # Parallel STL, Boost, etc. 
# From 59b80af2fe981f1d094768eda45d3db2b8b6a6dc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 3 May 2018 17:06:20 -0700 Subject: [PATCH 085/245] TBB flowgraph (#341) * Add TBB Flow Graph version of p2p Pipeline kernel * add target to Makefile * Add TBB flags for Flow Graph Analyzer tracing support * replace block_node_body with lambda expression * add Travis for TBB tasks (flowgraph) --- travis/build-run-prk.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 6c1e1a1ca..350bfb353 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -480,6 +480,7 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/p2p-innerloop-vector-tbb 10 1024 $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb 10 1024 1 $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb 10 1024 32 + $PRK_TARGET_PATH/p2p-tasks-tbb 10 1024 1024 32 32 $PRK_TARGET_PATH/stencil-vector-tbb 10 1000 $PRK_TARGET_PATH/transpose-vector-tbb 10 1024 32 $PRK_TARGET_PATH/nstream-vector-tbb 10 16777216 32 From eea8494ffafe56c6cbf000c285a5b09b1d419443 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 10 May 2018 15:04:17 -0700 Subject: [PATCH 086/245] build p2p-tasks-tbb --- travis/build-run-prk.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 350bfb353..00b4395e6 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -476,7 +476,7 @@ case "$PRK_TARGET" in export LD_LIBRARY_PATH=${TBBROOT}/lib:${LD_LIBRARY_PATH} ;; esac - make -C $PRK_TARGET_PATH p2p-innerloop-vector-tbb p2p-hyperplane-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb + make -C $PRK_TARGET_PATH p2p-innerloop-vector-tbb p2p-hyperplane-vector-tbb p2p-tasks-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb $PRK_TARGET_PATH/p2p-innerloop-vector-tbb 10 1024 $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb 10 1024 1 $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb 10 1024 32 From 
f09025bb86003f178f7a6213d6cde99634072a73 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 10 May 2018 15:52:18 -0700 Subject: [PATCH 087/245] Thrust nstream (#342) * Thrust transpose working with host vector, device still busted * Thrust nstream working with host and device --- .gitignore | 4 + Cxx11/Makefile | 19 +++- Cxx11/nstream-device-thrust.cu | 179 +++++++++++++++++++++++++++++ Cxx11/nstream-host-thrust.cc | 179 +++++++++++++++++++++++++++++ Cxx11/prk_util.h | 15 +++ Cxx11/transpose-device-thrust.cu | 188 +++++++++++++++++++++++++++++++ Cxx11/transpose-host-thrust.cc | 156 +++++++++++++++++++++++++ common/make.defs.gcc | 8 +- 8 files changed, 743 insertions(+), 5 deletions(-) create mode 100644 Cxx11/nstream-device-thrust.cu create mode 100644 Cxx11/nstream-host-thrust.cc create mode 100644 Cxx11/transpose-device-thrust.cu create mode 100644 Cxx11/transpose-host-thrust.cc diff --git a/.gitignore b/.gitignore index d5100141d..6cd0e154b 100644 --- a/.gitignore +++ b/.gitignore @@ -179,7 +179,11 @@ Cxx11/transpose-vector-raja Cxx11/transpose-vector-rangefor Cxx11/transpose-vector-tbb Cxx11/transpose-vector-taskloop +Cxx11/transpose-vector-async +Cxx11/transpose-vector-thread Cxx11/transpose-kokkos +Cxx11/transpose-device-thrust +Cxx11/transpose-host-thrust Cxx11/transpose-cublas Cxx11/transpose-cuda Cxx11/grid1.cl diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 484a232b4..07be4fa0b 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -45,7 +45,8 @@ RANGEFLAGS = -DUSE_RANGES $(RANGEFLAG) STLFLAGS = $(STLFLAG) $(RANGEFLAGS) PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) RAJAFLAGS = $(RAJAFLAG) -KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) +THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST +KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) ORNLACCFLAGS = $(ORNLACCFLAG) ifdef OCCADIR @@ -54,7 +55,7 @@ endif OCCAFLAGS = -DUSE_OCCA -I${OCCADIR}/include -Wl,-rpath -Wl,${OCCADIR}/lib -L${OCCADIR}/lib -locca .PHONY: all clean vector valarray 
openmp target opencl taskloop tbb stl pstl \ - rangefor kokkos raja cuda cublas sycl boost-compute + rangefor kokkos raja cuda cublas sycl boost-compute thrust EXTRA= ifeq ($(shell uname -s),Darwin) @@ -116,6 +117,11 @@ raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-r cuda: stencil-cuda transpose-cuda nstream-cuda +thrust: nstream-host-thrust nstream-device-thrust \ + transpose-host-thrust transpose-device-thrust + +cuda: transpose-cuda + cublas: transpose-cublas nstream-cublas dgemm-cublas occa: transpose-occa nstream-occa @@ -173,6 +179,14 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use OpenMP) $(CXX) $(CXXFLAGS) $< $(KOKKOSFLAGS) -o $@ +# for host execution +%-thrust: %-thrust.cc prk_util.h + $(CXX) $(CXXFLAGS) $< $(THRUSTFLAGS) -o $@ + +# for device execution (must compiler as .cu) +%-thrust: %-thrust.cu prk_util.h + $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< $(THRUSTFLAGS) -o $@ + %-cuda: %-cuda.cu prk_util.h prk_cuda.h $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -o $@ @@ -218,6 +232,7 @@ clean: -rm -f *-rangefor -rm -f *-raja -rm -f *-kokkos + -rm -f *-thrust -rm -f *-cuda -rm -f *-cublas -rm -f *-cblas diff --git a/Cxx11/nstream-device-thrust.cu b/Cxx11/nstream-device-thrust.cu new file mode 100644 index 000000000..7ea49dabd --- /dev/null +++ b/Cxx11/nstream-device-thrust.cu @@ -0,0 +1,179 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
+/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. 
+/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/Thrust STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto nstream_time = 0.0; + + thrust::device_vector A(length); + thrust::device_vector B(length); + thrust::device_vector C(length); + + auto range = prk::range(static_cast(0), length); + + double scalar(3); + { + thrust::fill(thrust::device, A.begin(), A.end(), 0.0); + thrust::fill(thrust::device, B.begin(), B.end(), 2.0); + thrust::fill(thrust::device, C.begin(), C.end(), 2.0); + + auto nstream = [=] __host__ __device__ (thrust::tuple t) { + thrust::get<0>(t) += thrust::get<1>(t) + scalar * thrust::get<2>(t); + }; + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) nstream_time = prk::wtime(); + + thrust::for_each( thrust::device, + thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())), + thrust::make_zip_iterator(thrust::make_tuple(A.end() , B.end() , C.end())), + nstream); + } + nstream_time = prk::wtime() - nstream_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (auto i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + //double asum = thrust::reduce(A.begin(), A.end(), 0.0, thrust::plus()); + double asum = thrust::transform_reduce(A.begin(), + A.end(), + [=] __host__ __device__ (double x) -> double { return fabs(x); }, + 0.0, + thrust::plus()); + + double 
epsilon(1.e-8); + if (std::fabs(ar-asum)/asum > epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/nstream-host-thrust.cc b/Cxx11/nstream-host-thrust.cc new file mode 100644 index 000000000..5bc29d145 --- /dev/null +++ b/Cxx11/nstream-host-thrust.cc @@ -0,0 +1,179 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/Thrust STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto nstream_time = 0.0; + + thrust::host_vector A(length); + thrust::host_vector B(length); + thrust::host_vector C(length); + + auto range = prk::range(static_cast(0), length); + + double scalar(3); + { + thrust::fill(thrust::host, A.begin(), A.end(), 0.0); + thrust::fill(thrust::host, B.begin(), B.end(), 2.0); + thrust::fill(thrust::host, C.begin(), C.end(), 2.0); + + auto nstream = [=] __host__ __device__ (thrust::tuple t) { + thrust::get<0>(t) += thrust::get<1>(t) + scalar * thrust::get<2>(t); + }; + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) nstream_time = prk::wtime(); + + 
thrust::for_each( thrust::host, + thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())), + thrust::make_zip_iterator(thrust::make_tuple(A.end() , B.end() , C.end())), + nstream); + } + nstream_time = prk::wtime() - nstream_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (auto i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + //double asum = thrust::reduce(A.begin(), A.end(), 0.0, thrust::plus()); + double asum = thrust::transform_reduce(A.begin(), + A.end(), + [=] __host__ __device__ (double x) -> double { return std::fabs(x); }, + 0.0, + thrust::plus()); + + double epsilon(1.e-8); + if (std::fabs(ar-asum)/asum > epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 868f8b8c9..76363d8e2 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -225,6 +225,21 @@ const T prk_reduce(I first, I last, T init) { # include "RAJA/RAJA.hpp" #endif +#ifdef USE_THRUST +# ifdef __NVCC__ +# include +# endif +# include +# include +# include +# include +# include +# include +# include +# include +# include +#endif + #ifdef USE_SYCL # include "CL/sycl.hpp" #endif diff --git a/Cxx11/transpose-device-thrust.cu b/Cxx11/transpose-device-thrust.cu new file mode 100644 index 000000000..b4c9a1874 --- /dev/null +++ 
b/Cxx11/transpose-device-thrust.cu @@ -0,0 +1,188 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. 
+/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +struct x : public thrust::unary_function +{ + int i; + int order; + thrust::device_vector & A; + thrust::device_vector & B; + + x(int i, int order, thrust::device_vector & A, thrust::device_vector & B) : + i(i), order(order), A(A), B(B) {} + + __host__ __device__ + void operator()(int j) + { + B[i*order+j] += A[j*order+i]; + A[j*order+i] += 1.0; + return; + } +}; + +//__device__ +void transpose(const int order, thrust::device_vector & A, thrust::device_vector & B) +{ + thrust::counting_iterator start(0); + thrust::counting_iterator end = start + order; + thrust::for_each( thrust::device, start, end, [=,&A,&B] (int i) { + thrust::for_each( thrust::device, start, end, x(i,order,A,B) ); + }); +} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/Thrust Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + // number of times to do the transpose + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // order of a the matrix + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { 
+ throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + thrust::device_vector A(order*order); + thrust::device_vector B(order*order); + // fill A with the sequence 0 to order^2-1 as doubles + thrust::sequence(thrust::device, A.begin(), A.end() ); + thrust::fill(thrust::device, B.begin(), B.end(), 0.0); + + auto range = boost::irange(0,order); + + auto trans_time = 0.0; + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) trans_time = prk::wtime(); + +#if 1 + transpose(order, A, B); +#else + thrust::for_each( std::begin(range), std::end(range), [=,&A,&B] (int i) { + thrust::for_each( std::begin(range), std::end(range), [=,&A,&B] (int j) { + B[i*order+j] += A[j*order+i]; + A[j*order+i] += 1.0; + }); + }); +#endif + } + trans_time = prk::wtime() - trans_time; + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // TODO: replace with std::generate, std::accumulate, or similar + const auto addit = (iterations+1.) 
* (iterations/2.); + auto abserr = 0.0; + for (auto i : range) { + for (auto j : range) { + const int ij = i*order+j; + const int ji = j*order+i; + const double reference = static_cast(ij)*(1.+iterations)+addit; + abserr += std::fabs(B[ji] - reference); + } + } + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const auto epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + auto bytes = (size_t)order * (size_t)order * sizeof(double); + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + + return 0; +} + + diff --git a/Cxx11/transpose-host-thrust.cc b/Cxx11/transpose-host-thrust.cc new file mode 100644 index 000000000..53066208b --- /dev/null +++ b/Cxx11/transpose-host-thrust.cc @@ -0,0 +1,156 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/Thrust Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + // number of times to do the transpose + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // order of a the matrix + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + thrust::host_vector A(order*order); + thrust::host_vector B(order*order); + // fill A with the sequence 0 to order^2-1 as doubles + thrust::sequence(thrust::host, A.begin(), A.end() ); + thrust::fill(thrust::host, B.begin(), B.end(), 0.0); + + auto range = boost::irange(0,order); + + auto trans_time = 0.0; + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) trans_time = prk::wtime(); + + // transpose + thrust::for_each( thrust::host, std::begin(range), std::end(range), [&] (int i) { + thrust::for_each( thrust::host, std::begin(range), std::end(range), [&] (int j) { + B[i*order+j] += 
A[j*order+i]; + A[j*order+i] += 1.0; + }); + }); + } + trans_time = prk::wtime() - trans_time; + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // TODO: replace with std::generate, std::accumulate, or similar + const auto addit = (iterations+1.) * (iterations/2.); + auto abserr = 0.0; + for (auto i : range) { + for (auto j : range) { + const int ij = i*order+j; + const int ji = j*order+i; + const double reference = static_cast(ij)*(1.+iterations)+addit; + abserr += std::fabs(B[ji] - reference); + } + } + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const auto epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + auto bytes = (size_t)order * (size_t)order * sizeof(double); + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + + return 0; +} + + diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 5477603e0..ad1f2fcf2 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -78,6 +78,8 @@ KOKKOSDIR=/opt/kokkos/gcc KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} RAJADIR=/opt/raja/gcc RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust +THRUSTFLAG=-DUSE_THRUST -I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP # # CBLAS for C++ DGEMM # @@ -86,10 +88,10 @@ CBLASFLAG=-DACCELERATE -framework Accelerate # CUDA flags # # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander -NVCC=/opt/llvm/cocl/bin/cocl +#NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA 
CUDA -#NVCC=nvcc -CUDAFLAGS=-g -O3 -std=c++11 +NVCC=nvcc +CUDAFLAGS=-g -O3 -std=c++11 -arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # From 79637063cfac0123c05853a232dd64ae811848a1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 11 May 2018 22:06:24 -0700 Subject: [PATCH 088/245] nstream needs to use size_t loop index --- Cxx11/nstream-vector-pstl.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc index dbc52aaf4..852db5de5 100644 --- a/Cxx11/nstream-vector-pstl.cc +++ b/Cxx11/nstream-vector-pstl.cc @@ -126,9 +126,10 @@ int main(int argc, char * argv[]) { #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (int i) { + std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (size_t i) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) +#warning GNU parallel __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (size_t i) { #else std::for_each( std::begin(range), std::end(range), [&] (size_t i) { @@ -143,7 +144,7 @@ int main(int argc, char * argv[]) if (iter==1) nstream_time = prk::wtime(); #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (int i) { + std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (size_t i) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (size_t i) { From bcb65326ea1b590acd6b97880898c626d3dbcf88 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 11 May 2018 22:12:55 -0700 Subject: [PATCH 
089/245] GCC 8 released [ci skip] --- common/make.defs.gcc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index ad1f2fcf2..586cec08c 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -4,7 +4,7 @@ # # Base compilers and language options # -VERSION=-7 +VERSION=-8 # C99 is required in some implementations. CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt @@ -73,6 +73,7 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +#PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} ${RANGEFLAG} PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/gcc KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} From 292029c2f419283f392dd45c4b9b5fdc9954ca28 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 11 May 2018 22:13:31 -0700 Subject: [PATCH 090/245] update C++ support matrix in README [ci skip] --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 871ea9272..03fedd850 100644 --- a/README.md +++ b/README.md @@ -92,12 +92,14 @@ f = see footnotes | SYCL | | y | y | y | | | | Boost.Compute | | | | y | | | | Parallel STL | y | y | y | y | | | -| TBB | i | y | y | y | | | +| Thrust | y | | | | | | +| TBB | y | y | y | y | | | | Kokkos | y | y | y | y | | | | RAJA | y | y | y | y | | | | CUDA | i | y | y | y | | | -| CUBLAS | | | y | y | | | +| CUBLAS | | | y | y | | y | | CBLAS | | | | | | y | +| OpenACC | y | | | | | | * [SYCL](http://sycl.tech/) * [Boost.Compute](http://boostorg.github.io/compute/) From c3fd2463e02c97c93651fcb854ff1d06c5301bb8 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 11 May 2018 22:31:14 -0700 Subject: [PATCH 091/245] p2p scalar references (#343) * apply 
scalar optimizations to p2p Intel Fortran turns array references into scalars. Intel C/C++ will do it for 2D arrays but that's tricky here without VLAs. The optimization can be performed by hand, which is what this commit is. Thanks to Martyn Corden of Intel for analyzing this issue and providing the improved implementation. * extra p2p kernel into seperate tile * need static for inline to link --- C1z/p2p-kernel.h | 33 ++++++++++++++ C1z/p2p-simd-openmp.c | 14 +----- C1z/p2p-tasks-openmp.c | 12 +---- C1z/p2p.c | 12 +---- Cxx11/p2p-hyperplane-sycl.cc | 27 +---------- Cxx11/p2p-hyperplane-vector-openmp.cc | 29 +----------- Cxx11/p2p-hyperplane-vector-ornlacc.cc | 3 +- Cxx11/p2p-hyperplane-vector-pstl.cc | 29 +----------- Cxx11/p2p-hyperplane-vector-tbb.cc | 29 +----------- Cxx11/p2p-kernel.h | 63 ++++++++++++++++++++++++++ Cxx11/p2p-tasks-openmp.cc | 12 +---- Cxx11/p2p-tasks-tbb.cc | 12 +---- Cxx11/p2p-vector.cc | 20 ++++---- 13 files changed, 118 insertions(+), 177 deletions(-) create mode 100644 C1z/p2p-kernel.h create mode 100644 Cxx11/p2p-kernel.h diff --git a/C1z/p2p-kernel.h b/C1z/p2p-kernel.h new file mode 100644 index 000000000..ef2ea082a --- /dev/null +++ b/C1z/p2p-kernel.h @@ -0,0 +1,33 @@ +#if 1 + +static inline void sweep_tile(int startm, int endm, + int startn, int endn, + int n, double * restrict grid) +{ + for (int i=startm; i & grid) -{ - for (auto i=startm; i & grid) -{ - for (auto i=2; i<=2*n-2; i++) { - for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) { - const auto x = i-j+1; - const auto y = j-1; - grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; - } - } -} -#endif +#include "p2p-kernel.h" int main(int argc, char* argv[]) { diff --git a/Cxx11/p2p-hyperplane-vector-openmp.cc b/Cxx11/p2p-hyperplane-vector-openmp.cc index 4a3f317ae..6bd35bcfe 100644 --- a/Cxx11/p2p-hyperplane-vector-openmp.cc +++ b/Cxx11/p2p-hyperplane-vector-openmp.cc @@ -60,32 +60,7 @@ 
////////////////////////////////////////////////////////////////////// #include "prk_util.h" - -inline void sweep_tile_sequential(int startm, int endm, - int startn, int endn, - int n, double grid[]) -{ - for (auto i=startm; i & grid) -{ - for (auto i=startm; i & grid) -{ - for (auto i=2; i<=2*n-2; i++) { - for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) { - const auto x = i-j+1; - const auto y = j-1; - grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; - } - } -} -#endif +#include "p2p-kernel.h" int main(int argc, char* argv[]) { @@ -190,7 +165,7 @@ int main(int argc, char* argv[]) #endif const int ib = nc*(i-j)+1; const int jb = nc*(j-2)+1; - sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid); + sweep_tile(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid); }); } } diff --git a/Cxx11/p2p-hyperplane-vector-tbb.cc b/Cxx11/p2p-hyperplane-vector-tbb.cc index 6c4ad9aac..863580a0c 100644 --- a/Cxx11/p2p-hyperplane-vector-tbb.cc +++ b/Cxx11/p2p-hyperplane-vector-tbb.cc @@ -60,32 +60,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" - -inline void sweep_tile_sequential(int startm, int endm, - int startn, int endn, - int n, std::vector & grid) -{ - for (auto i=startm; i & grid) -{ - for (auto i=2; i<=2*n-2; i++) { - for (auto j=std::max(2,i-n+2); j<=std::min(i,n); j++) { - const auto x = i-j+1; - const auto y = j-1; - grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; - } - } -} -#endif +#include "p2p-kernel.h" int main(int argc, char* argv[]) { @@ -176,7 +151,7 @@ int main(int argc, char* argv[]) tbb::parallel_for( std::max(2,i-(nb+1)+2), std::min(i,nb+1)+1, [=,&grid](int j) { const int ib = nc*(i-j)+1; const int jb = nc*(j-2)+1; - sweep_tile_sequential(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid); + sweep_tile(ib, std::min(n,ib+nc), jb, std::min(n,jb+nc), n, grid); }); } } diff --git a/Cxx11/p2p-kernel.h b/Cxx11/p2p-kernel.h new file mode 
100644 index 000000000..f402eba37 --- /dev/null +++ b/Cxx11/p2p-kernel.h @@ -0,0 +1,63 @@ +#define RESTRICT __restrict__ + +#if 1 + +inline void sweep_tile(int startm, int endm, + int startn, int endn, + int n, double * RESTRICT grid) +{ + for (int i=startm; i & grid) +{ + for (auto i=startm; i & grid) +{ + for (int i=startm; i Date: Thu, 17 May 2018 08:42:14 -0700 Subject: [PATCH 092/245] thrust has nstream, not p2p --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 03fedd850..8bd22e28a 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,7 @@ f = see footnotes | SYCL | | y | y | y | | | | Boost.Compute | | | | y | | | | Parallel STL | y | y | y | y | | | -| Thrust | y | | | | | | +| Thrust | | | | y | | | | TBB | y | y | y | y | | | | Kokkos | y | y | y | y | | | | RAJA | y | y | y | y | | | From d8473d71f9960233ed08d74570a67e8c56d65b8a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 21 May 2018 10:01:07 -0700 Subject: [PATCH 093/245] Thrust nstream fix (#346) add device sync --- Cxx11/nstream-device-thrust.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cxx11/nstream-device-thrust.cu b/Cxx11/nstream-device-thrust.cu index 7ea49dabd..13cd1a4e5 100644 --- a/Cxx11/nstream-device-thrust.cu +++ b/Cxx11/nstream-device-thrust.cu @@ -63,6 +63,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_cuda.h" int main(int argc, char * argv[]) { @@ -134,6 +135,7 @@ int main(int argc, char * argv[]) thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())), thrust::make_zip_iterator(thrust::make_tuple(A.end() , B.end() , C.end())), nstream); + prk::CUDA::check( cudaDeviceSynchronize() ); } nstream_time = prk::wtime() - nstream_time; } From 23149c19cc873a1458c77f1cf101bf3ab62d0e1f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 22 May 2018 08:24:12 -0700 Subject: [PATCH 094/245] remove std::transform (#344) * remove 
std::transform * preserve access pattern --- Cxx11/stencil-vector-openmp.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/Cxx11/stencil-vector-openmp.cc b/Cxx11/stencil-vector-openmp.cc index 98343a798..5f5e59f42 100644 --- a/Cxx11/stencil-vector-openmp.cc +++ b/Cxx11/stencil-vector-openmp.cc @@ -204,22 +204,17 @@ int main(int argc, char* argv[]) // Apply the stencil operator stencil(n, tile_size, in, out); // Add constant to solution to force refresh of neighbor data, if any -#ifdef _OPENMP OMP_FOR( collapse(2) ) for (auto it=0; it Date: Wed, 23 May 2018 06:18:44 -0700 Subject: [PATCH 095/245] split C++ headers (#349) * refactor headers * remove unnecessary preprocessor token * relocate preprocess tokens * update make.def examples * further localize include files where needed --- .gitignore | 6 + Cxx11/Makefile | 18 +- Cxx11/dgemm-cblas.cc | 2 +- Cxx11/dgemm-cublas.cu | 2 +- Cxx11/dgemm-vector.cc | 2 +- Cxx11/nstream-host-thrust.cc | 1 + Cxx11/nstream-kokkos.cc | 1 + Cxx11/nstream-occa.cc | 2 + Cxx11/nstream-sycl.cc | 2 + Cxx11/nstream-valarray-boost-compute.cc | 2 + Cxx11/nstream-vector-boost-compute.cc | 2 +- Cxx11/nstream-vector-pstl.cc | 1 + Cxx11/nstream-vector-raja.cc | 1 + Cxx11/nstream-vector-tbb.cc | 1 + Cxx11/p2p-hyperplane-sycl.cc | 2 + Cxx11/p2p-hyperplane-vector-openmp.cc | 1 + Cxx11/p2p-hyperplane-vector-pstl.cc | 1 + Cxx11/p2p-hyperplane-vector-tbb.cc | 1 + Cxx11/p2p-innerloop-vector-tbb.cc | 1 + Cxx11/p2p-tasks-tbb.cc | 4 +- Cxx11/p2p-vector-raja.cc | 1 + Cxx11/p2p-vector-tbb.cc | 1 + Cxx11/prk_kokkos.h | 41 +++++ Cxx11/prk_openmp.h | 94 ++++++++++ Cxx11/prk_pstl.h | 52 ++++++ Cxx11/prk_raja.h | 40 +++++ Cxx11/prk_ranges.h | 74 ++++++++ Cxx11/prk_simd.h | 52 ++++++ Cxx11/prk_tbb.h | 55 ++++++ Cxx11/prk_thrust.h | 50 ++++++ Cxx11/prk_util.h | 218 +++--------------------- Cxx11/stencil-kokkos.cc | 1 + Cxx11/stencil-sycl.cc | 2 + Cxx11/stencil-vector-pstl.cc | 1 + Cxx11/stencil-vector-raja.cc | 1 + Cxx11/stencil-vector-tbb.cc 
| 1 + Cxx11/transpose-host-thrust.cc | 1 + Cxx11/transpose-kokkos.cc | 1 + Cxx11/transpose-occa.cc | 2 + Cxx11/transpose-sycl.cc | 2 + Cxx11/transpose-vector-pstl.cc | 1 + Cxx11/transpose-vector-raja.cc | 1 + Cxx11/transpose-vector-tbb.cc | 1 + common/make.defs.cray | 4 + common/make.defs.gcc | 14 +- common/make.defs.intel | 23 +-- common/make.defs.llvm | 19 ++- 47 files changed, 569 insertions(+), 237 deletions(-) create mode 100644 Cxx11/prk_kokkos.h create mode 100644 Cxx11/prk_openmp.h create mode 100644 Cxx11/prk_pstl.h create mode 100644 Cxx11/prk_raja.h create mode 100644 Cxx11/prk_ranges.h create mode 100644 Cxx11/prk_simd.h create mode 100644 Cxx11/prk_tbb.h create mode 100644 Cxx11/prk_thrust.h diff --git a/.gitignore b/.gitignore index 6cd0e154b..91cb027fc 100644 --- a/.gitignore +++ b/.gitignore @@ -112,6 +112,12 @@ C1z/transpose-openmp C1z/transpose-target C1z/transpose-taskloop C1z/transpose-ispc +Cxx11/boost +Cxx11/compute +Cxx11/triSYCL +Cxx11/occa +Cxx11/pstl +Cxx11/range-v3 Cxx11/dgemm-vector Cxx11/dgemm-cblas Cxx11/dgemm-cublas diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 07be4fa0b..652c423ba 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -31,22 +31,22 @@ endif #ASMFLAGS = -fsource-asm -fverbose-asm -fasm-blocks -fcode-asm ASMFLAGS = -fverbose-asm -OMPFLAGS = $(OPENMPFLAG) +OMPFLAGS = $(OPENMPFLAG) -DUSE_OPENMP TARGETFLAGS = $(OFFLOADFLAG) OPENCLFLAGS = $(OPENCLFLAG) # We do not yet handle all possible exceptions... 
#OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS -SYCLFLAGS = $(SYCLFLAG) -DUSE_2D_INDEXING=0 +SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 ORNLACCFLAGS = $(ORNLACCFLAG) -TBBFLAGS = $(TBBFLAG) -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER) +TBBFLAGS = $(TBBFLAG) -DUSE_TBB -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER) CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG) -BOOSTFLAGS = $(BOOSTFLAG) -RANGEFLAGS = -DUSE_RANGES $(RANGEFLAG) +BOOSTFLAGS = $(BOOSTFLAG) -DUSE_BOOST +RANGEFLAGS = $(RANGEFLAG) -DUSE_RANGES STLFLAGS = $(STLFLAG) $(RANGEFLAGS) -PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -RAJAFLAGS = $(RAJAFLAG) +PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL +RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST -KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) +KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS ORNLACCFLAGS = $(ORNLACCFLAG) ifdef OCCADIR @@ -170,7 +170,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(RANGEFLAGS) -o $@ %-boost-compute: %-boost-compute.cc prk_util.h - $(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) -DUSE_BOOST_COMPUTE $(OPENCLFLAGS) -o $@ + $(CXX) $(CXXFLAGS) $< $(BOOSTFLAGS) $(OPENCLFLAGS) -o $@ %-raja: %-raja.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc index cb0e44f51..8390b7c11 100644 --- a/Cxx11/dgemm-cblas.cc +++ b/Cxx11/dgemm-cblas.cc @@ -302,7 +302,7 @@ int main(int argc, char * argv[]) const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1); double residuum(0); for (int b=0; b(order); const auto reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1); - const auto checksum = prk_reduce(C.begin(), C.end(), 0.0); + const auto checksum = prk::reduce(C.begin(), C.end(), 0.0); const auto epsilon = 1.0e-8; const auto residuum = std::abs(checksum-reference)/reference; diff --git 
a/Cxx11/nstream-host-thrust.cc b/Cxx11/nstream-host-thrust.cc index 5bc29d145..c06c89108 100644 --- a/Cxx11/nstream-host-thrust.cc +++ b/Cxx11/nstream-host-thrust.cc @@ -63,6 +63,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_thrust.h" int main(int argc, char * argv[]) { diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc index 7e468abcf..d03a47207 100644 --- a/Cxx11/nstream-kokkos.cc +++ b/Cxx11/nstream-kokkos.cc @@ -63,6 +63,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_kokkos.h" // We build with OpenMP unless it is not available... #ifndef PRK_KOKKOS_BACKEND diff --git a/Cxx11/nstream-occa.cc b/Cxx11/nstream-occa.cc index 10b0b47fd..6d584e893 100644 --- a/Cxx11/nstream-occa.cc +++ b/Cxx11/nstream-occa.cc @@ -61,6 +61,8 @@ /// ////////////////////////////////////////////////////////////////////// +#include "occa.hpp" + #include "prk_util.h" int main(int argc, char * argv[]) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index c5d390341..2193d4811 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -62,6 +62,8 @@ /// ////////////////////////////////////////////////////////////////////// +#include "CL/sycl.hpp" + #include "prk_util.h" int main(int argc, char * argv[]) diff --git a/Cxx11/nstream-valarray-boost-compute.cc b/Cxx11/nstream-valarray-boost-compute.cc index 50c54846f..da3ded46f 100644 --- a/Cxx11/nstream-valarray-boost-compute.cc +++ b/Cxx11/nstream-valarray-boost-compute.cc @@ -62,6 +62,8 @@ /// ////////////////////////////////////////////////////////////////////// +#include "boost/compute.hpp" + #include "prk_util.h" namespace compute = boost::compute; diff --git a/Cxx11/nstream-vector-boost-compute.cc b/Cxx11/nstream-vector-boost-compute.cc index fec24fbbf..619c02374 100644 --- a/Cxx11/nstream-vector-boost-compute.cc +++ b/Cxx11/nstream-vector-boost-compute.cc @@ -62,7 +62,7 @@ /// 
////////////////////////////////////////////////////////////////////// -#define LAMBDA_MAKE_TUPLE 1 +#include "boost/compute.hpp" #include "prk_util.h" diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc index 852db5de5..9eb18c78f 100644 --- a/Cxx11/nstream-vector-pstl.cc +++ b/Cxx11/nstream-vector-pstl.cc @@ -63,6 +63,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_pstl.h" // See ParallelSTL.md for important information. diff --git a/Cxx11/nstream-vector-raja.cc b/Cxx11/nstream-vector-raja.cc index 31c6434e7..62f92832f 100644 --- a/Cxx11/nstream-vector-raja.cc +++ b/Cxx11/nstream-vector-raja.cc @@ -63,6 +63,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_raja.h" #if defined(RAJA_ENABLE_OPENMP) typedef RAJA::omp_parallel_for_exec thread_exec; diff --git a/Cxx11/nstream-vector-tbb.cc b/Cxx11/nstream-vector-tbb.cc index cb73d3eda..0fbc777c2 100644 --- a/Cxx11/nstream-vector-tbb.cc +++ b/Cxx11/nstream-vector-tbb.cc @@ -63,6 +63,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_tbb.h" int main(int argc, char * argv[]) { diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc index 68e7fc712..a738beffa 100644 --- a/Cxx11/p2p-hyperplane-sycl.cc +++ b/Cxx11/p2p-hyperplane-sycl.cc @@ -59,6 +59,8 @@ /// ////////////////////////////////////////////////////////////////////// +#include "CL/sycl.hpp" + #include "prk_util.h" #include "p2p-kernel.h" diff --git a/Cxx11/p2p-hyperplane-vector-openmp.cc b/Cxx11/p2p-hyperplane-vector-openmp.cc index 6bd35bcfe..471ce336c 100644 --- a/Cxx11/p2p-hyperplane-vector-openmp.cc +++ b/Cxx11/p2p-hyperplane-vector-openmp.cc @@ -60,6 +60,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_openmp.h" #include "p2p-kernel.h" int main(int argc, char* argv[]) diff 
--git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc index 3e9030f33..e17412ac2 100644 --- a/Cxx11/p2p-hyperplane-vector-pstl.cc +++ b/Cxx11/p2p-hyperplane-vector-pstl.cc @@ -60,6 +60,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_pstl.h" #include "p2p-kernel.h" int main(int argc, char* argv[]) diff --git a/Cxx11/p2p-hyperplane-vector-tbb.cc b/Cxx11/p2p-hyperplane-vector-tbb.cc index 863580a0c..9c523a369 100644 --- a/Cxx11/p2p-hyperplane-vector-tbb.cc +++ b/Cxx11/p2p-hyperplane-vector-tbb.cc @@ -60,6 +60,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_tbb.h" #include "p2p-kernel.h" int main(int argc, char* argv[]) diff --git a/Cxx11/p2p-innerloop-vector-tbb.cc b/Cxx11/p2p-innerloop-vector-tbb.cc index 2bff51f15..1f58ab081 100644 --- a/Cxx11/p2p-innerloop-vector-tbb.cc +++ b/Cxx11/p2p-innerloop-vector-tbb.cc @@ -60,6 +60,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_tbb.h" int main(int argc, char* argv[]) { diff --git a/Cxx11/p2p-tasks-tbb.cc b/Cxx11/p2p-tasks-tbb.cc index c19a55d44..266c87ad6 100644 --- a/Cxx11/p2p-tasks-tbb.cc +++ b/Cxx11/p2p-tasks-tbb.cc @@ -61,11 +61,9 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_tbb.h" #include "p2p-kernel.h" -#include "tbb/flow_graph.h" -#include "tbb/parallel_for.h" - int main(int argc, char* argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; diff --git a/Cxx11/p2p-vector-raja.cc b/Cxx11/p2p-vector-raja.cc index 7dfeea21d..00164aa94 100644 --- a/Cxx11/p2p-vector-raja.cc +++ b/Cxx11/p2p-vector-raja.cc @@ -60,6 +60,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_raja.h" int main(int argc, char* argv[]) { diff --git a/Cxx11/p2p-vector-tbb.cc 
b/Cxx11/p2p-vector-tbb.cc index bcc45b27b..74cf57819 100644 --- a/Cxx11/p2p-vector-tbb.cc +++ b/Cxx11/p2p-vector-tbb.cc @@ -60,6 +60,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_tbb.h" void SequentialSweep(int m, int n, std::vector & grid) { diff --git a/Cxx11/prk_kokkos.h b/Cxx11/prk_kokkos.h new file mode 100644 index 000000000..760ae91d7 --- /dev/null +++ b/Cxx11/prk_kokkos.h @@ -0,0 +1,41 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_KOKKOS_H +#define PRK_KOKKOS_H + +#ifdef USE_KOKKOS +# include +# include +# include +#endif + +#endif /* PRK_KOKKOS_H */ diff --git a/Cxx11/prk_openmp.h b/Cxx11/prk_openmp.h new file mode 100644 index 000000000..4d6396b9b --- /dev/null +++ b/Cxx11/prk_openmp.h @@ -0,0 +1,94 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_OPENMP_H +#define PRK_OPENMP_H + +#define PRAGMA(x) _Pragma(#x) + +#ifdef _OPENMP +# include +# define OMP(x) PRAGMA(omp x) +# define OMP_PARALLEL(x) PRAGMA(omp parallel x) +# define OMP_PARALLEL_FOR_REDUCE(x) PRAGMA(omp parallel for reduction (x) ) +# define OMP_MASTER PRAGMA(omp master) +# define OMP_BARRIER PRAGMA(omp barrier) +# define OMP_FOR(x) PRAGMA(omp for x) +# define OMP_FOR_REDUCE(x) PRAGMA(omp for reduction (x) ) +// OpenMP SIMD if supported, else not. 
+# if (_OPENMP >= 201300) +# define OMP_SIMD PRAGMA(omp simd) +# define OMP_FOR_SIMD PRAGMA(omp for simd) +# define OMP_TASK(x) PRAGMA(omp task x) +# define OMP_TASKLOOP(x) PRAGMA(omp taskloop x ) +# if defined(__INTEL_COMPILER) +# define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop x ) +# else +# define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop collapse(n) x ) +# endif +# define OMP_TASKWAIT PRAGMA(omp taskwait) +# define OMP_ORDERED(x) PRAGMA(omp ordered x) +# define OMP_TARGET(x) PRAGMA(omp target x) +# define OMP_DECLARE_TARGET PRAGMA(omp declare target) +# define OMP_END_DECLARE_TARGET PRAGMA(omp end declare target) +# else +# define OMP_SIMD +# define OMP_FOR_SIMD PRAGMA(omp for) +# define OMP_TASK(x) +# define OMP_TASKLOOP(x) +# define OMP_TASKLOOP_COLLAPSE(n,x) +# define OMP_TASKWAIT +# define OMP_ORDERED(x) +# define OMP_TARGET(x) +# define OMP_DECLARE_TARGET +# define OMP_END_DECLARE_TARGET +# endif +#else +# define OMP(x) +# define OMP_PARALLEL(x) +# define OMP_PARALLEL_FOR_REDUCE(x) +# define OMP_MASTER +# define OMP_BARRIER +# define OMP_FOR(x) +# define OMP_FOR_REDUCE(x) +# define OMP_SIMD +# define OMP_FOR_SIMD +# define OMP_TASK(x) +# define OMP_TASKLOOP(x) +# define OMP_TASKLOOP_COLLAPSE(n,x) +# define OMP_TASKWAIT +# define OMP_ORDERED(x) +# define OMP_TARGET(x) +# define OMP_DECLARE_TARGET +# define OMP_END_DECLARE_TARGET +#endif + +#endif /* PRK_OPENMP_H */ diff --git a/Cxx11/prk_pstl.h b/Cxx11/prk_pstl.h new file mode 100644 index 000000000..5c89d765f --- /dev/null +++ b/Cxx11/prk_pstl.h @@ -0,0 +1,52 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
+/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +#ifndef PRK_PSTL_H +#define PRK_PSTL_H + +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1800) +#define USE_INTEL_PSTL +#endif + +#ifdef USE_PSTL +# ifdef USE_INTEL_PSTL +# include +# include +# include +# include +# elif defined(__GNUC__) && defined(__GNUC_MINOR__) && \ + ( (__GNUC__ >= 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) +# include +# include +# endif +#endif + +#endif /* PRK_PSTL_H */ diff --git a/Cxx11/prk_raja.h b/Cxx11/prk_raja.h new file mode 100644 index 000000000..fb0bb25b8 --- /dev/null +++ b/Cxx11/prk_raja.h @@ -0,0 +1,40 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_RAJA_H +#define PRK_RAJA_H + +#ifdef USE_RAJA +# define RAJA_ENABLE_NESTED 1 +# include "RAJA/RAJA.hpp" +#endif + +#endif /* PRK_RAJA_H */ diff --git a/Cxx11/prk_ranges.h b/Cxx11/prk_ranges.h new file mode 100644 index 000000000..d794016ff --- /dev/null +++ b/Cxx11/prk_ranges.h @@ -0,0 +1,74 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_RANGES_H +#define PRK_RANGES_H + +#if defined(USE_RANGES) +# if defined(USE_RANGES_IRANGE) +# include "boost/range/irange.hpp" +# elif defined(USE_RANGES_TS) +# include "range/v3/view/iota.hpp" +# include "range/v3/view/slice.hpp" +# include "range/v3/view/stride.hpp" +# else +# error You have not provided a version of ranges to use. +# endif +#endif + +namespace prk { + + template + auto range(S start, E end) { +#if defined(USE_BOOST_IRANGE) + return boost::irange(static_cast(start), end); +#elif defined(USE_RANGES_TS) + return ranges::view::iota(static_cast(start), end); +#endif + } + + template + auto range(S start, E end, B blocking) { +#if defined(USE_BOOST_IRANGE) + return boost::irange(static_cast(start), end, static_cast(blocking) ); +#elif defined(USE_RANGES_TS) + // NOTE: + // iota(s) | slice(s,e) | stride(b) is faster than + // iota(s,e) | stride(b) for some reason. 
+ return ranges::view::iota(static_cast(start)) | + ranges::view::slice(static_cast(start), end) | + ranges::view::stride(static_cast(blocking)); +#endif + } + +} // namespace prk + +#endif /* PRK_RANGES_H */ diff --git a/Cxx11/prk_simd.h b/Cxx11/prk_simd.h new file mode 100644 index 000000000..742bc4fcb --- /dev/null +++ b/Cxx11/prk_simd.h @@ -0,0 +1,52 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +#ifndef PRK_SIMD_H +#define PRK_SIMD_H + +#define PRAGMA(x) _Pragma(#x) + +#if defined(__INTEL_COMPILER) +# define PRAGMA_SIMD PRAGMA(vector) PRAGMA(ivdep) +// According to https://github.com/LLNL/RAJA/pull/310, this improves lambda performance +# define PRAGMA_INLINE PRAGMA(forceinline recursive) +#elif defined(__GNUC__) && defined(__GNUC_MINOR__) && ( ( (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) ) || (__GNUC__ >= 5) ) +# define PRAGMA_SIMD PRAGMA(GCC ivdep) +# define PRAGMA_INLINE PRAGMA(inline) +#elif defined(__clang__) +# define PRAGMA_SIMD PRAGMA(clang loop vectorize(assume_safety)) +# define PRAGMA_INLINE +#else +# define PRAGMA_SIMD +# define PRAGMA_INLINE +#endif + +#endif /* PRK_SIMD_H */ diff --git a/Cxx11/prk_tbb.h b/Cxx11/prk_tbb.h new file mode 100644 index 000000000..86abfd5c9 --- /dev/null +++ b/Cxx11/prk_tbb.h @@ -0,0 +1,55 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_TBB_H +#define PRK_TBB_H + +#ifdef USE_TBB +# include +# include +# include +# include +# if ( PRK_TBB_PARTITIONER == 1) +//# warning STATIC + tbb::static_partitioner tbb_partitioner; +# elif ( PRK_TBB_PARTITIONER == 2) +//# warning AFFINITY + tbb::affinity_partitioner tbb_partitioner; +# elif ( PRK_TBB_PARTITIONER == 3) +//# warning SIMPLE + tbb::simple_partitioner tbb_partitioner; +# else +//# warning AUTO + tbb::auto_partitioner tbb_partitioner; +# endif +#endif + +#endif /* PRK_TBB_H */ diff --git a/Cxx11/prk_thrust.h b/Cxx11/prk_thrust.h new file mode 100644 index 000000000..4ffd50c34 --- /dev/null +++ b/Cxx11/prk_thrust.h @@ -0,0 +1,50 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. 
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +#ifndef PRK_THRUST_H +#define PRK_THRUST_H + +#ifdef USE_THRUST +# ifdef __NVCC__ +# include +# endif +# include +# include +# include +# include +# include +# include +# include +# include +# include +#endif + +#endif /* PRK_THRUST_H */ diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 76363d8e2..2c0be683f 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -64,22 +64,6 @@ #include #include -template -const T prk_reduce(I first, I last, T init) { -#if (defined(__cplusplus) && (__cplusplus >= 201703L)) && !defined(__GNUC__) - return std::reduce(first, last, init); -#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) - return std::accumulate(first, last, init); -#else - // unreachable, but preserved as reference implementation - T r(0); - for (I i=first; i!=last; ++i) { - r += *i; - } - return r; -#endif -} - // These headers are busted with NVCC and GCC 5.4.0 // The header is busted with Cray C++ 8.6.1. 
#if !defined(__NVCC__) && !defined(_CRAYC) @@ -87,174 +71,39 @@ const T prk_reduce(I first, I last, T init) { #include #endif -#define PRAGMA(x) _Pragma(#x) +#include "prk_simd.h" -#ifdef _OPENMP -# include -# define OMP(x) PRAGMA(omp x) -# define OMP_PARALLEL(x) PRAGMA(omp parallel x) -# define OMP_PARALLEL_FOR_REDUCE(x) PRAGMA(omp parallel for reduction (x) ) -# define OMP_MASTER PRAGMA(omp master) -# define OMP_BARRIER PRAGMA(omp barrier) -# define OMP_FOR(x) PRAGMA(omp for x) -# define OMP_FOR_REDUCE(x) PRAGMA(omp for reduction (x) ) -// OpenMP SIMD if supported, else not. -# if (_OPENMP >= 201300) -# define OMP_SIMD PRAGMA(omp simd) -# define OMP_FOR_SIMD PRAGMA(omp for simd) -# define OMP_TASK(x) PRAGMA(omp task x) -# define OMP_TASKLOOP(x) PRAGMA(omp taskloop x ) -# if defined(__INTEL_COMPILER) -# define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop x ) -# else -# define OMP_TASKLOOP_COLLAPSE(n,x) PRAGMA(omp taskloop collapse(n) x ) -# endif -# define OMP_TASKWAIT PRAGMA(omp taskwait) -# define OMP_ORDERED(x) PRAGMA(omp ordered x) -# define OMP_TARGET(x) PRAGMA(omp target x) -# define OMP_DECLARE_TARGET PRAGMA(omp declare target) -# define OMP_END_DECLARE_TARGET PRAGMA(omp end declare target) -# else -# define OMP_SIMD -# define OMP_FOR_SIMD PRAGMA(omp for) -# define OMP_TASK(x) -# define OMP_TASKLOOP(x) -# define OMP_TASKLOOP_COLLAPSE(n,x) -# define OMP_TASKWAIT -# define OMP_ORDERED(x) -# define OMP_TARGET(x) -# define OMP_DECLARE_TARGET -# define OMP_END_DECLARE_TARGET -# endif -#else -# define OMP(x) -# define OMP_PARALLEL(x) -# define OMP_PARALLEL_FOR_REDUCE(x) -# define OMP_MASTER -# define OMP_BARRIER -# define OMP_FOR(x) -# define OMP_FOR_REDUCE(x) -# define OMP_SIMD -# define OMP_FOR_SIMD -# define OMP_TASK(x) -# define OMP_TASKLOOP(x) -# define OMP_TASKLOOP_COLLAPSE(n,x) -# define OMP_TASKWAIT -# define OMP_ORDERED(x) -# define OMP_TARGET(x) -# define OMP_DECLARE_TARGET -# define OMP_END_DECLARE_TARGET +#ifdef USE_RANGES +# include "prk_ranges.h" 
#endif -#if defined(__INTEL_COMPILER) -# define PRAGMA_SIMD PRAGMA(vector) PRAGMA(ivdep) -// According to https://github.com/LLNL/RAJA/pull/310, this improves lambda performance -# define PRAGMA_INLINE PRAGMA(forceinline recursive) -#elif defined(__GNUC__) && defined(__GNUC_MINOR__) && ( ( (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) ) || (__GNUC__ >= 5) ) -# define PRAGMA_SIMD PRAGMA(GCC ivdep) -# define PRAGMA_INLINE PRAGMA(inline) -#elif defined(__clang__) -# define PRAGMA_SIMD PRAGMA(clang loop vectorize(assume_safety)) -# define PRAGMA_INLINE -#else -# define PRAGMA_SIMD -# define PRAGMA_INLINE -#endif - -#ifdef USE_TBB -# include -# include -# include -# if ( PRK_TBB_PARTITIONER == 1) -//# warning STATIC - tbb::static_partitioner tbb_partitioner; -# elif ( PRK_TBB_PARTITIONER == 2) -//# warning AFFINITY - tbb::affinity_partitioner tbb_partitioner; -# elif ( PRK_TBB_PARTITIONER == 3) -//# warning SIMPLE - tbb::simple_partitioner tbb_partitioner; -# else -//# warning AUTO - tbb::auto_partitioner tbb_partitioner; -# endif -#endif - -#if defined(USE_RANGES) -# if defined(USE_BOOST_IRANGE) -# include "boost/range/irange.hpp" -# elif defined(USE_RANGES_TS) -# include "range/v3/view/iota.hpp" -# include "range/v3/view/slice.hpp" -# include "range/v3/view/stride.hpp" -# else -# error You have not provided a version of ranges to use. 
-# endif -#endif - -#if defined(USE_BOOST_COMPUTE) -# include "boost/compute.hpp" -# include "boost/compute/container/valarray.hpp" -#endif - -#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1800) -#define USE_INTEL_PSTL -#endif - -#ifdef USE_PSTL -# ifdef USE_INTEL_PSTL -# include -# include -# include -# include -# elif defined(__GNUC__) && defined(__GNUC_MINOR__) && \ - ( (__GNUC__ >= 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) -# include -# include -# endif -#endif - -#ifdef USE_KOKKOS -# include -# include -# include -#endif - -#ifdef USE_RAJA -# define RAJA_ENABLE_NESTED 1 -# include "RAJA/RAJA.hpp" -#endif - -#ifdef USE_THRUST -# ifdef __NVCC__ -# include -# endif -# include -# include -# include -# include -# include -# include -# include -# include -# include -#endif - -#ifdef USE_SYCL -# include "CL/sycl.hpp" -#endif - -#ifdef USE_OCCA -# include "occa.hpp" +#ifdef USE_OPENMP +# include "prk_openmp.h" #endif #define RESTRICT __restrict__ namespace prk { + template + const T reduce(I first, I last, T init) { +#if (defined(__cplusplus) && (__cplusplus >= 201703L)) && !defined(__GNUC__) + return std::reduce(first, last, init); +#elif (defined(__cplusplus) && (__cplusplus >= 201103L)) + return std::accumulate(first, last, init); +#else + // unreachable, but preserved as reference implementation + T r(0); + for (I i=first; i!=last; ++i) { + r += *i; + } + return r; +#endif + } + static inline double wtime(void) { -#ifdef _OPENMP +#if defined(USE_OPENMP) && defined(_OPENMP) return omp_get_wtime(); #else using t = std::chrono::high_resolution_clock; @@ -271,29 +120,6 @@ namespace prk { return ( numerator / denominator + (numerator % denominator > 0) ); } - template - auto range(S start, E end) { -#if defined(USE_BOOST_IRANGE) - return boost::irange(static_cast(start), end); -#elif defined(USE_RANGES_TS) - return ranges::view::iota(static_cast(start), end); -#endif - } - - template - auto range(S start, E end, B blocking) { -#if 
defined(USE_BOOST_IRANGE) - return boost::irange(static_cast(start), end, static_cast(blocking) ); -#elif defined(USE_RANGES_TS) - // NOTE: - // iota(s) | slice(s,e) | stride(b) is faster than - // iota(s,e) | stride(b) for some reason. - return ranges::view::iota(static_cast(start)) | - ranges::view::slice(static_cast(start), end) | - ranges::view::stride(static_cast(blocking)); -#endif - } - } // namespace prk #endif /* PRK_UTIL_H */ diff --git a/Cxx11/stencil-kokkos.cc b/Cxx11/stencil-kokkos.cc index d2eb5db2a..b92bd4a57 100644 --- a/Cxx11/stencil-kokkos.cc +++ b/Cxx11/stencil-kokkos.cc @@ -61,6 +61,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_kokkos.h" typedef Kokkos::View matrix; //typedef Kokkos::View matrix; diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 9989b5bdc..e42eaef50 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -60,6 +60,8 @@ /// ////////////////////////////////////////////////////////////////////// +#include "CL/sycl.hpp" + #include "prk_util.h" #include "stencil_sycl.hpp" diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc index 8495032ca..6c14800af 100644 --- a/Cxx11/stencil-vector-pstl.cc +++ b/Cxx11/stencil-vector-pstl.cc @@ -61,6 +61,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_pstl.h" // See ParallelSTL.md for important information. #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) #include "stencil_pstl.hpp" diff --git a/Cxx11/stencil-vector-raja.cc b/Cxx11/stencil-vector-raja.cc index 3bcecb4ec..cff3421f3 100644 --- a/Cxx11/stencil-vector-raja.cc +++ b/Cxx11/stencil-vector-raja.cc @@ -61,6 +61,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_raja.h" // This must be before the stencil header, which uses this. 
#ifdef RAJA_ENABLE_OPENMP diff --git a/Cxx11/stencil-vector-tbb.cc b/Cxx11/stencil-vector-tbb.cc index 2f5c27488..81a252019 100644 --- a/Cxx11/stencil-vector-tbb.cc +++ b/Cxx11/stencil-vector-tbb.cc @@ -61,6 +61,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_tbb.h" #include "stencil_tbb.hpp" void nothing(const int n, const int t, std::vector & in, std::vector & out) diff --git a/Cxx11/transpose-host-thrust.cc b/Cxx11/transpose-host-thrust.cc index 53066208b..11482700a 100644 --- a/Cxx11/transpose-host-thrust.cc +++ b/Cxx11/transpose-host-thrust.cc @@ -50,6 +50,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_thrust.h" int main(int argc, char * argv[]) { diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc index aff072f53..fa35ebb6e 100644 --- a/Cxx11/transpose-kokkos.cc +++ b/Cxx11/transpose-kokkos.cc @@ -50,6 +50,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_kokkos.h" int main(int argc, char * argv[]) { diff --git a/Cxx11/transpose-occa.cc b/Cxx11/transpose-occa.cc index 5b05b73ce..888d6a230 100644 --- a/Cxx11/transpose-occa.cc +++ b/Cxx11/transpose-occa.cc @@ -51,6 +51,8 @@ /// ////////////////////////////////////////////////////////////////////// +#include "occa.hpp" + #include "prk_util.h" int main(int argc, char * argv[]) diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index 5055374d2..1c8489806 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -49,6 +49,8 @@ /// ////////////////////////////////////////////////////////////////////// +#include "CL/sycl.hpp" + #include "prk_util.h" int main(int argc, char * argv[]) diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc index 222322bd8..f51f76e7f 100644 --- a/Cxx11/transpose-vector-pstl.cc +++ b/Cxx11/transpose-vector-pstl.cc @@ -50,6 +50,7 @@ 
////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_pstl.h" // See ParallelSTL.md for important information. diff --git a/Cxx11/transpose-vector-raja.cc b/Cxx11/transpose-vector-raja.cc index 84738694d..59b757eea 100644 --- a/Cxx11/transpose-vector-raja.cc +++ b/Cxx11/transpose-vector-raja.cc @@ -50,6 +50,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_raja.h" const int tile_size = 32; diff --git a/Cxx11/transpose-vector-tbb.cc b/Cxx11/transpose-vector-tbb.cc index 45ea4bc5b..d154677fd 100644 --- a/Cxx11/transpose-vector-tbb.cc +++ b/Cxx11/transpose-vector-tbb.cc @@ -53,6 +53,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_tbb.h" int main(int argc, char * argv[]) { diff --git a/common/make.defs.cray b/common/make.defs.cray index aee737d77..b1c59b8f4 100644 --- a/common/make.defs.cray +++ b/common/make.defs.cray @@ -27,6 +27,10 @@ ORNLACCFLAG=-h acc # NERSC: "module load boost" BOOSTFLAG=-DUSE_BOOST -DUSE_BOOST_COMPUTE -I$${BOOST_DIR}/include # +# CBLAS for C++ DGEMM +# +CBLASFLAG= # LibSci likely included by default +# # MPI # # cc wraps gcc, icc or craycc, depending on your PrgEng module. diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 586cec08c..e4eda09f5 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -31,6 +31,7 @@ DEFAULT_OPT_FLAGS+=-Wall # OpenMP flags # OPENMPFLAG=-fopenmp +OPENMPSIMDFLAG=-fopenmp-simd OFFLOADFLAG=-foffload="-O3 -v" ORNLACCFLAG=-fopenacc # @@ -48,7 +49,7 @@ OPENCLFLAG=-framework OpenCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
SYCLDIR=./triSYCL SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG} -SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include +SYCLFLAG=-I$(SYCLDIR)/include # ProGTX # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx @@ -70,17 +71,16 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. # -BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include +BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -#PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} ${RANGEFLAG} -PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/gcc -KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} RAJADIR=/opt/raja/gcc -RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust -THRUSTFLAG=-DUSE_THRUST -I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP +THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.intel b/common/make.defs.intel index 556d940b6..00a781cac 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -29,6 +29,7 @@ DEFAULT_OPT_FLAGS+=-qopt-report=5 # OpenMP flags # OPENMPFLAG=-qopenmp +OPENMPSIMDFLAG=-qopenmp-simd OFFLOADFLAG=-qopenmp-offload=host # # OpenCL flags @@ -45,7 +46,7 @@ OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
SYCLDIR=./triSYCL SYCLCXX=${CXX} ${OPENMPFLAG} -SYCLFLAG=-DUSE_SYCL -std=gnu++14 -I$(SYCLDIR)/include +SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # ProGTX # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx @@ -54,7 +55,7 @@ SYCLFLAG=-DUSE_SYCL -std=gnu++14 -I$(SYCLDIR)/include # # OCCA # -OCCADIR=${HOME}/prk-repo/Cxx11/occa +#OCCADIR=${HOME}/prk-repo/Cxx11/occa # # Cilk # @@ -62,19 +63,21 @@ CILKFLAG=-intel-extensions # default # # TBB # -TBBFLAG=-DUSE_TBB -tbb -#TBBFLAG=-DUSE_TBB -tbb_preview -DTBB_PREVIEW_FLOW_GRAPH_TRACE +TBBFLAG=-tbb +#TBBFLAG=-tbb_preview -DTBB_PREVIEW_FLOW_GRAPH_TRACE # # Parallel STL, Boost, etc. # -#BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include +BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} ${RANGEFLAG} +#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/intel -KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl RAJADIR=/opt/raja/intel -RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +#THRUSTDIR=/opt/nvidia/thrust +#THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP # # CBLAS for C++ DGEMM # @@ -85,7 +88,7 @@ CBLASFLAG=-DMKL -mkl # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander #NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA -NVCC=nvcc +NVCC=nvcc -arch=sm_50 CUDAFLAGS=-g -O3 -std=c++11 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 481a624ea..f4a54c4f8 100644 --- 
a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -34,8 +34,9 @@ DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math # OpenMP flags # OPENMPFLAG=-fopenmp +OPENMPSIMDFLAG=-fopenmp-simd OFFLOADFLAG=-fopenmp -ORNLACCFLAG= # Flang does not support OpenACC +#ORNLACCFLAG= # Flang does not support OpenACC # Klondike weirdness # OPENMPFLAG+=-L/opt/intel/compilers_and_libraries_2018.0.082/linux/compiler/lib/intel64_lin -liomp5 # Mac weirdness @@ -75,8 +76,8 @@ SYCLFLAG+=-std=c++14 # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... SYCLDIR=./triSYCL -SYCLCXX=${CXX} -std=gnu++14 ${OPENMPFLAG} -SYCLFLAG=-DUSE_SYCL -I$(SYCLDIR)/include $(DEFAULT_OPT_FLAGS) +SYCLCXX=${CXX} ${OPENMPFLAG} +SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include $(DEFAULT_OPT_FLAGS) # ProGTX # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx @@ -94,14 +95,16 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. # -BOOSTFLAG=-DUSE_BOOST -I/usr/local/Cellar/boost/1.65.1/include +BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/clang -KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl RAJADIR=/opt/raja/clang -RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +#THRUSTDIR=/opt/nvidia/thrust +#THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP # # CBLAS for C++ DGEMM # @@ -112,7 +115,7 @@ CBLASFLAG=-DACCELERATE -framework Accelerate # Mac w/ CUDA emulation 
via https://github.com/hughperkins/coriander NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA -#NVCC=nvcc +#NVCC=nvcc -arch=sm_50 CUDAFLAGS=-g -O3 -std=c++11 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED From ec0919f6503bd72d7b79731636a655e7600bf5d4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 30 May 2018 22:11:14 -0700 Subject: [PATCH 096/245] add CBLAS transpose (#350) * add CBLAS transpose - MKL and Accelerate are supported via extensions - add Travis support - add 'cblas' target to Makefile * add CBLAS transpose to docs * remove Rob from default owners so errant code review requests stop happening * remove cblas_int * fix unrelated issue with homebrew ompi --- CODEOWNERS | 4 +- Cxx11/Makefile | 2 + Cxx11/dgemm-cblas.cc | 24 +++--- Cxx11/transpose-cblas.cc | 178 +++++++++++++++++++++++++++++++++++++++ README.md | 2 +- travis/build-run-prk.sh | 3 +- travis/install-mpi.sh | 2 + 7 files changed, 199 insertions(+), 16 deletions(-) create mode 100644 Cxx11/transpose-cblas.cc diff --git a/CODEOWNERS b/CODEOWNERS index dafe2ca29..6f426a040 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,7 +2,7 @@ # Each line is a file pattern followed by one or more owners. # These owners will be the default owners for everything in the repo. -* @jeffhammond @rfvander +* @jeffhammond # Order is important. The last matching pattern has the most precedence. 
# So if a pull request only touches javascript files, only these owners @@ -15,8 +15,8 @@ Cxx11/* @jeffhammond FENIX/* @rfvander @marcgamell FG_MPI/* @rfvander FORTRAN/* @jeffhammond -GRAPPA/* @nelsonje FORTRAN/*coarray.f90 @afanfa @zbeekman @jeffhammond +GRAPPA/* @nelsonje JULIA/* @kpamnany @jeffhammond LEGION/* @magnatelee @elliottslaughter @apokayi @rfvander MPI1/* @rfvander diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 652c423ba..255c706e2 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -124,6 +124,8 @@ cuda: transpose-cuda cublas: transpose-cublas nstream-cublas dgemm-cublas +cblas: transpose-cblas dgemm-cblas + occa: transpose-occa nstream-occa ornlacc: p2p-hyperplane-vector-ornlacc diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc index 8390b7c11..61a9292fb 100644 --- a/Cxx11/dgemm-cblas.cc +++ b/Cxx11/dgemm-cblas.cc @@ -63,14 +63,14 @@ #if defined(MKL) #include -typedef MKL_INT cblas_int; +#ifdef MKL_ILP64 +#error Use the MKL library for 32-bit integers! +#endif #elif defined(ACCELERATE) // The location of cblas.h is not in the system include path when -framework Accelerate is provided. 
#include -typedef int cblas_int; #else #include -typedef int cblas_int; #endif #ifdef PRK_DEBUG @@ -95,7 +95,7 @@ void prk_dgemm(const int order, const std::vector & B, std::vector & C) { - const cblas_int n = order; + const int n = order; const double alpha = 1.0; const double beta = 1.0; @@ -108,7 +108,7 @@ void prk_dgemm(const int order, const int batches, const std::vector> & B, std::vector> & C) { - const cblas_int n = order; + const int n = order; const double alpha = 1.0; const double beta = 1.0; @@ -123,7 +123,7 @@ void prk_dgemm(const int order, const int batches, const int nt, const std::vector> & B, std::vector> & C) { - const cblas_int n = order; + const int n = order; const double alpha = 1.0; const double beta = 1.0; @@ -141,17 +141,17 @@ void prk_dgemm(const int order, const int batches, double** & B, double** & C) { - const cblas_int n = order; + const int n = order; const double alpha = 1.0; const double beta = 1.0; - const cblas_int group_count = 1; - const cblas_int group_size[group_count] = { batches }; + const int group_count = 1; + const int group_size[group_count] = { batches }; const CBLAS_TRANSPOSE transa_array[group_count] = { CblasNoTrans }; const CBLAS_TRANSPOSE transb_array[group_count] = { CblasNoTrans }; - const cblas_int n_array[group_count] = { n }; + const int n_array[group_count] = { n }; const double alpha_array[group_count] = { alpha }; const double beta_array[group_count] = { beta }; @@ -182,7 +182,7 @@ void prk_dgemm(const int order, const int batches, int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11 CBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl; + std::cout << "C++11/CBLAS Dense matrix-matrix multiplication: C += A x B" << std::endl; ////////////////////////////////////////////////////////////////////// /// Read and test input parameters @@ -193,7 +193,7 @@ int main(int argc, char * argv[]) int batches = 0; int 
batch_threads = 1; try { - if (argc < 2) { + if (argc < 3) { throw "Usage: <# iterations> [ ]"; } diff --git a/Cxx11/transpose-cblas.cc b/Cxx11/transpose-cblas.cc new file mode 100644 index 000000000..9f7f17b07 --- /dev/null +++ b/Cxx11/transpose-cblas.cc @@ -0,0 +1,178 @@ +/// +/// Copyright (c) 2018, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#if defined(MKL) +#include +#ifdef MKL_ILP64 +#error Use the MKL library for 32-bit integers! +#endif +#elif defined(ACCELERATE) +// The location of cblas.h is not in the system include path when -framework Accelerate is provided. +#include +#else +#include +#endif + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/CBLAS Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; 
+ + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto trans_time = 0.0; + + std::vector A(order*order); + std::vector B(order*order,0.0); + std::vector T(order*order); + double one[1] = {1.0}; + + // fill A with the sequence 0 to order^2-1 as doubles + std::iota(A.begin(), A.end(), 0.0); + + { + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) trans_time = prk::wtime(); + + // T = transpose(A) +#if defined(MKL) + mkl_domatcopy('R','T', order, order, 1.0, &(A[0]), order, &(T[0]), order); +#elif defined(ACCELERATE) + vDSP_mtransD(&(A[0]), 1, &(T[0]), 1, order, order); +#else +#warning No CBLAS transpose extension available! + for (auto i=0;i(ij)*(1.+iterations)+addit; + abserr += std::fabs(B[ji] - reference); + } + } + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const auto epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + auto bytes = (size_t)order * (size_t)order * sizeof(double); + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + + return 0; +} + + diff --git a/README.md b/README.md index 8bd22e28a..7214b0a9e 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ f = see footnotes | RAJA | y | y | y | y | | | | CUDA | i | y | y | y | | | | CUBLAS | | | y | y | | y | -| CBLAS | | | | | | y | +| CBLAS | | | y | | | y | | OpenACC | y | | | | | | * [SYCL](http://sycl.tech/) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 00b4395e6..432592d05 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -344,7 +344,8 @@ case "$PRK_TARGET" in 
# C++11 with CBLAS if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then echo "CBLASFLAG=-DACCELERATE -framework Accelerate" >> common/make.defs - make -C $PRK_TARGET_PATH dgemm-cblas + make -C $PRK_TARGET_PATH transpose-cblas dgemm-cblas + $PRK_TARGET_PATH/transpose-cblas 10 1024 $PRK_TARGET_PATH/dgemm-cblas 10 400 fi diff --git a/travis/install-mpi.sh b/travis/install-mpi.sh index 236f2f419..e1691c272 100755 --- a/travis/install-mpi.sh +++ b/travis/install-mpi.sh @@ -27,6 +27,8 @@ case "$os" in brew upgrade mpich || brew install mpich || true ;; openmpi) + brew upgrade gcc || brew install gcc || true + brew link --overwrite gcc || true brew upgrade openmpi || brew install openmpi || true ;; *) From 180230e096f4caab81d0c7f1255a6a2eb7cf522b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 3 Jun 2018 20:39:46 -0700 Subject: [PATCH 097/245] use std instead of pstl namespace and abstract it away (#353) --- Cxx11/generate-cxx-stencil.py | 4 +-- Cxx11/nstream-vector-pstl.cc | 4 +-- Cxx11/p2p-hyperplane-vector-pstl.cc | 4 +-- Cxx11/prk_pstl.h | 1 + Cxx11/stencil-vector-pstl.cc | 10 ++++---- Cxx11/stencil_pstl.hpp | 40 ++++++++++++++--------------- Cxx11/transpose-vector-pstl.cc | 4 +-- 7 files changed, 34 insertions(+), 33 deletions(-) diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index b3b573887..a4154e9d3 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -47,8 +47,8 @@ def codegen(src,pattern,stencil_size,radius,W,model): elif (model=='pstl'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') - src.write(' std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) {\n') - src.write(' std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) {\n') + src.write(' std::for_each( exec::par, std::begin(inside), std::end(inside), 
[&] (int i) {\n') + src.write(' std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {\n') elif (model=='raja'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') #src.write(' RAJA::forallN>>\n') diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc index 9eb18c78f..0bab633b0 100644 --- a/Cxx11/nstream-vector-pstl.cc +++ b/Cxx11/nstream-vector-pstl.cc @@ -127,7 +127,7 @@ int main(int argc, char * argv[]) { #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (size_t i) { + std::for_each( exec::par_unseq, std::begin(range), std::end(range), [&] (size_t i) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) #warning GNU parallel @@ -145,7 +145,7 @@ int main(int argc, char * argv[]) if (iter==1) nstream_time = prk::wtime(); #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::for_each( pstl::execution::par_unseq, std::begin(range), std::end(range), [&] (size_t i) { + std::for_each( exec::par_unseq, std::begin(range), std::end(range), [&] (size_t i) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (size_t i) { diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc index e17412ac2..132b26a45 100644 --- a/Cxx11/p2p-hyperplane-vector-pstl.cc +++ b/Cxx11/p2p-hyperplane-vector-pstl.cc @@ -139,7 +139,7 @@ int main(int argc, char* argv[]) const auto end = std::min(i,n)+1; auto range = prk::range(begin,end); #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) { + std::for_each( exec::par, std::begin(range), std::end(range), [&] 
(auto j) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (auto j) { @@ -157,7 +157,7 @@ int main(int argc, char* argv[]) const auto end = std::min(i,nb+1)+1; auto range = prk::range(begin,end); #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (auto j) { + std::for_each( exec::par, std::begin(range), std::end(range), [&] (auto j) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (auto j) { diff --git a/Cxx11/prk_pstl.h b/Cxx11/prk_pstl.h index 5c89d765f..11e0368bb 100644 --- a/Cxx11/prk_pstl.h +++ b/Cxx11/prk_pstl.h @@ -47,6 +47,7 @@ # include # include # endif +namespace exec = std::execution; #endif #endif /* PRK_PSTL_H */ diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc index 6c14800af..a328b1420 100644 --- a/Cxx11/stencil-vector-pstl.cc +++ b/Cxx11/stencil-vector-pstl.cc @@ -183,8 +183,8 @@ int main(int argc, char* argv[]) // initialize the input and output arrays auto range = prk::range(0,n); #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (int i) { - std::for_each( pstl::execution::unseq, std::begin(range), std::end(range), [&] (int j) { + std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) { + std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) { @@ -205,8 +205,8 @@ int main(int argc, char* argv[]) // Add 
constant to solution to force refresh of neighbor data, if any #if 0 #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (int i) { - std::for_each( pstl::execution::unseq, std::begin(range), std::end(range), [&] (int j) { + std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) { + std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) { @@ -220,7 +220,7 @@ int main(int argc, char* argv[]) }); #else #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::transform( pstl::execution::par_unseq, in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; }); + std::transform( exec::par_unseq, in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; }); #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) __gnu_parallel::transform( in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; }); diff --git a/Cxx11/stencil_pstl.hpp b/Cxx11/stencil_pstl.hpp index 8713da4d8..03f24fcb5 100644 --- a/Cxx11/stencil_pstl.hpp +++ b/Cxx11/stencil_pstl.hpp @@ -1,7 +1,7 @@ void star1(const int n, const int t, std::vector & in, std::vector & out) { auto inside = prk::range(1,n-1); - std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { - std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { + std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) { + std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5 +in[(i+0)*n+(j+-1)] * -0.5 +in[(i+0)*n+(j+1)] * 0.5 @@ -12,8 +12,8 @@ void star1(const int n, 
const int t, std::vector & in, std::vector & in, std::vector & out) { auto inside = prk::range(2,n-2); - std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { - std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { + std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) { + std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125 +in[(i+-1)*n+(j+0)] * -0.25 +in[(i+0)*n+(j+-2)] * -0.125 @@ -28,8 +28,8 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { auto inside = prk::range(3,n-3); - std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { - std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { + std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) { + std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556 +in[(i+-2)*n+(j+0)] * -0.0833333333333 +in[(i+-1)*n+(j+0)] * -0.166666666667 @@ -48,8 +48,8 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { auto inside = prk::range(4,n-4); - std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { - std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { + std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) { + std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125 +in[(i+-3)*n+(j+0)] * -0.0416666666667 +in[(i+-2)*n+(j+0)] * -0.0625 @@ -72,8 +72,8 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { auto inside = prk::range(5,n-5); - std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int 
i) { - std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { + std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) { + std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02 +in[(i+-4)*n+(j+0)] * -0.025 +in[(i+-3)*n+(j+0)] * -0.0333333333333 @@ -100,8 +100,8 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { auto inside = prk::range(1,n-1); - std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { - std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { + std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) { + std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25 +in[(i+-1)*n+(j+0)] * -0.25 +in[(i+0)*n+(j+-1)] * -0.25 @@ -115,8 +115,8 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { auto inside = prk::range(2,n-2); - std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { - std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { + std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) { + std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625 +in[(i+-2)*n+(j+-1)] * -0.0208333333333 +in[(i+-2)*n+(j+0)] * -0.0208333333333 @@ -144,8 +144,8 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { auto inside = prk::range(3,n-3); - std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { - std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { + std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) { + std::for_each( 
exec::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778 +in[(i+-3)*n+(j+-2)] * -0.00555555555556 +in[(i+-3)*n+(j+-1)] * -0.00555555555556 @@ -195,8 +195,8 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { auto inside = prk::range(4,n-4); - std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { - std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { + std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) { + std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625 +in[(i+-4)*n+(j+-3)] * -0.00223214285714 +in[(i+-4)*n+(j+-2)] * -0.00223214285714 @@ -276,8 +276,8 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { auto inside = prk::range(5,n-5); - std::for_each( std::execution::par, std::begin(inside), std::end(inside), [&] (int i) { - std::for_each( std::execution::unseq, std::begin(inside), std::end(inside), [&] (int j) { + std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) { + std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) { out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01 +in[(i+-5)*n+(j+-4)] * -0.00111111111111 +in[(i+-5)*n+(j+-3)] * -0.00111111111111 diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc index f51f76e7f..e94172bd6 100644 --- a/Cxx11/transpose-vector-pstl.cc +++ b/Cxx11/transpose-vector-pstl.cc @@ -116,8 +116,8 @@ int main(int argc, char * argv[]) // transpose #if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::for_each( pstl::execution::par, std::begin(range), std::end(range), [&] (int i) { - std::for_each( pstl::execution::unseq, std::begin(range), std::end(range), [&] (int j) { + std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) { + 
std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) { From 3fb516a7a543f7c4d3485bea713b4f1bcacd62ac Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 3 Jun 2018 20:39:59 -0700 Subject: [PATCH 098/245] Fix GCC-8 warnings (#352) * use std instead of pstl namespace and abstract it away * fix all valid GCC-8 compiler warnings --- FORTRAN/Makefile | 2 +- FORTRAN/dgemm-openmp-target.f90 | 2 +- FORTRAN/dgemm-pretty.f90 | 2 +- FORTRAN/dgemm-taskloop-openmp.f90 | 2 +- FORTRAN/dgemm.f90 | 2 +- FORTRAN/nstream-openmp-target.f90 | 12 +----------- FORTRAN/nstream-ornlacc.f90 | 2 +- FORTRAN/nstream-pretty.f90 | 3 +-- FORTRAN/nstream-taskloop-openmp.f90 | 2 +- FORTRAN/nstream.f90 | 6 +++++- FORTRAN/p2p-innerloop-ornlacc.f90 | 2 ++ FORTRAN/p2p-ornlacc.f90 | 1 - FORTRAN/transpose-openmp-target.f90 | 22 +++++++++++----------- 13 files changed, 27 insertions(+), 33 deletions(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 898a237c4..4d479881c 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -65,7 +65,7 @@ ornlacc: p2p-ornlacc p2p-innerloop-ornlacc stencil-ornlacc transpose-ornlacc nst $(FC) $(FCFLAGS) $< -o $@ stencil: stencil.f90 stencil_serial.f90 - $(FC) $(FCFLAGS) -c stencil_serial.f90 -o stencil_serial.o + #$(FC) $(FCFLAGS) -c stencil_serial.f90 -o stencil_serial.o $(FC) $(FCFLAGS) $< -o $@ %-pretty: %-pretty.f90 diff --git a/FORTRAN/dgemm-openmp-target.f90 b/FORTRAN/dgemm-openmp-target.f90 index a3b5a6e41..ed2193bba 100644 --- a/FORTRAN/dgemm-openmp-target.f90 +++ b/FORTRAN/dgemm-openmp-target.f90 @@ -197,7 +197,7 @@ program main if (residuum .lt. 
epsilon) then write(*,'(a)') 'Solution validates' avgtime = dgemm_time/iterations - nflops = 2 * forder**3 + nflops = 2 * int(order,INT64)**3 write(*,'(a,f13.6,a,f10.6)') 'Rate (MF/s): ',(1.d-6*nflops)/avgtime, & ' Avg time (s): ', avgtime else diff --git a/FORTRAN/dgemm-pretty.f90 b/FORTRAN/dgemm-pretty.f90 index 650aa4243..e1e6ac7c2 100644 --- a/FORTRAN/dgemm-pretty.f90 +++ b/FORTRAN/dgemm-pretty.f90 @@ -178,7 +178,7 @@ program main if (residuum .lt. epsilon) then write(*,'(a)') 'Solution validates' avgtime = dgemm_time/iterations - nflops = 2 * forder**3 + nflops = 2 * int(order,INT64)**3 write(*,'(a,f13.6,a,f10.6)') 'Rate (MF/s): ',(1.d-6*nflops)/avgtime, & ' Avg time (s): ', avgtime else diff --git a/FORTRAN/dgemm-taskloop-openmp.f90 b/FORTRAN/dgemm-taskloop-openmp.f90 index 67c1a3884..b127dd356 100644 --- a/FORTRAN/dgemm-taskloop-openmp.f90 +++ b/FORTRAN/dgemm-taskloop-openmp.f90 @@ -252,7 +252,7 @@ program main if (residuum .lt. epsilon) then write(*,'(a)') 'Solution validates' avgtime = dgemm_time/iterations - nflops = 2 * forder**3 + nflops = 2 * int(order,INT64)**3 write(*,'(a,f13.6,a,f10.6)') 'Rate (MF/s): ',(1.d-6*nflops)/avgtime, & ' Avg time (s): ', avgtime else diff --git a/FORTRAN/dgemm.f90 b/FORTRAN/dgemm.f90 index dd7d18aaa..5f678c981 100644 --- a/FORTRAN/dgemm.f90 +++ b/FORTRAN/dgemm.f90 @@ -304,7 +304,7 @@ program main if (residuum .lt. epsilon) then write(*,'(a)') 'Solution validates' avgtime = dgemm_time/iterations - nflops = 2 * forder**3 + nflops = 2 * int(order,INT64)**3 write(*,'(a,f13.6,a,f10.6)') 'Rate (MF/s): ',(1.d-6*nflops)/avgtime, & ' Avg time (s): ', avgtime else diff --git a/FORTRAN/nstream-openmp-target.f90 b/FORTRAN/nstream-openmp-target.f90 index 96c4b1679..954a86b1e 100644 --- a/FORTRAN/nstream-openmp-target.f90 +++ b/FORTRAN/nstream-openmp-target.f90 @@ -62,20 +62,10 @@ ! ! 
******************************************************************* -function prk_get_wtime() result(t) - use iso_fortran_env - implicit none - real(kind=REAL64) :: t - integer(kind=INT64) :: c, r - call system_clock(count = c, count_rate = r) - t = real(c,REAL64) / real(r,REAL64) -end function prk_get_wtime - program main use iso_fortran_env use omp_lib implicit none - real(kind=REAL64) :: prk_get_wtime ! for argument parsing integer :: err integer :: arglen @@ -228,7 +218,7 @@ program main else write(*,'(a17)') 'Solution validates' avgtime = nstream_time/iterations; - bytes = 4.0 * int(length,INT64) * storage_size(A)/8 + bytes = 4 * int(length,INT64) * storage_size(A)/8 write(*,'(a12,f15.3,1x,a12,e15.6)') & 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & 'Avg time (s): ', avgtime diff --git a/FORTRAN/nstream-ornlacc.f90 b/FORTRAN/nstream-ornlacc.f90 index 033dee814..5769d4e35 100644 --- a/FORTRAN/nstream-ornlacc.f90 +++ b/FORTRAN/nstream-ornlacc.f90 @@ -220,7 +220,7 @@ program main else write(*,'(a17)') 'Solution validates' avgtime = nstream_time/iterations; - bytes = 4.0 * int(length,INT64) * storage_size(A)/8 + bytes = 4 * int(length,INT64) * storage_size(A)/8 write(*,'(a12,f15.3,1x,a12,e15.6)') & 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & 'Avg time (s): ', avgtime diff --git a/FORTRAN/nstream-pretty.f90 b/FORTRAN/nstream-pretty.f90 index a15e365ec..e6c9038fb 100644 --- a/FORTRAN/nstream-pretty.f90 +++ b/FORTRAN/nstream-pretty.f90 @@ -88,7 +88,6 @@ program main real(kind=REAL64) :: scalar integer(kind=INT64) :: bytes ! 
runtime variables - integer(kind=INT64) :: i integer(kind=INT32) :: k real(kind=REAL64) :: asum, ar, br, cr, ref real(kind=REAL64) :: t0, t1, nstream_time, avgtime @@ -205,7 +204,7 @@ program main else write(*,'(a17)') 'Solution validates' avgtime = nstream_time/iterations; - bytes = 4.0 * int(length,INT64) * storage_size(A)/8 + bytes = 4 * int(length,INT64) * storage_size(A)/8 write(*,'(a12,f15.3,1x,a12,e15.6)') & 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & 'Avg time (s): ', avgtime diff --git a/FORTRAN/nstream-taskloop-openmp.f90 b/FORTRAN/nstream-taskloop-openmp.f90 index 636e45d73..65d8fd056 100644 --- a/FORTRAN/nstream-taskloop-openmp.f90 +++ b/FORTRAN/nstream-taskloop-openmp.f90 @@ -226,7 +226,7 @@ program main else write(*,'(a17)') 'Solution validates' avgtime = nstream_time/iterations; - bytes = 4.0 * int(length,INT64) * storage_size(A)/8 + bytes = 4 * int(length,INT64) * storage_size(A)/8 write(*,'(a12,f15.3,1x,a12,e15.6)') & 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & 'Avg time (s): ', avgtime diff --git a/FORTRAN/nstream.f90 b/FORTRAN/nstream.f90 index 9d35024b5..6aa9c1529 100644 --- a/FORTRAN/nstream.f90 +++ b/FORTRAN/nstream.f90 @@ -62,6 +62,7 @@ ! ! ******************************************************************* +#ifndef _OPENMP function prk_get_wtime() result(t) use iso_fortran_env implicit none @@ -70,6 +71,7 @@ function prk_get_wtime() result(t) call system_clock(count = c, count_rate = r) t = real(c,REAL64) / real(r,REAL64) end function prk_get_wtime +#endif program main use iso_fortran_env @@ -77,7 +79,9 @@ program main use omp_lib #endif implicit none +#ifndef _OPENMP real(kind=REAL64) :: prk_get_wtime +#endif ! 
for argument parsing integer :: err integer :: arglen @@ -288,7 +292,7 @@ program main else write(*,'(a17)') 'Solution validates' avgtime = nstream_time/iterations; - bytes = 4.0 * int(length,INT64) * storage_size(A)/8 + bytes = 4 * int(length,INT64) * storage_size(A)/8 write(*,'(a12,f15.3,1x,a12,e15.6)') & 'Rate (MB/s): ', 1.d-6*bytes/avgtime, & 'Avg time (s): ', avgtime diff --git a/FORTRAN/p2p-innerloop-ornlacc.f90 b/FORTRAN/p2p-innerloop-ornlacc.f90 index 32c24a4d4..ee35ca58a 100644 --- a/FORTRAN/p2p-innerloop-ornlacc.f90 +++ b/FORTRAN/p2p-innerloop-ornlacc.f90 @@ -132,6 +132,8 @@ program main stop 1 endif + t0 = 0; + do j=1,n do i=1,n grid(i,j) = 0.0d0 diff --git a/FORTRAN/p2p-ornlacc.f90 b/FORTRAN/p2p-ornlacc.f90 index 6a9a97e23..18ee965e2 100644 --- a/FORTRAN/p2p-ornlacc.f90 +++ b/FORTRAN/p2p-ornlacc.f90 @@ -78,7 +78,6 @@ program main real(kind=REAL64), allocatable :: grid(:,:) ! array holding grid values ! runtime variables integer(kind=INT32) :: i, j, k - integer :: me, nt real(kind=REAL64) :: t0, t1, pipeline_time, avgtime ! timing parameters real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance diff --git a/FORTRAN/transpose-openmp-target.f90 b/FORTRAN/transpose-openmp-target.f90 index b226f89af..1da28346a 100644 --- a/FORTRAN/transpose-openmp-target.f90 +++ b/FORTRAN/transpose-openmp-target.f90 @@ -67,7 +67,7 @@ program main integer(kind=INT64) :: bytes ! combined size of matrices ! runtime variables integer(kind=INT32) :: i, j, k - integer(kind=INT32) :: it, jt, tile_size + !integer(kind=INT32) :: it, jt, tile_size real(kind=REAL64) :: abserr, addit, temp ! squared error real(kind=REAL64) :: t0, t1, trans_time, avgtime ! timing parameters real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance @@ -102,16 +102,16 @@ program main endif ! 
same default as the C implementation - tile_size = 32 - if (command_argument_count().gt.2) then - call get_command_argument(3,argtmp,arglen,err) - if (err.eq.0) read(argtmp,'(i32)') tile_size - endif - if ((tile_size .lt. 1).or.(tile_size.gt.order)) then - write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& - ' must be >= 1 and <= ',order - tile_size = order ! no tiling - endif + !tile_size = 32 + !if (command_argument_count().gt.2) then + ! call get_command_argument(3,argtmp,arglen,err) + ! if (err.eq.0) read(argtmp,'(i32)') tile_size + !endif + !if ((tile_size .lt. 1).or.(tile_size.gt.order)) then + ! write(*,'(a,i5,a,i5)') 'WARNING: tile_size ',tile_size,& + ! ' must be >= 1 and <= ',order + ! tile_size = order ! no tiling + !endif ! ******************************************************************** ! ** Allocate space for the input and transpose matrix From 505328db16d543453fa465cf9b32861adb07b4f7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 3 Jun 2018 20:48:33 -0700 Subject: [PATCH 099/245] deprecate Cilk (#357) remove Cilk from default build --- C1z/Makefile | 2 +- common/make.defs.gcc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/C1z/Makefile b/C1z/Makefile index a636e7ca9..0df8225c1 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -22,7 +22,7 @@ ORNLACCFLAGS = $(ORNLACCFLAG) CILKFLAGS = $(CILKFLAG) ISPCFLAGS = $(ISPCFLAG) -.PHONY: all clean serial thread openmp target taskloop cilk ispc +.PHONY: all clean serial thread openmp target taskloop ispc # cilk EXTRA= ifeq ($(shell uname -s),Darwin) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index e4eda09f5..732083da1 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -62,7 +62,7 @@ SYCLFLAG=-I$(SYCLDIR)/include # # Cilk # -CILKFLAG=-fcilkplus +#CILKFLAG=-fcilkplus # # TBB # From a03e2973adce711a6e65b539fb5bc465a04f53e0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 3 Jun 2018 21:46:44 -0700 Subject: [PATCH 100/245] Split C++ headers (#356) * 
refactor headers * remove unnecessary preprocessor token * relocate preprocess tokens * update make.def examples * further localize include files where needed * use std instead of pstl namespace and abstract it away * move thread/future includes into relevant sources --- Cxx11/prk_util.h | 7 ------- Cxx11/transpose-vector-async.cc | 7 +++++++ Cxx11/transpose-vector-thread.cc | 7 +++++++ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 2c0be683f..321f91c8c 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -64,13 +64,6 @@ #include #include -// These headers are busted with NVCC and GCC 5.4.0 -// The header is busted with Cray C++ 8.6.1. -#if !defined(__NVCC__) && !defined(_CRAYC) -#include -#include -#endif - #include "prk_simd.h" #ifdef USE_RANGES diff --git a/Cxx11/transpose-vector-async.cc b/Cxx11/transpose-vector-async.cc index 8f285b1ad..c68b8c463 100644 --- a/Cxx11/transpose-vector-async.cc +++ b/Cxx11/transpose-vector-async.cc @@ -54,6 +54,13 @@ #include "prk_util.h" +// These headers are busted with NVCC and GCC 5.4.0 +// The header is busted with Cray C++ 8.6.1. +#if !defined(__NVCC__) && !defined(_CRAYC) +#include +#include +#endif + int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; diff --git a/Cxx11/transpose-vector-thread.cc b/Cxx11/transpose-vector-thread.cc index 57fbf11ea..44071ca95 100644 --- a/Cxx11/transpose-vector-thread.cc +++ b/Cxx11/transpose-vector-thread.cc @@ -54,6 +54,13 @@ #include "prk_util.h" +// These headers are busted with NVCC and GCC 5.4.0 +// The header is busted with Cray C++ 8.6.1. 
+#if !defined(__NVCC__) && !defined(_CRAYC) +#include +#include +#endif + int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; From 661463486d3a6c87b460398ece6942c4e5acd684 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 3 Jun 2018 21:49:41 -0700 Subject: [PATCH 101/245] C++ p2p blocked doacross (#355) * block doacross like tasks to make it suck less --- Cxx11/p2p-doacross-vector-openmp.cc | 51 ++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 11 deletions(-) diff --git a/Cxx11/p2p-doacross-vector-openmp.cc b/Cxx11/p2p-doacross-vector-openmp.cc index 2d271c92b..37b9802f0 100644 --- a/Cxx11/p2p-doacross-vector-openmp.cc +++ b/Cxx11/p2p-doacross-vector-openmp.cc @@ -60,11 +60,16 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "p2p-kernel.h" int main(int argc, char* argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; +#ifdef _OPENMP std::cout << "C++11/OpenMP DOACROSS pipeline execution on 2D grid" << std::endl; +#else + std::cout << "C++11/Serial pipeline execution on 2D grid" << std::endl; +#endif ////////////////////////////////////////////////////////////////////// // Process and test input parameters @@ -92,15 +97,27 @@ int main(int argc, char* argv[]) } else if ( static_cast(m)*static_cast(n) > INT_MAX) { throw "ERROR: grid dimension too large - overflow risk"; } + + // grid chunk dimensions + mc = (argc > 4) ? std::atoi(argv[4]) : m; + nc = (argc > 5) ? 
std::atoi(argv[5]) : n; + if (mc < 1 || mc > m || nc < 1 || nc > n) { + std::cout << "WARNING: grid chunk dimensions invalid: " << mc << nc << " (ignoring)" << std::endl; + mc = m; + nc = n; + } } catch (const char * e) { std::cout << e << std::endl; return 1; } +#ifdef _OPENMP std::cout << "Number of threads (max) = " << omp_get_max_threads() << std::endl; +#endif std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Grid sizes = " << m << ", " << n << std::endl; + std::cout << "Grid chunk sizes = " << mc << ", " << nc << std::endl; ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation @@ -113,13 +130,12 @@ int main(int argc, char* argv[]) OMP_PARALLEL() { OMP_FOR() - for (auto i=0; i Date: Sun, 3 Jun 2018 22:29:08 -0700 Subject: [PATCH 102/245] Travis Clang OpenMP (#358) * attempt to use Clang OpenMP @ Mac in Travis --- travis/build-run-prk.sh | 57 ++++++++++++++++++++++++++--------------- travis/install-clang.sh | 26 ++----------------- travis/install-deps.sh | 2 +- 3 files changed, 40 insertions(+), 45 deletions(-) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 432592d05..0b0827729 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -265,9 +265,9 @@ case "$PRK_TARGET" in case $CXX in g++) if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "x$PRK_CXX" = "x" ] ; then - for version in "-9" "-8" "-7" "-6" "-5" "" ; do + for version in "9" "8" "7" "6" "5" "" ; do if [ -f "`which /usr/local/opt/gcc@${version}/bin/g++-${version}`" ]; then - export PRK_CXX="`which /usr/local/opt/llvm@${version}/bin/clang++`" + export PRK_CXX="`which /usr/local/opt/gcc@${version}/bin/g++-${version}`" echo "Found C++: $PRK_CXX" break fi @@ -289,16 +289,16 @@ case "$PRK_TARGET" in clang++) # Homebrew does not always place the best/latest Clang/LLVM in the default path if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "x$PRK_CXX" = "x" ] ; then - for version in "" "4.1" "4" "4.0" 
"-3.9" "-3.8" "-3.7" "-3.6" ; do - if [ -f "`which /usr/local/opt/llvm@${version}/bin/clang++`" ]; then - export PRK_CXX="`which /usr/local/opt/llvm@${version}/bin/clang++`" + for version in "" "@6" "@5" "@4" ; do + if [ -f "`which /usr/local/opt/llvm${version}/bin/clang++`" ]; then + export PRK_CXX="`which /usr/local/opt/llvm${version}/bin/clang++`" echo "Found C++: $PRK_CXX" break fi done fi if [ "x$PRK_CXX" = "x" ] ; then - for version in "-5" "-4.1" "-4" "-4.0" "-3.9" "-3.8" "-3.7" "-3.6" "" ; do + for version in "-6" "-5" "-4.1" "-4" "-4.0" "-3.9" "-3.8" "-3.7" "-3.6" "" ; do if [ -f "`which ${CXX}${version}`" ]; then export PRK_CXX="${CXX}${version}" echo "Found C++: $PRK_CXX" @@ -392,20 +392,37 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/p2p-hyperplane-vector-ornlacc 10 1024 64 ;; clang) - # Host - echo "Skipping Clang since OpenMP support probably missing" - #echo "OPENMPFLAG=-fopenmp" >> common/make.defs - #make -C $PRK_TARGET_PATH openmp - #$PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 - #$PRK_TARGET_PATH/stencil-vector-openmp 10 1000 - #$PRK_TARGET_PATH/transpose-vector-openmp 10 1024 32 - #$PRK_TARGET_PATH/nstream-vector-openmp 10 16777216 32 - #echo "Test stencil code generator" - #for s in star grid ; do - # for r in 1 2 3 4 5 ; do - # $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r - # done - #done + if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then + # Host + echo "OPENMPFLAG=-fopenmp" >> common/make.defs + make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-vector-openmp stencil-vector-openmp \ + transpose-vector-openmp nstream-vector-openmp + $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 + $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp 10 1024 64 + $PRK_TARGET_PATH/stencil-vector-openmp 10 1000 + $PRK_TARGET_PATH/transpose-vector-openmp 10 1024 32 + $PRK_TARGET_PATH/nstream-vector-openmp 10 16777216 32 + #echo "Test stencil code generator" + for s in star grid ; do + for r 
in 1 2 3 4 5 ; do + $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r + done + done + # Offload + #echo "OFFLOADFLAG=-foffload=\"-O3 -v\"" >> common/make.defs + #make -C $PRK_TARGET_PATH target + #$PRK_TARGET_PATH/stencil-openmp-target 10 1000 + #$PRK_TARGET_PATH/transpose-openmp-target 10 1024 32 + ##echo "Test stencil code generator" + #for s in star grid ; do + # for r in 1 2 3 4 5 ; do + # $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r + # done + #done + else + echo "Skipping Clang since OpenMP support probably missing" + fi ;; icc) # Host diff --git a/travis/install-clang.sh b/travis/install-clang.sh index a4509c0db..6b2178e7d 100755 --- a/travis/install-clang.sh +++ b/travis/install-clang.sh @@ -4,7 +4,6 @@ set -e set -x TRAVIS_ROOT="$1" -CLANG_VERSION="$2" if [ "${CC}" = "clang" ] || [ "${CXX}" = "clang++" ] ; then os=`uname` @@ -12,29 +11,8 @@ if [ "${CC}" = "clang" ] || [ "${CXX}" = "clang++" ] ; then Darwin) echo "Mac" brew update - case "$CLANG_VERSION" in - omp) - brew install clang-omp || brew upgrade clang-omp - #brew test clang-omp - # make sure that these are found before the system installation - # there are less evil but less local ways to impart this effect - if [ ! -d "$TRAVIS_ROOT/bin" ]; then - mkdir -p $TRAVIS_ROOT/bin - fi - # we should refer to clang-omp* explicitly so know it exists and works - ln -s `which clang-omp` $TRAVIS_ROOT/bin/clang - ln -s `which clang-omp++` $TRAVIS_ROOT/bin/clang++ - ;; - 3*) - #brew install llvm$CLANG_VERSION --with-clang --with-compiler-rt --with-libcxx --with-lld --without-assertions - brew install llvm@$CLANG_VERSION || brew upgrade llvm@$CLANG_VERSION - #brew test llvm@$CLANG_VERSION - ;; - *) - echo "Unsupported version of Clang" - echo "Travis will continue and use the system default" - ;; - esac + brew install llvm || brew upgrade llvm || true + brew install libomp || brew upgrade libomp || true ;; Linux) echo "Linux Clang/LLVM builds not supported!" 
diff --git a/travis/install-deps.sh b/travis/install-deps.sh index a82df34cc..89243cd93 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -57,7 +57,7 @@ case "$PRK_TARGET" in sh ./travis/install-gcc.sh $TRAVIS_ROOT fi if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "${CC}" = "clang" ] ; then - sh ./travis/install-clang.sh $TRAVIS_ROOT 3.9 + sh ./travis/install-clang.sh $TRAVIS_ROOT fi sh ./travis/install-tbb.sh $TRAVIS_ROOT sh ./travis/install-pstl.sh $TRAVIS_ROOT From 8601c7fb7efbd0b50ca0d0073af40cfb6137fea1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 4 Jun 2018 09:41:39 -0700 Subject: [PATCH 103/245] Raja views (#359) * add RAJA views+ranges nstream homogenize with existing implementation * add RAJA transpose with views and ranges homogenize with existing version * add RAJA stencil with views and ranges * add RAJA views to Travis * use RAJA develop branch * add RAJA views version of p2p * add RAJA p2p to Travis * make work sans OpenMP --- .gitignore | 3 + Cxx11/Makefile | 3 +- Cxx11/generate-cxx-stencil.py | 27 ++- Cxx11/nstream-raja.cc | 190 ++++++++++++++++ Cxx11/nstream-vector-raja.cc | 8 +- Cxx11/p2p-raja.cc | 185 ++++++++++++++++ Cxx11/p2p-vector-raja.cc | 4 +- Cxx11/prk_raja.h | 11 + Cxx11/stencil-raja.cc | 245 +++++++++++++++++++++ Cxx11/stencil-vector-raja.cc | 9 - Cxx11/stencil_kokkos.hpp | 20 +- Cxx11/stencil_pgnu.hpp | 40 ++-- Cxx11/stencil_pstl.hpp | 40 ++-- Cxx11/stencil_raja.hpp | 40 ++-- Cxx11/stencil_rajaview.hpp | 385 +++++++++++++++++++++++++++++++++ Cxx11/stencil_stl.hpp | 40 ++-- Cxx11/transpose-raja.cc | 186 ++++++++++++++++ Cxx11/transpose-vector-raja.cc | 11 +- travis/build-run-prk.sh | 13 +- travis/install-raja.sh | 3 +- 20 files changed, 1338 insertions(+), 125 deletions(-) create mode 100644 Cxx11/nstream-raja.cc create mode 100644 Cxx11/p2p-raja.cc create mode 100644 Cxx11/stencil-raja.cc create mode 100644 Cxx11/stencil_rajaview.hpp create mode 100644 Cxx11/transpose-raja.cc diff --git a/.gitignore b/.gitignore 
index 91cb027fc..099e56f2a 100644 --- a/.gitignore +++ b/.gitignore @@ -145,6 +145,7 @@ Cxx11/nstream-vector Cxx11/nstream-vector-openmp Cxx11/nstream-vector-pstl Cxx11/nstream-vector-raja +Cxx11/nstream-raja Cxx11/nstream-vector-rangefor Cxx11/nstream-vector-stl Cxx11/nstream-vector-taskloop @@ -164,6 +165,7 @@ Cxx11/stencil-vector-cilk Cxx11/stencil-vector-stl Cxx11/stencil-vector-pstl Cxx11/stencil-vector-raja +Cxx11/stencil-raja Cxx11/stencil-vector-rangefor Cxx11/stencil-vector-tbb Cxx11/stencil-vector-taskloop @@ -182,6 +184,7 @@ Cxx11/transpose-vector-cilk Cxx11/transpose-vector-stl Cxx11/transpose-vector-pstl Cxx11/transpose-vector-raja +Cxx11/transpose-raja Cxx11/transpose-vector-rangefor Cxx11/transpose-vector-tbb Cxx11/transpose-vector-taskloop diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 255c706e2..d1223b894 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -113,7 +113,8 @@ rangefor: stencil-vector-rangefor transpose-vector-rangefor nstream-vector-range kokkos: stencil-kokkos transpose-kokkos nstream-kokkos -raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja +raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \ + p2p-raja stencil-raja transpose-raja nstream-raja cuda: stencil-cuda transpose-cuda nstream-cuda diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index a4154e9d3..e2ec18d37 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -57,6 +57,11 @@ def codegen(src,pattern,stencil_size,radius,W,model): #src.write(' [&](RAJA::Index_type i, RAJA::Index_type j) {\n') src.write(' RAJA::forall(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type i) {\n') src.write(' RAJA::forall(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type j) {\n') + elif (model=='rajaview'): + src.write('void '+pattern+str(radius)+'(const int n, const int t, matrix & in, 
matrix & out) {\n') + src.write(' RAJA::RangeSegment inner1('+str(radius)+',n-'+str(radius)+');\n') + src.write(' auto inner2 = RAJA::make_tuple(inner1, inner1);\n') + src.write(' RAJA::kernel(inner2, [=](int i, int j) {\n') elif (model=='tbb'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' tbb::blocked_range2d range('+str(radius)+', n-'+str(radius)+', t, '+str(radius)+', n-'+str(radius)+', t);\n') @@ -80,7 +85,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' for (auto i=it; i0 and k > > >;') # src.write('OMP( declare target )\n\n') for pattern in ['star','grid']: for r in range(1,6): diff --git a/Cxx11/nstream-raja.cc b/Cxx11/nstream-raja.cc new file mode 100644 index 000000000..c98dae978 --- /dev/null +++ b/Cxx11/nstream-raja.cc @@ -0,0 +1,190 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_raja.h" + +#if defined(RAJA_ENABLE_OPENMP) + typedef RAJA::omp_parallel_for_exec thread_exec; +#elif defined(RAJA_ENABLE_TBB) + typedef RAJA::tbb_for_exec thread_exec; +#else +#warning No OpenMP! + typedef RAJA::seq_exec thread_exec; +#endif + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/RAJA STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time(0); + + double * RESTRICT Amem = new double[length]; + double * RESTRICT Bmem = new double[length]; + double * RESTRICT Cmem = new double[length]; + + RAJA::View> A(Amem, length); + RAJA::View> B(Bmem, length); + RAJA::View> C(Cmem, length); + + RAJA::RangeSegment range(0, length); + + double scalar(3); + + { + RAJA::forall(range, [=](RAJA::Index_type i) { + A(i) = 0.0; + B(i) = 2.0; + C(i) = 2.0; + }); + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) nstream_time = prk::wtime(); + + RAJA::forall(range, [=](RAJA::Index_type i) { + A(i) += B(i) + scalar * C(i); + }); + } + nstream_time = prk::wtime() - nstream_time; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + double ar(0); + double br(2); + double cr(2); + for (int i=0; i<=iterations; i++) { + ar += br + scalar * cr; + } + + ar *= length; + + RAJA::ReduceSum reduced_asum(0.0); + RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) { + reduced_asum += std::fabs(A(i)); + }); + double asum(reduced_asum); + + double epsilon=1.e-8; + if (std::fabs(ar-asum)/asum > epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << 
std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/nstream-vector-raja.cc b/Cxx11/nstream-vector-raja.cc index 62f92832f..ee3986e50 100644 --- a/Cxx11/nstream-vector-raja.cc +++ b/Cxx11/nstream-vector-raja.cc @@ -118,13 +118,13 @@ int main(int argc, char * argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - auto nstream_time = 0.0; + double nstream_time(0); std::vector A(length); std::vector B(length); std::vector C(length); - double scalar = 3.0; + double scalar(3); { RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) { @@ -133,7 +133,7 @@ int main(int argc, char * argv[]) C[i] = 2.0; }); - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if (iter==1) nstream_time = prk::wtime(); @@ -151,7 +151,7 @@ int main(int argc, char * argv[]) double ar(0); double br(2); double cr(2); - for (auto i=0; i<=iterations; i++) { + for (int i=0; i<=iterations; i++) { ar += br + scalar * cr; } diff --git a/Cxx11/p2p-raja.cc b/Cxx11/p2p-raja.cc new file mode 100644 index 000000000..202d9b6b6 --- /dev/null +++ b/Cxx11/p2p-raja.cc @@ -0,0 +1,185 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
+/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an m*n grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. 
+/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// C99-ification by Jeff Hammond, February 2016. +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_raja.h" + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/RAJA pipeline execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int m, n; + int mc, nc; + try { + if (argc < 4){ + throw " <# iterations> [ ]"; + } + + // number of times to run the pipeline algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // grid dimensions + m = std::atoi(argv[2]); + n = std::atoi(argv[3]); + if (m < 1 || n < 1) { + throw "ERROR: grid dimensions must be positive"; + } else if ( static_cast(m)*static_cast(n) > INT_MAX) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // grid chunk dimensions + mc = (argc > 4) ? std::atoi(argv[4]) : m; + nc = (argc > 5) ? 
std::atoi(argv[5]) : n; + if (mc < 1 || mc > m || nc < 1 || nc > n) { + std::cout << "WARNING: grid chunk dimensions invalid: " << mc << nc << " (ignoring)" << std::endl; + mc = m; + nc = n; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid sizes = " << m << ", " << n << std::endl; + std::cout << "Grid chunk sizes = " << mc << ", " << nc << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto pipeline_time = 0.0; // silence compiler warning + + double * RESTRICT Amem = new double[m*n]; + matrix grid(Amem, m, n); + + for (int i=0; i(j); + } + for (int i=0; i(i); + } + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) pipeline_time = prk::wtime(); + + for (int j=1; j(range, [=](RAJA::Index_type i) { + auto x = i; + auto y = j-i+1; + grid(x,y) = grid(x-1,y) + grid(x,y-1) - grid(x-1,y-1); + }); + } + for (int j=n-2; j>=1; j--) { + RAJA::RangeSegment range(1, j+1); + RAJA::forall(range, [=](RAJA::Index_type i) { + auto x = n+i-j-1; + auto y = n-i; + grid(x,y) = grid(x-1,y) + grid(x,y-1) - grid(x-1,y-1); + }); + } + grid(0,0) = -grid(m-1,n-1); + } + + pipeline_time = prk::wtime() - pipeline_time; + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. 
+ ////////////////////////////////////////////////////////////////////// + + const double epsilon = 1.e-8; + auto corner_val = ((iterations+1.)*(n+m-2.)); + if ( (std::fabs(grid(m-1,n-1) - corner_val)/corner_val) > epsilon) { + std::cout << "ERROR: checksum " << grid(m-1,n-1) + << " does not match verification value " << corner_val << std::endl; + return 1; + } + +#ifdef VERBOSE + std::cout << "Solution validates; verification value = " << corner_val << std::endl; +#else + std::cout << "Solution validates" << std::endl; +#endif + auto avgtime = pipeline_time/iterations; + std::cout << "Rate (MFlops/s): " + << 2.0e-6 * ( (m-1.)*(n-1.) )/avgtime + << " Avg time (s): " << avgtime << std::endl; + + return 0; +} diff --git a/Cxx11/p2p-vector-raja.cc b/Cxx11/p2p-vector-raja.cc index 00164aa94..e4faddccc 100644 --- a/Cxx11/p2p-vector-raja.cc +++ b/Cxx11/p2p-vector-raja.cc @@ -150,14 +150,14 @@ int main(int argc, char* argv[]) }); #else for (auto j=1; j(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) { + RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) { auto x = i; auto y = j-i+1; grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; }); } for (auto j=n-2; j>=1; j--) { - RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) { + RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) { auto x = n+i-j-1; auto y = n-i; grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; diff --git a/Cxx11/prk_raja.h b/Cxx11/prk_raja.h index fb0bb25b8..9a8fdab0e 100644 --- a/Cxx11/prk_raja.h +++ b/Cxx11/prk_raja.h @@ -37,4 +37,15 @@ # include "RAJA/RAJA.hpp" #endif +#ifdef RAJA_ENABLE_OPENMP + typedef RAJA::omp_parallel_for_exec thread_exec; + typedef RAJA::omp_reduce reduce_exec; +#else + #warning No RAJA support for OpenMP! 
+ typedef RAJA::seq_exec thread_exec; + typedef RAJA::seq_reduce reduce_exec; +#endif + +typedef RAJA::View> matrix; + #endif /* PRK_RAJA_H */ diff --git a/Cxx11/stencil-raja.cc b/Cxx11/stencil-raja.cc new file mode 100644 index 000000000..5fa333bce --- /dev/null +++ b/Cxx11/stencil-raja.cc @@ -0,0 +1,245 @@ + +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_raja.h" +#include "stencil_rajaview.hpp" + +void nothing(const int n, const int t, matrix & in, matrix & out) +{ + std::cout << "You are trying to use a stencil that does not exist.\n"; + std::cout << "Please generate the new stencil using the code generator\n"; + std::cout << "and add it to the case-switch in the driver." << std::endl; + // n will never be zero - this is to silence compiler warnings. 
+ //if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl; + std::abort(); +} + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/RAJA Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, n, radius, tile_size; + bool star = true; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Type of stencil = " << (star ? 
"star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto stencil_time = 0.0; + + double * RESTRICT imem = new double[n*n]; + double * RESTRICT omem = new double[n*n]; + + RAJA::View> in(imem, n, n); + RAJA::View> out(omem, n, n); + + using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec, + RAJA::statement::For<1, RAJA::simd_exec, + RAJA::statement::Lambda<0> > > >; + using permute_policy = RAJA::KernelPolicy< RAJA::statement::For<1, thread_exec, + RAJA::statement::For<0, RAJA::simd_exec, + RAJA::statement::Lambda<0> > > >; + + RAJA::RangeSegment range(0, n); + auto grid = RAJA::make_tuple(range, range); + + RAJA::kernel(grid, [=](int i, int j) { + in(i,j) = static_cast(i+j); + out(i,j) = 0.0; + }); + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + // Apply the stencil operator + stencil(n, tile_size, in, out); + // Add constant to solution to force refresh of neighbor data, if any + RAJA::kernel(grid, [=](int i, int j) { + in(i,j) += 1.0; + }); + } + + stencil_time = prk::wtime() - stencil_time; + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. 
+ ////////////////////////////////////////////////////////////////////// + + // interior of grid with respect to stencil + size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + + // compute L1 norm in parallel +#if 0 + // This leads to incorrect computation of the norm. + RAJA::ReduceSum reduced_norm(0.0); + RAJA::forallN>> +#else + RAJA::ReduceSum reduced_norm(0.0); + RAJA::forallN>> +#endif + ( RAJA::RangeSegment(radius,n-radius), RAJA::RangeSegment(radius,n-radius), + [&](RAJA::Index_type i, RAJA::Index_type j) { + reduced_norm += std::fabs(out(i,j)); + }); + double norm = reduced_norm / active_points; + + // verify correctness + const double epsilon = 1.0e-8; + double reference_norm = 2.*(iterations+1.); + if (std::fabs(norm-reference_norm) > epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} diff --git a/Cxx11/stencil-vector-raja.cc b/Cxx11/stencil-vector-raja.cc index cff3421f3..822a45c00 100644 --- a/Cxx11/stencil-vector-raja.cc +++ b/Cxx11/stencil-vector-raja.cc @@ -62,15 +62,6 @@ #include "prk_util.h" #include "prk_raja.h" - -// This must be before the stencil header, which uses this. -#ifdef RAJA_ENABLE_OPENMP - typedef RAJA::omp_parallel_for_exec thread_exec; -#else -#warning No OpenMP! 
- typedef RAJA::seq_exec thread_exec; -#endif - #include "stencil_raja.hpp" void nothing(const int n, const int t, std::vector & in, std::vector & out) diff --git a/Cxx11/stencil_kokkos.hpp b/Cxx11/stencil_kokkos.hpp index cb5009aae..94fffab58 100644 --- a/Cxx11/stencil_kokkos.hpp +++ b/Cxx11/stencil_kokkos.hpp @@ -5,7 +5,7 @@ void star1(const int n, const int t, matrix & in, matrix & out) { +in(i+0,j+-1) * -0.5 +in(i+0,j+1) * 0.5 +in(i+1,j+0) * 0.5; - }); + }); } void star2(const int n, const int t, matrix & in, matrix & out) { @@ -19,7 +19,7 @@ void star2(const int n, const int t, matrix & in, matrix & out) { +in(i+0,j+2) * 0.125 +in(i+1,j+0) * 0.25 +in(i+2,j+0) * 0.125; - }); + }); } void star3(const int n, const int t, matrix & in, matrix & out) { @@ -37,7 +37,7 @@ void star3(const int n, const int t, matrix & in, matrix & out) { +in(i+1,j+0) * 0.166666666667 +in(i+2,j+0) * 0.0833333333333 +in(i+3,j+0) * 0.0555555555556; - }); + }); } void star4(const int n, const int t, matrix & in, matrix & out) { @@ -59,7 +59,7 @@ void star4(const int n, const int t, matrix & in, matrix & out) { +in(i+2,j+0) * 0.0625 +in(i+3,j+0) * 0.0416666666667 +in(i+4,j+0) * 0.03125; - }); + }); } void star5(const int n, const int t, matrix & in, matrix & out) { @@ -85,7 +85,7 @@ void star5(const int n, const int t, matrix & in, matrix & out) { +in(i+3,j+0) * 0.0333333333333 +in(i+4,j+0) * 0.025 +in(i+5,j+0) * 0.02; - }); + }); } void grid1(const int n, const int t, matrix & in, matrix & out) { @@ -98,7 +98,7 @@ void grid1(const int n, const int t, matrix & in, matrix & out) { +in(i+1,j+0) * 0.25 +in(i+1,j+1) * 0.25 ; - }); + }); } void grid2(const int n, const int t, matrix & in, matrix & out) { @@ -125,7 +125,7 @@ void grid2(const int n, const int t, matrix & in, matrix & out) { +in(i+2,j+1) * 0.0208333333333 +in(i+2,j+2) * 0.0625 ; - }); + }); } void grid3(const int n, const int t, matrix & in, matrix & out) { @@ -174,7 +174,7 @@ void grid3(const int n, const int t, matrix & in, 
matrix & out) { +in(i+3,j+2) * 0.00555555555556 +in(i+3,j+3) * 0.0277777777778 ; - }); + }); } void grid4(const int n, const int t, matrix & in, matrix & out) { @@ -253,7 +253,7 @@ void grid4(const int n, const int t, matrix & in, matrix & out) { +in(i+4,j+3) * 0.00223214285714 +in(i+4,j+4) * 0.015625 ; - }); + }); } void grid5(const int n, const int t, matrix & in, matrix & out) { @@ -370,6 +370,6 @@ void grid5(const int n, const int t, matrix & in, matrix & out) { +in(i+5,j+4) * 0.00111111111111 +in(i+5,j+5) * 0.01 ; - }); + }); } diff --git a/Cxx11/stencil_pgnu.hpp b/Cxx11/stencil_pgnu.hpp index d6c1ee3eb..0db4fedfc 100644 --- a/Cxx11/stencil_pgnu.hpp +++ b/Cxx11/stencil_pgnu.hpp @@ -6,8 +6,8 @@ void star1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -22,8 +22,8 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -42,8 +42,8 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -66,8 +66,8 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -94,8 +94,8 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -109,8 +109,8 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -138,8 +138,8 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -189,8 +189,8 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -270,8 +270,8 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -389,7 +389,7 @@ void grid5(const int n, const int t, std::vector & in, std::vector & in, std::vector & in, std::vector & out) { @@ -22,8 +22,8 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -42,8 +42,8 @@ void star3(const int n, const 
int t, std::vector & in, std::vector & in, std::vector & out) { @@ -66,8 +66,8 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -94,8 +94,8 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -109,8 +109,8 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -138,8 +138,8 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -189,8 +189,8 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -270,8 +270,8 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -389,7 +389,7 @@ void grid5(const int n, const int t, std::vector & in, std::vector & in, std::vector & in, std::vector & out) { @@ -20,8 +20,8 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -39,8 +39,8 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -62,8 +62,8 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -89,8 +89,8 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -103,8 +103,8 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -131,8 +131,8 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -181,8 +181,8 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -261,8 +261,8 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -379,7 +379,7 @@ void grid5(const int n, const int t, std::vector & in, std::vector > > >;void star1(const int n, const int t, matrix & in, matrix & out) { + RAJA::RangeSegment inner1(1,n-1); + auto inner2 = 
RAJA::make_tuple(inner1, inner1); + RAJA::kernel(inner2, [=](int i, int j) { + out(i,j) += +in(i+-1,j+0) * -0.5 + +in(i+0,j+-1) * -0.5 + +in(i+0,j+1) * 0.5 + +in(i+1,j+0) * 0.5; + }); +} + +void star2(const int n, const int t, matrix & in, matrix & out) { + RAJA::RangeSegment inner1(2,n-2); + auto inner2 = RAJA::make_tuple(inner1, inner1); + RAJA::kernel(inner2, [=](int i, int j) { + out(i,j) += +in(i+-2,j+0) * -0.125 + +in(i+-1,j+0) * -0.25 + +in(i+0,j+-2) * -0.125 + +in(i+0,j+-1) * -0.25 + +in(i+0,j+1) * 0.25 + +in(i+0,j+2) * 0.125 + +in(i+1,j+0) * 0.25 + +in(i+2,j+0) * 0.125; + }); +} + +void star3(const int n, const int t, matrix & in, matrix & out) { + RAJA::RangeSegment inner1(3,n-3); + auto inner2 = RAJA::make_tuple(inner1, inner1); + RAJA::kernel(inner2, [=](int i, int j) { + out(i,j) += +in(i+-3,j+0) * -0.0555555555556 + +in(i+-2,j+0) * -0.0833333333333 + +in(i+-1,j+0) * -0.166666666667 + +in(i+0,j+-3) * -0.0555555555556 + +in(i+0,j+-2) * -0.0833333333333 + +in(i+0,j+-1) * -0.166666666667 + +in(i+0,j+1) * 0.166666666667 + +in(i+0,j+2) * 0.0833333333333 + +in(i+0,j+3) * 0.0555555555556 + +in(i+1,j+0) * 0.166666666667 + +in(i+2,j+0) * 0.0833333333333 + +in(i+3,j+0) * 0.0555555555556; + }); +} + +void star4(const int n, const int t, matrix & in, matrix & out) { + RAJA::RangeSegment inner1(4,n-4); + auto inner2 = RAJA::make_tuple(inner1, inner1); + RAJA::kernel(inner2, [=](int i, int j) { + out(i,j) += +in(i+-4,j+0) * -0.03125 + +in(i+-3,j+0) * -0.0416666666667 + +in(i+-2,j+0) * -0.0625 + +in(i+-1,j+0) * -0.125 + +in(i+0,j+-4) * -0.03125 + +in(i+0,j+-3) * -0.0416666666667 + +in(i+0,j+-2) * -0.0625 + +in(i+0,j+-1) * -0.125 + +in(i+0,j+1) * 0.125 + +in(i+0,j+2) * 0.0625 + +in(i+0,j+3) * 0.0416666666667 + +in(i+0,j+4) * 0.03125 + +in(i+1,j+0) * 0.125 + +in(i+2,j+0) * 0.0625 + +in(i+3,j+0) * 0.0416666666667 + +in(i+4,j+0) * 0.03125; + }); +} + +void star5(const int n, const int t, matrix & in, matrix & out) { + RAJA::RangeSegment inner1(5,n-5); + auto inner2 = 
RAJA::make_tuple(inner1, inner1); + RAJA::kernel(inner2, [=](int i, int j) { + out(i,j) += +in(i+-5,j+0) * -0.02 + +in(i+-4,j+0) * -0.025 + +in(i+-3,j+0) * -0.0333333333333 + +in(i+-2,j+0) * -0.05 + +in(i+-1,j+0) * -0.1 + +in(i+0,j+-5) * -0.02 + +in(i+0,j+-4) * -0.025 + +in(i+0,j+-3) * -0.0333333333333 + +in(i+0,j+-2) * -0.05 + +in(i+0,j+-1) * -0.1 + +in(i+0,j+1) * 0.1 + +in(i+0,j+2) * 0.05 + +in(i+0,j+3) * 0.0333333333333 + +in(i+0,j+4) * 0.025 + +in(i+0,j+5) * 0.02 + +in(i+1,j+0) * 0.1 + +in(i+2,j+0) * 0.05 + +in(i+3,j+0) * 0.0333333333333 + +in(i+4,j+0) * 0.025 + +in(i+5,j+0) * 0.02; + }); +} + +void grid1(const int n, const int t, matrix & in, matrix & out) { + RAJA::RangeSegment inner1(1,n-1); + auto inner2 = RAJA::make_tuple(inner1, inner1); + RAJA::kernel(inner2, [=](int i, int j) { + out(i,j) += +in(i+-1,j+-1) * -0.25 + +in(i+-1,j+0) * -0.25 + +in(i+0,j+-1) * -0.25 + +in(i+0,j+1) * 0.25 + +in(i+1,j+0) * 0.25 + +in(i+1,j+1) * 0.25 + ; + }); +} + +void grid2(const int n, const int t, matrix & in, matrix & out) { + RAJA::RangeSegment inner1(2,n-2); + auto inner2 = RAJA::make_tuple(inner1, inner1); + RAJA::kernel(inner2, [=](int i, int j) { + out(i,j) += +in(i+-2,j+-2) * -0.0625 + +in(i+-2,j+-1) * -0.0208333333333 + +in(i+-2,j+0) * -0.0208333333333 + +in(i+-2,j+1) * -0.0208333333333 + +in(i+-1,j+-2) * -0.0208333333333 + +in(i+-1,j+-1) * -0.125 + +in(i+-1,j+0) * -0.125 + +in(i+-1,j+2) * 0.0208333333333 + +in(i+0,j+-2) * -0.0208333333333 + +in(i+0,j+-1) * -0.125 + +in(i+0,j+1) * 0.125 + +in(i+0,j+2) * 0.0208333333333 + +in(i+1,j+-2) * -0.0208333333333 + +in(i+1,j+0) * 0.125 + +in(i+1,j+1) * 0.125 + +in(i+1,j+2) * 0.0208333333333 + +in(i+2,j+-1) * 0.0208333333333 + +in(i+2,j+0) * 0.0208333333333 + +in(i+2,j+1) * 0.0208333333333 + +in(i+2,j+2) * 0.0625 + ; + }); +} + +void grid3(const int n, const int t, matrix & in, matrix & out) { + RAJA::RangeSegment inner1(3,n-3); + auto inner2 = RAJA::make_tuple(inner1, inner1); + RAJA::kernel(inner2, [=](int i, int j) { + 
out(i,j) += +in(i+-3,j+-3) * -0.0277777777778 + +in(i+-3,j+-2) * -0.00555555555556 + +in(i+-3,j+-1) * -0.00555555555556 + +in(i+-3,j+0) * -0.00555555555556 + +in(i+-3,j+1) * -0.00555555555556 + +in(i+-3,j+2) * -0.00555555555556 + +in(i+-2,j+-3) * -0.00555555555556 + +in(i+-2,j+-2) * -0.0416666666667 + +in(i+-2,j+-1) * -0.0138888888889 + +in(i+-2,j+0) * -0.0138888888889 + +in(i+-2,j+1) * -0.0138888888889 + +in(i+-2,j+3) * 0.00555555555556 + +in(i+-1,j+-3) * -0.00555555555556 + +in(i+-1,j+-2) * -0.0138888888889 + +in(i+-1,j+-1) * -0.0833333333333 + +in(i+-1,j+0) * -0.0833333333333 + +in(i+-1,j+2) * 0.0138888888889 + +in(i+-1,j+3) * 0.00555555555556 + +in(i+0,j+-3) * -0.00555555555556 + +in(i+0,j+-2) * -0.0138888888889 + +in(i+0,j+-1) * -0.0833333333333 + +in(i+0,j+1) * 0.0833333333333 + +in(i+0,j+2) * 0.0138888888889 + +in(i+0,j+3) * 0.00555555555556 + +in(i+1,j+-3) * -0.00555555555556 + +in(i+1,j+-2) * -0.0138888888889 + +in(i+1,j+0) * 0.0833333333333 + +in(i+1,j+1) * 0.0833333333333 + +in(i+1,j+2) * 0.0138888888889 + +in(i+1,j+3) * 0.00555555555556 + +in(i+2,j+-3) * -0.00555555555556 + +in(i+2,j+-1) * 0.0138888888889 + +in(i+2,j+0) * 0.0138888888889 + +in(i+2,j+1) * 0.0138888888889 + +in(i+2,j+2) * 0.0416666666667 + +in(i+2,j+3) * 0.00555555555556 + +in(i+3,j+-2) * 0.00555555555556 + +in(i+3,j+-1) * 0.00555555555556 + +in(i+3,j+0) * 0.00555555555556 + +in(i+3,j+1) * 0.00555555555556 + +in(i+3,j+2) * 0.00555555555556 + +in(i+3,j+3) * 0.0277777777778 + ; + }); +} + +void grid4(const int n, const int t, matrix & in, matrix & out) { + RAJA::RangeSegment inner1(4,n-4); + auto inner2 = RAJA::make_tuple(inner1, inner1); + RAJA::kernel(inner2, [=](int i, int j) { + out(i,j) += +in(i+-4,j+-4) * -0.015625 + +in(i+-4,j+-3) * -0.00223214285714 + +in(i+-4,j+-2) * -0.00223214285714 + +in(i+-4,j+-1) * -0.00223214285714 + +in(i+-4,j+0) * -0.00223214285714 + +in(i+-4,j+1) * -0.00223214285714 + +in(i+-4,j+2) * -0.00223214285714 + +in(i+-4,j+3) * -0.00223214285714 + +in(i+-3,j+-4) * 
-0.00223214285714 + +in(i+-3,j+-3) * -0.0208333333333 + +in(i+-3,j+-2) * -0.00416666666667 + +in(i+-3,j+-1) * -0.00416666666667 + +in(i+-3,j+0) * -0.00416666666667 + +in(i+-3,j+1) * -0.00416666666667 + +in(i+-3,j+2) * -0.00416666666667 + +in(i+-3,j+4) * 0.00223214285714 + +in(i+-2,j+-4) * -0.00223214285714 + +in(i+-2,j+-3) * -0.00416666666667 + +in(i+-2,j+-2) * -0.03125 + +in(i+-2,j+-1) * -0.0104166666667 + +in(i+-2,j+0) * -0.0104166666667 + +in(i+-2,j+1) * -0.0104166666667 + +in(i+-2,j+3) * 0.00416666666667 + +in(i+-2,j+4) * 0.00223214285714 + +in(i+-1,j+-4) * -0.00223214285714 + +in(i+-1,j+-3) * -0.00416666666667 + +in(i+-1,j+-2) * -0.0104166666667 + +in(i+-1,j+-1) * -0.0625 + +in(i+-1,j+0) * -0.0625 + +in(i+-1,j+2) * 0.0104166666667 + +in(i+-1,j+3) * 0.00416666666667 + +in(i+-1,j+4) * 0.00223214285714 + +in(i+0,j+-4) * -0.00223214285714 + +in(i+0,j+-3) * -0.00416666666667 + +in(i+0,j+-2) * -0.0104166666667 + +in(i+0,j+-1) * -0.0625 + +in(i+0,j+1) * 0.0625 + +in(i+0,j+2) * 0.0104166666667 + +in(i+0,j+3) * 0.00416666666667 + +in(i+0,j+4) * 0.00223214285714 + +in(i+1,j+-4) * -0.00223214285714 + +in(i+1,j+-3) * -0.00416666666667 + +in(i+1,j+-2) * -0.0104166666667 + +in(i+1,j+0) * 0.0625 + +in(i+1,j+1) * 0.0625 + +in(i+1,j+2) * 0.0104166666667 + +in(i+1,j+3) * 0.00416666666667 + +in(i+1,j+4) * 0.00223214285714 + +in(i+2,j+-4) * -0.00223214285714 + +in(i+2,j+-3) * -0.00416666666667 + +in(i+2,j+-1) * 0.0104166666667 + +in(i+2,j+0) * 0.0104166666667 + +in(i+2,j+1) * 0.0104166666667 + +in(i+2,j+2) * 0.03125 + +in(i+2,j+3) * 0.00416666666667 + +in(i+2,j+4) * 0.00223214285714 + +in(i+3,j+-4) * -0.00223214285714 + +in(i+3,j+-2) * 0.00416666666667 + +in(i+3,j+-1) * 0.00416666666667 + +in(i+3,j+0) * 0.00416666666667 + +in(i+3,j+1) * 0.00416666666667 + +in(i+3,j+2) * 0.00416666666667 + +in(i+3,j+3) * 0.0208333333333 + +in(i+3,j+4) * 0.00223214285714 + +in(i+4,j+-3) * 0.00223214285714 + +in(i+4,j+-2) * 0.00223214285714 + +in(i+4,j+-1) * 0.00223214285714 + +in(i+4,j+0) * 
0.00223214285714 + +in(i+4,j+1) * 0.00223214285714 + +in(i+4,j+2) * 0.00223214285714 + +in(i+4,j+3) * 0.00223214285714 + +in(i+4,j+4) * 0.015625 + ; + }); +} + +void grid5(const int n, const int t, matrix & in, matrix & out) { + RAJA::RangeSegment inner1(5,n-5); + auto inner2 = RAJA::make_tuple(inner1, inner1); + RAJA::kernel(inner2, [=](int i, int j) { + out(i,j) += +in(i+-5,j+-5) * -0.01 + +in(i+-5,j+-4) * -0.00111111111111 + +in(i+-5,j+-3) * -0.00111111111111 + +in(i+-5,j+-2) * -0.00111111111111 + +in(i+-5,j+-1) * -0.00111111111111 + +in(i+-5,j+0) * -0.00111111111111 + +in(i+-5,j+1) * -0.00111111111111 + +in(i+-5,j+2) * -0.00111111111111 + +in(i+-5,j+3) * -0.00111111111111 + +in(i+-5,j+4) * -0.00111111111111 + +in(i+-4,j+-5) * -0.00111111111111 + +in(i+-4,j+-4) * -0.0125 + +in(i+-4,j+-3) * -0.00178571428571 + +in(i+-4,j+-2) * -0.00178571428571 + +in(i+-4,j+-1) * -0.00178571428571 + +in(i+-4,j+0) * -0.00178571428571 + +in(i+-4,j+1) * -0.00178571428571 + +in(i+-4,j+2) * -0.00178571428571 + +in(i+-4,j+3) * -0.00178571428571 + +in(i+-4,j+5) * 0.00111111111111 + +in(i+-3,j+-5) * -0.00111111111111 + +in(i+-3,j+-4) * -0.00178571428571 + +in(i+-3,j+-3) * -0.0166666666667 + +in(i+-3,j+-2) * -0.00333333333333 + +in(i+-3,j+-1) * -0.00333333333333 + +in(i+-3,j+0) * -0.00333333333333 + +in(i+-3,j+1) * -0.00333333333333 + +in(i+-3,j+2) * -0.00333333333333 + +in(i+-3,j+4) * 0.00178571428571 + +in(i+-3,j+5) * 0.00111111111111 + +in(i+-2,j+-5) * -0.00111111111111 + +in(i+-2,j+-4) * -0.00178571428571 + +in(i+-2,j+-3) * -0.00333333333333 + +in(i+-2,j+-2) * -0.025 + +in(i+-2,j+-1) * -0.00833333333333 + +in(i+-2,j+0) * -0.00833333333333 + +in(i+-2,j+1) * -0.00833333333333 + +in(i+-2,j+3) * 0.00333333333333 + +in(i+-2,j+4) * 0.00178571428571 + +in(i+-2,j+5) * 0.00111111111111 + +in(i+-1,j+-5) * -0.00111111111111 + +in(i+-1,j+-4) * -0.00178571428571 + +in(i+-1,j+-3) * -0.00333333333333 + +in(i+-1,j+-2) * -0.00833333333333 + +in(i+-1,j+-1) * -0.05 + +in(i+-1,j+0) * -0.05 + 
+in(i+-1,j+2) * 0.00833333333333 + +in(i+-1,j+3) * 0.00333333333333 + +in(i+-1,j+4) * 0.00178571428571 + +in(i+-1,j+5) * 0.00111111111111 + +in(i+0,j+-5) * -0.00111111111111 + +in(i+0,j+-4) * -0.00178571428571 + +in(i+0,j+-3) * -0.00333333333333 + +in(i+0,j+-2) * -0.00833333333333 + +in(i+0,j+-1) * -0.05 + +in(i+0,j+1) * 0.05 + +in(i+0,j+2) * 0.00833333333333 + +in(i+0,j+3) * 0.00333333333333 + +in(i+0,j+4) * 0.00178571428571 + +in(i+0,j+5) * 0.00111111111111 + +in(i+1,j+-5) * -0.00111111111111 + +in(i+1,j+-4) * -0.00178571428571 + +in(i+1,j+-3) * -0.00333333333333 + +in(i+1,j+-2) * -0.00833333333333 + +in(i+1,j+0) * 0.05 + +in(i+1,j+1) * 0.05 + +in(i+1,j+2) * 0.00833333333333 + +in(i+1,j+3) * 0.00333333333333 + +in(i+1,j+4) * 0.00178571428571 + +in(i+1,j+5) * 0.00111111111111 + +in(i+2,j+-5) * -0.00111111111111 + +in(i+2,j+-4) * -0.00178571428571 + +in(i+2,j+-3) * -0.00333333333333 + +in(i+2,j+-1) * 0.00833333333333 + +in(i+2,j+0) * 0.00833333333333 + +in(i+2,j+1) * 0.00833333333333 + +in(i+2,j+2) * 0.025 + +in(i+2,j+3) * 0.00333333333333 + +in(i+2,j+4) * 0.00178571428571 + +in(i+2,j+5) * 0.00111111111111 + +in(i+3,j+-5) * -0.00111111111111 + +in(i+3,j+-4) * -0.00178571428571 + +in(i+3,j+-2) * 0.00333333333333 + +in(i+3,j+-1) * 0.00333333333333 + +in(i+3,j+0) * 0.00333333333333 + +in(i+3,j+1) * 0.00333333333333 + +in(i+3,j+2) * 0.00333333333333 + +in(i+3,j+3) * 0.0166666666667 + +in(i+3,j+4) * 0.00178571428571 + +in(i+3,j+5) * 0.00111111111111 + +in(i+4,j+-5) * -0.00111111111111 + +in(i+4,j+-3) * 0.00178571428571 + +in(i+4,j+-2) * 0.00178571428571 + +in(i+4,j+-1) * 0.00178571428571 + +in(i+4,j+0) * 0.00178571428571 + +in(i+4,j+1) * 0.00178571428571 + +in(i+4,j+2) * 0.00178571428571 + +in(i+4,j+3) * 0.00178571428571 + +in(i+4,j+4) * 0.0125 + +in(i+4,j+5) * 0.00111111111111 + +in(i+5,j+-4) * 0.00111111111111 + +in(i+5,j+-3) * 0.00111111111111 + +in(i+5,j+-2) * 0.00111111111111 + +in(i+5,j+-1) * 0.00111111111111 + +in(i+5,j+0) * 0.00111111111111 + +in(i+5,j+1) * 
0.00111111111111 + +in(i+5,j+2) * 0.00111111111111 + +in(i+5,j+3) * 0.00111111111111 + +in(i+5,j+4) * 0.00111111111111 + +in(i+5,j+5) * 0.01 + ; + }); +} + diff --git a/Cxx11/stencil_stl.hpp b/Cxx11/stencil_stl.hpp index 4dcdde467..6633cff00 100644 --- a/Cxx11/stencil_stl.hpp +++ b/Cxx11/stencil_stl.hpp @@ -6,8 +6,8 @@ void star1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -22,8 +22,8 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -42,8 +42,8 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -66,8 +66,8 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -94,8 +94,8 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -109,8 +109,8 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -138,8 +138,8 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -189,8 +189,8 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -270,8 +270,8 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { @@ -389,7 +389,7 @@ void grid5(const int n, const int t, std::vector & in, std::vector <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_raja.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/RAJA Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + int tile_size; + bool permute = false; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = (argc>3) ? std::atoi(argv[3]) : 32; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + + auto permute_input = (argc>4) ? std::atoi(argv[4]) : 0; + if (permute_input != 0 && permute_input != 1) { + throw "ERROR: permute must be 0 (no) or 1 (yes)"; + } + permute = (permute_input == 1); + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Permute loops = " << (permute ? 
"yes" : "no") << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double trans_time(0); + + double * RESTRICT Amem = new double[order*order]; + double * RESTRICT Bmem = new double[order*order]; + + matrix A(Amem, order, order); + matrix B(Bmem, order, order); + + using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec, + RAJA::statement::For<1, RAJA::simd_exec, + RAJA::statement::Lambda<0> > > >; + using permute_policy = RAJA::KernelPolicy< RAJA::statement::For<1, thread_exec, + RAJA::statement::For<0, RAJA::simd_exec, + RAJA::statement::Lambda<0> > > >; + + RAJA::RangeSegment range(0, order); + auto range2d = RAJA::make_tuple(range, range); + + RAJA::kernel(range2d, [=](int i, int j) { + A(i,j) = static_cast(i*order+j); + B(i,j) = 0.0; + }); + + for (int iter = 0; iter<=iterations; ++iter) { + + if (iter==1) trans_time = prk::wtime(); + + if (permute) { + RAJA::kernel(range2d, [=](int i, int j) { + B(i,j) += A(j,i); + A(j,i) += 1.0; + }); + } else { + RAJA::kernel(range2d, [=](int i, int j) { + B(i,j) += A(j,i); + A(j,i) += 1.0; + }); + } + } + trans_time = prk::wtime() - trans_time; + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + using reduce_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec, + RAJA::statement::For<1, RAJA::seq_exec, + RAJA::statement::Lambda<0> > > >; + + double const addit = (iterations+1.) 
* (0.5*iterations); + RAJA::ReduceSum abserr(0.0); + RAJA::kernel(range2d, [=](int i, int j) { + double const ij = static_cast(i*order+j); + double const reference = ij*(1.+iterations)+addit; + abserr += std::fabs(B(j,i) - reference); + }); + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + double epsilon(1.0e-8); + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + auto bytes = (size_t)order * (size_t)order * sizeof(double); + std::cout << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + return 0; +} + + diff --git a/Cxx11/transpose-vector-raja.cc b/Cxx11/transpose-vector-raja.cc index 59b757eea..d40cefc65 100644 --- a/Cxx11/transpose-vector-raja.cc +++ b/Cxx11/transpose-vector-raja.cc @@ -270,7 +270,7 @@ int main(int argc, char * argv[]) std::cout << "RAJA use simd = " << (use_simd ? 
"yes" : "no") << std::endl; ////////////////////////////////////////////////////////////////////// - /// Allocate space for the input and transpose matrix + // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// std::vector A(order*order); @@ -421,9 +421,9 @@ int main(int argc, char * argv[]) } #endif - auto trans_time = 0.0; + double trans_time(0); - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if (iter==1) trans_time = prk::wtime(); @@ -598,17 +598,16 @@ int main(int argc, char * argv[]) } #endif - #ifdef VERBOSE std::cout << "Sum of absolute differences: " << abserr << std::endl; #endif - const auto epsilon = 1.0e-8; + double epsilon(1.0e-8); if (abserr < epsilon) { std::cout << "Solution validates" << std::endl; auto avgtime = trans_time/iterations; auto bytes = (size_t)order * (size_t)order * sizeof(double); - std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + std::cout << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime << " Avg time (s): " << avgtime << std::endl; } else { std::cout << "ERROR: Aggregate squared error " << abserr diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 0b0827729..2820eeff8 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -592,11 +592,17 @@ case "$PRK_TARGET" in ;; esac # RAJA - make -C $PRK_TARGET_PATH stencil-vector-raja transpose-vector-raja nstream-vector-raja + make -C $PRK_TARGET_PATH p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \ + p2p-raja stencil-raja transpose-raja nstream-raja + # New (Views) + $PRK_TARGET_PATH/p2p-raja 10 1024 1024 + $PRK_TARGET_PATH/stencil-raja 10 1000 + $PRK_TARGET_PATH/transpose-raja 10 1024 + $PRK_TARGET_PATH/nstream-raja 10 16777216 32 + # Old (STL) + $PRK_TARGET_PATH/p2p-vector-raja 10 1024 1024 $PRK_TARGET_PATH/stencil-vector-raja 10 1000 - # RAJA variant 11 should be the best 
$PRK_TARGET_PATH/transpose-vector-raja 10 1024 - # test all the RAJA variants with a smaller problem for f in seq omp tbb ; do for s in y n ; do for t in y n ; do @@ -612,6 +618,7 @@ case "$PRK_TARGET" in for s in star grid ; do for r in 1 2 3 4 5 ; do $PRK_TARGET_PATH/stencil-vector-raja 10 200 20 $s $r + $PRK_TARGET_PATH/stencil-raja 10 200 20 $s $r done done # Kokkos diff --git a/travis/install-raja.sh b/travis/install-raja.sh index fe633f5aa..114b9f2a5 100644 --- a/travis/install-raja.sh +++ b/travis/install-raja.sh @@ -40,8 +40,7 @@ esac ${PRK_CXX} -v if [ ! -d "$TRAVIS_ROOT/raja" ]; then - #BRANCH=develop # forallN deprecated - BRANCH=master + BRANCH=develop git clone --recursive --depth 1 -b ${BRANCH} https://github.com/LLNL/RAJA.git cd RAJA mkdir build From 65d27e74278d5ace0f68ef989f7151c50f10a88f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 4 Jun 2018 09:46:59 -0700 Subject: [PATCH 104/245] refactor Rust files to use Cargo (#351) * refactor Rust files to use Cargo * use cargo in travis * fix path in Travis for Rust * try again with dir --- .gitignore | 6 +- RUST/Makefile | 28 +--- RUST/legacy/Makefile | 34 ++++ RUST/{ => legacy}/p2p.rs | 0 RUST/{ => legacy}/stencil-old.rs | 0 RUST/{ => legacy}/stencil.rs | 0 RUST/{ => legacy}/transpose.rs | 0 RUST/p2p/Cargo.toml | 6 + RUST/p2p/src/main.rs | 175 ++++++++++++++++++++ RUST/stencil/Cargo.toml | 6 + RUST/stencil/src/main.rs | 273 +++++++++++++++++++++++++++++++ RUST/transpose/Cargo.toml | 6 + RUST/transpose/src/main.rs | 190 +++++++++++++++++++++ travis/build-run-prk.sh | 7 +- 14 files changed, 704 insertions(+), 27 deletions(-) create mode 100644 RUST/legacy/Makefile rename RUST/{ => legacy}/p2p.rs (100%) rename RUST/{ => legacy}/stencil-old.rs (100%) rename RUST/{ => legacy}/stencil.rs (100%) rename RUST/{ => legacy}/transpose.rs (100%) create mode 100644 RUST/p2p/Cargo.toml create mode 100644 RUST/p2p/src/main.rs create mode 100644 RUST/stencil/Cargo.toml create mode 100644 RUST/stencil/src/main.rs 
create mode 100644 RUST/transpose/Cargo.toml create mode 100644 RUST/transpose/src/main.rs diff --git a/.gitignore b/.gitignore index 099e56f2a..b6b88b3b2 100644 --- a/.gitignore +++ b/.gitignore @@ -251,6 +251,6 @@ FORTRAN/transpose-ornlacc FORTRAN/transpose-taskloop-openmp FORTRAN/transpose-tasks-openmp FORTRAN/transpose-ornlacc -RUST/p2p -RUST/stencil -RUST/transpose +RUST/p2p/Cargo.lock +RUST/stencil/Cargo.lock +RUST/transpose/Cargo.lock diff --git a/RUST/Makefile b/RUST/Makefile index 74414a78d..4bbfeb5de 100644 --- a/RUST/Makefile +++ b/RUST/Makefile @@ -1,13 +1,3 @@ -include ../common/RUST.defs -include ../common/PRKVERSION - -ifndef RADIUS - RADIUS=2 -endif - -RUSTC = rustc -RCFLAGS = -g - # Enable verbose printing #RCFLAGS += --cfg "VERBOSE" @@ -18,17 +8,15 @@ RCFLAGS = -g # Stencil shape: star is default, uncomment to switch to grid #RCFLAGS += --cfg grid -.PHONY: all clean run - -all: p2p stencil transpose +.PHONY: all clean -%: %.rs - $(RUSTC) $(RCFLAGS) $< -o $@ +all: + cd p2p && cargo build + cd stencil && cargo build + cd transpose && cargo build clean: - -rm -f *.o - -rm -f *.optrpt - -rm -f *.dwarf - -rm -rf *.dSYM - -rm -f p2p stencil transpose + cd p2p && cargo clean + cd stencil && cargo clean + cd transpose && cargo clean diff --git a/RUST/legacy/Makefile b/RUST/legacy/Makefile new file mode 100644 index 000000000..74414a78d --- /dev/null +++ b/RUST/legacy/Makefile @@ -0,0 +1,34 @@ +include ../common/RUST.defs +include ../common/PRKVERSION + +ifndef RADIUS + RADIUS=2 +endif + +RUSTC = rustc +RCFLAGS = -g + +# Enable verbose printing +#RCFLAGS += --cfg "VERBOSE" + +# This is now a runtime option +# Stencil radius +#RCFLAGS += --cfg radius="$(RADIUS)" + +# Stencil shape: star is default, uncomment to switch to grid +#RCFLAGS += --cfg grid + +.PHONY: all clean run + +all: p2p stencil transpose + +%: %.rs + $(RUSTC) $(RCFLAGS) $< -o $@ + +clean: + -rm -f *.o + -rm -f *.optrpt + -rm -f *.dwarf + -rm -rf *.dSYM + -rm -f p2p stencil transpose + diff 
--git a/RUST/p2p.rs b/RUST/legacy/p2p.rs similarity index 100% rename from RUST/p2p.rs rename to RUST/legacy/p2p.rs diff --git a/RUST/stencil-old.rs b/RUST/legacy/stencil-old.rs similarity index 100% rename from RUST/stencil-old.rs rename to RUST/legacy/stencil-old.rs diff --git a/RUST/stencil.rs b/RUST/legacy/stencil.rs similarity index 100% rename from RUST/stencil.rs rename to RUST/legacy/stencil.rs diff --git a/RUST/transpose.rs b/RUST/legacy/transpose.rs similarity index 100% rename from RUST/transpose.rs rename to RUST/legacy/transpose.rs diff --git a/RUST/p2p/Cargo.toml b/RUST/p2p/Cargo.toml new file mode 100644 index 000000000..589644538 --- /dev/null +++ b/RUST/p2p/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "p2p" +version = "0.1.0" +authors = ["Jeff Hammond "] + +[dependencies] diff --git a/RUST/p2p/src/main.rs b/RUST/p2p/src/main.rs new file mode 100644 index 000000000..4da63472a --- /dev/null +++ b/RUST/p2p/src/main.rs @@ -0,0 +1,175 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an m*n grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - C99-ification by Jeff Hammond, February 2016. +/// - C++11-ification by Jeff Hammond, May 2017. +/// - Rust port by Jeff Hammond, May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +use std::env; +use std::time::{Instant,Duration}; + +fn help() { + println!("Usage: <# iterations> [tile size]"); +} + +fn main() +{ + println!("Parallel Research Kernels version"); + println!("Rust pipeline execution on 2D grid"); + + ////////////////////////////////////////////////////////////////////// + // Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let args : Vec = env::args().collect(); + + let iterations : u32; + let m : usize; + let n : usize; + + if args.len() == 4 { + iterations = match args[1].parse() { + Ok(n) => { n }, + Err(_) => { help(); return; }, + }; + m = match args[2].parse() { + Ok(n) => { n }, + Err(_) => { help(); return; }, + }; + n = match args[3].parse() { + Ok(n) => { n }, + Err(_) => { help(); return; }, + }; + } else { + help(); + return; + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + if m < 1 || n < 1 { + println!("ERROR: grid dimensions must be positive: {}, {}", m, n); + } + + println!("Grid sizes = {}, {}", m, n); + println!("Number of iterations = {}", iterations); + + ////////////////////////////////////////////////////////////////////// + // Allocate space for the input and do the work + ////////////////////////////////////////////////////////////////////// + + let nelems : usize = m*n; + let mut vector : Vec = vec![0.0; nelems]; + + // set boundary values (bottom and left side of grid) + for j in 0..n { + vector[0*n+j] = j as f64; + } + for i in 0..m { + vector[i*n+0] = i as f64; + } + + let timer = Instant::now(); + let mut t0 : Duration = timer.elapsed(); + + for k in 0..iterations+1 { + + if k == 1 { t0 = timer.elapsed(); } + + for i in 1..m { + for j in 1..n { + vector[i*n+j] = vector[(i-1)*n+j] + vector[i*n+(j-1)] - vector[(i-1)*n+(j-1)]; + } + } + + // copy top right corner value to bottom left corner to create dependency; we + // need a barrier to 
make sure the latest value is used. This also guarantees + // that the flags for the next iteration (if any) are not getting clobbered + vector[0*n+0] = -vector[(m-1)*n+(n-1)]; + + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let pipeline_time : f64 = dtt as f64 / 1.0e9_f64 as f64; + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. + ////////////////////////////////////////////////////////////////////// + + // error tolerance + let epsilon : f64 = 1.0e-8; + + // verify correctness, using top right value + let corner_val : f64 = (((iterations+1) as usize)*(n + m as usize - 2 as usize)) as f64; + if ( (vector[(m-1)*n+(n-1)] - corner_val).abs() / corner_val) > epsilon { + println!("ERROR: checksum {} does not match verification value {} ", vector[(m-1)*n+(n-1)], corner_val); + return; + } + + if cfg!(VERBOSE) { + println!("Solution validates; verification value = {}", corner_val); + } else { + println!("Solution validates"); + } + + let avgtime : f64 = (pipeline_time as f64) / (iterations as f64); + let bytes : usize = 2 * (m-1) * (n-1); + println!("Rate (MB/s): {:10.3} Avg time (s): {:10.3}", (1.0e-6_f64) * (bytes as f64) / avgtime, avgtime); +} diff --git a/RUST/stencil/Cargo.toml b/RUST/stencil/Cargo.toml new file mode 100644 index 000000000..6a05b19c6 --- /dev/null +++ b/RUST/stencil/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "stencil" +version = "0.1.0" +authors = ["Jeff Hammond "] + +[dependencies] diff --git a/RUST/stencil/src/main.rs b/RUST/stencil/src/main.rs new file mode 100644 index 000000000..a2bcb9c21 --- /dev/null +++ b/RUST/stencil/src/main.rs @@ -0,0 +1,273 @@ + +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * 
Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. 
+/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "a" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// - C++11-ification by Jeff Hammond, May 2017. +/// - Rust port by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +use std::env; +use std::time::{Instant,Duration}; + +fn help() { + println!("Usage: <# iterations> "); +} + +fn main() +{ + println!("Parallel Research Kernels"); + println!("Rust stencil execution on 2D grid"); + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + let args : Vec = env::args().collect(); + + let iterations : usize; + let n : usize; + let r : usize; + + // This is a compile-time setting. + // grid stencil (star is the default) + let grid : bool = if cfg!(grid) { true } else { false }; + + // I have failed to make this a compile-time setting. 
+ /* + let r : usize = + if cfg!(radius = "1") { 1 } else + if cfg!(radius = "2") { 2 } else + if cfg!(radius = "3") { 3 } else + if cfg!(radius = "4") { 4 } else + if cfg!(radius = "5") { 5 } else + if cfg!(radius = "6") { 6 } else + { println!("FAIL"); 0 }; + */ + + if args.len() == 4 { + iterations = match args[1].parse() { + Ok(n) => { n }, + Err(_) => { help(); return; }, + }; + n = match args[2].parse() { + Ok(n) => { n }, + Err(_) => { help(); return; }, + }; + r = match args[3].parse() { + Ok(n) => { n }, + Err(_) => { 2 }, + }; + } else { + help(); + return; + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + if n < 1 { + println!("ERROR: grid dimension must be positive: {}", n); + } + + if r < 1 { + println!("ERROR: Stencil radius {} should be positive ", r); + return; + } else if (2 * r + 1) > n { + println!("ERROR: Stencil radius {} exceeds grid size {}", r, n); + return; + } + + println!("Grid size = {}", n); + println!("Radius of stencil = {}", r); + if grid { + println!("Type of stencil = grid"); + } else { + println!("Type of stencil = star"); + } + println!("Data type = double precision"); + println!("Compact representation of stencil loop body"); + println!("Number of iterations = {}",iterations); + + ////////////////////////////////////////////////////////////////////// + // Allocate space for the input and do the work + ////////////////////////////////////////////////////////////////////// + + // input and output arrays + let mut a : Vec> = vec![vec![0.0; n]; n]; + let mut b : Vec> = vec![vec![0.0; n]; n]; + + // weights of points a the stencil + let wdim : usize = 2 * r + 1; + let mut w : Vec> = vec![vec![0.0; wdim]; wdim]; + + // fill the stencil ws to reflect a discrete divergence operator + let stencil_size : usize; + if grid { + stencil_size = (2*r+1)*(2*r+1); + for j in 1..r+1 { + for i in 1-j..j { + let denom : f64 = (4*j*(2*j-1)*r) as f64; + w[r+i][r+j] = 1./denom; + w[r+i][r-j] = -1./denom; + w[r+j][r+i] = 
1./denom; + w[r-j][r+i] = -1./denom; + } + let denom : f64 = (4*j*r) as f64; + w[r+j][r+j] = 1./denom; + w[r-j][r-j] = -1./denom; + } + } else /* star */ { + stencil_size = 4*r+1; + for i in 1..r+1 { + let denom : f64 = (2 * i * r) as f64; + w[r][r+i] = 1./denom; + w[r][r-i] = -1./denom; + w[r+i][r] = 1./denom; + w[r-i][r] = -1./denom; + } + } + + // interior of grid with respect to stencil + let active_points : usize = (n-2*r)*(n-2*r); + + // initialize the input and output arrays + for j in 0..n { + for i in 0..n { + a[i][j] = (i+j) as f64; + b[i][j] = 0.0; + } + } + + let timer = Instant::now(); + let mut t0 : Duration = timer.elapsed(); + + for k in 0..iterations+1 { + + if k == 1 { t0 = timer.elapsed(); } + + // Apply the stencil operator + for i in r..n-r { + for j in r..n-r { + if grid { + for ii in 0-r..r+1 { + for jj in 0-r..r+1 { + b[i][j] += w[r+ii][r+jj]*a[i+ii][j+jj]; + } + } + } else { + b[i][j] += w[r][r]*a[i][j]; + for jj in r..0 { + b[i][j] += w[r][r-jj]*a[i][j-jj]; + } + for jj in 1..r+1 { + b[i][j] += w[r][r+jj]*a[i][j+jj]; + } + for ii in r..0 { + b[i][j] += w[r-ii][r]*a[i-ii][j]; + } + for ii in 1..r+1 { + b[i][j] += w[r+ii][r]*a[i+ii][j]; + } + } + } + } + + // add constant to solution to force refresh of neighbor data, if any + for j in 0..n { + for i in 0..n { + a[i][j] += 1.0; + } + } + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let stencil_time : f64 = dtt as f64 / 1.0e9_f64 as f64; + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. 
+ ////////////////////////////////////////////////////////////////////// + + // error tolerance + let epsilon : f64 = 1.0e-8; + + // compute L1 norm a parallel + let mut norm : f64 = 0.0; + for i in r..n-r+1 { + for j in r..n-r+1 { + norm += (b[i][j]).abs(); + } + } + norm /= active_points as f64; + + // verify correctness + let reference_norm : f64 = 2.*(iterations as f64 + 1.); + if (norm-reference_norm).abs() > epsilon { + println!("ERROR: L1 norm = {} Reference L1 norm = {}", norm, reference_norm); + return; + } else { + println!("Solution validates"); + if cfg!(VERBOSE) { + println!("L1 norm = {} Reference L1 norm = {}", norm, reference_norm); + } + let flops : usize = (2*stencil_size+1) * active_points; + let avgtime : f64 = (stencil_time as f64) / (iterations as f64); + println!("Rate (MFlops/s): {:10.3} Avg time (s): {:10.3}", (1.0e-6_f64) * (flops as f64) / avgtime, avgtime); + } + +} diff --git a/RUST/transpose/Cargo.toml b/RUST/transpose/Cargo.toml new file mode 100644 index 000000000..3f634d3c5 --- /dev/null +++ b/RUST/transpose/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "transpose" +version = "0.1.0" +authors = ["Jeff Hammond "] + +[dependencies] diff --git a/RUST/transpose/src/main.rs b/RUST/transpose/src/main.rs new file mode 100644 index 000000000..5d1ba1e87 --- /dev/null +++ b/RUST/transpose/src/main.rs @@ -0,0 +1,190 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. 
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +use std::env; +use std::mem; +use std::time::{Instant,Duration}; + +fn help() { + println!("Usage: <# iterations> [tile size]"); +} + +fn main() +{ + println!("Parallel Research Kernels"); + println!("Rust Matrix transpose: B = A^T"); + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + let args : Vec = env::args().collect(); + + let iterations : u32; + let order : usize; + let tilesize : usize; + + match args.len() { + 3 => { + iterations = match args[1].parse() { + Ok(n) => { n }, + Err(_) => { help(); return; }, + }; + order = match args[2].parse() { + Ok(n) => { n }, + Err(_) => { help(); return; }, + }; + tilesize = 32; + }, + 4 => { + iterations = match args[1].parse() { + Ok(n) => { n }, + Err(_) => { help(); return; }, + }; + order = match args[2].parse() { + Ok(n) => { n }, + Err(_) => { help(); return; }, + }; + tilesize = match args[3].parse() { + Ok(n) => { n }, + Err(_) => { help(); return; }, + }; + }, + _ => { + help(); + return; + } + } + + if iterations < 1 { + println!("ERROR: iterations must be >= 1"); + } + if tilesize > order { + println!("ERROR: tilesize cannot be > order"); + } + + println!("Matrix order = {}", order); + if tilesize < order { + println!("Tile size = {}", tilesize); + } else { + println!("Untiled"); + } + println!("Number of iterations = {}", iterations); + + ////////////////////////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + let nelems : usize = order*order; + let mut a : Vec = vec![0.0; nelems]; + let mut b : Vec = vec![0.0; nelems]; + + for i in 0..order { + for j in 0..order { + a[i*order+j] = (i*order+j) as f64; + } + } + + let timer = Instant::now(); + let mut t0 : Duration = 
timer.elapsed(); + + for k in 0..iterations+1 { + + if k == 1 { t0 = timer.elapsed(); } + + for i in 0..order { + for j in 0..order { + b[j*order+i] += a[i*order+j]; + a[i*order+j] += 1.0; + } + } + + } + let t1 = timer.elapsed(); + let dt = (t1.checked_sub(t0)).unwrap(); + let dtt : u64 = dt.as_secs() * 1_000_000_000 + dt.subsec_nanos() as u64; + let transpose_time : f64 = dtt as f64 * 1.0e-9; + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + let addit : usize = ((iterations as usize + 1) * (iterations as usize)) / 2; + let mut abserr : f64 = 0.0; + for i in 0..order { + for j in 0..order { + let ij = i*order+j; + let ji = j*order+i; + let reference : f64 = (ij*(iterations as usize + 1)+addit) as f64; + abserr += (b[ji] - reference).abs(); + } + } + + if cfg!(VERBOSE) { + println!("Sum of absolute differences: {:30.15}", abserr); + } + + let epsilon : f64 = 1.0e-8; + if abserr < epsilon { + println!("Solution validates"); + let avgtime : f64 = (transpose_time as f64) / (iterations as f64); + let bytes : usize = 2 * nelems * mem::size_of::(); + println!("Rate (MB/s): {:10.3} Avg time (s): {:10.3}", (1.0e0-6_f64) * (bytes as f64) / avgtime, avgtime); + } else { + println!("ERROR: Aggregate squared error {:30.15} exceeds threshold {:30.15}", abserr, epsilon); + return; + } +} + + diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 2820eeff8..4a8ea2230 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -100,11 +100,10 @@ case "$PRK_TARGET" in echo "Rust" which rustc rustc --version - make $PRK_TARGET export PRK_TARGET_PATH=RUST - ./$PRK_TARGET_PATH/p2p 10 100 100 - ./$PRK_TARGET_PATH/stencil 10 100 - ./$PRK_TARGET_PATH/transpose 10 100 + cd $TRAVIS_HOME/$PRK_TARGET_PATH/p2p && cargo run 10 100 100 + cd $TRAVIS_HOME/$PRK_TARGET_PATH/stencil && cargo run 10 100 + cd 
$TRAVIS_HOME/$PRK_TARGET_PATH/transpose && cargo run 10 100 ;; allc1z) echo "C1z" From 159156ecd145348c8e517eeaa162f05ab8d06e50 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 4 Jun 2018 23:49:11 -0500 Subject: [PATCH 105/245] fix whitespace in RAJA view stencil code gen --- Cxx11/generate-cxx-stencil.py | 6 +++--- Cxx11/stencil_rajaview.hpp | 6 +++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index e2ec18d37..286d0dfb3 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -160,9 +160,9 @@ def main(): if (model=='target'): src.write('#define RESTRICT __restrict__\n\n') if (model=='rajaview'): - src.write('using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,') - src.write(' RAJA::statement::For<1, RAJA::simd_exec,') - src.write(' RAJA::statement::Lambda<0> > > >;') + src.write('using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec,\n') + src.write(' RAJA::statement::For<1, RAJA::simd_exec,\n') + src.write(' RAJA::statement::Lambda<0> > > >;\n\n') # src.write('OMP( declare target )\n\n') for pattern in ['star','grid']: for r in range(1,6): diff --git a/Cxx11/stencil_rajaview.hpp b/Cxx11/stencil_rajaview.hpp index 0e303773f..4a521770f 100644 --- a/Cxx11/stencil_rajaview.hpp +++ b/Cxx11/stencil_rajaview.hpp @@ -1,4 +1,8 @@ -using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec, RAJA::statement::For<1, RAJA::simd_exec, RAJA::statement::Lambda<0> > > >;void star1(const int n, const int t, matrix & in, matrix & out) { +using regular_policy = RAJA::KernelPolicy< RAJA::statement::For<0, thread_exec, + RAJA::statement::For<1, RAJA::simd_exec, + RAJA::statement::Lambda<0> > > >; + +void star1(const int n, const int t, matrix & in, matrix & out) { RAJA::RangeSegment inner1(1,n-1); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { From 
07a8b413e2cab45a1fd129bc4166750c6c1d1ddd Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 5 Jun 2018 00:09:54 -0500 Subject: [PATCH 106/245] fix mistype bug in preprocess logic for Boost.Ranges --- Cxx11/prk_ranges.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/prk_ranges.h b/Cxx11/prk_ranges.h index d794016ff..9eb081844 100644 --- a/Cxx11/prk_ranges.h +++ b/Cxx11/prk_ranges.h @@ -33,7 +33,7 @@ #define PRK_RANGES_H #if defined(USE_RANGES) -# if defined(USE_RANGES_IRANGE) +# if defined(USE_BOOST_IRANGE) # include "boost/range/irange.hpp" # elif defined(USE_RANGES_TS) # include "range/v3/view/iota.hpp" From 56cec34a5825a11136983a472ff7b263c48dfe74 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 5 Jun 2018 14:53:13 -0500 Subject: [PATCH 107/245] fix Travis XFAIL MPI1, Python, Charm++, AMPI (#360) * fix copy+paste error * conditional import of timer for Python 2.7 * try Charm++ autoprovision * add numpy version and use dot instead of matmul --- .travis.yml | 3 +-- PYTHON/dgemm-numpy.py | 11 ++++++++--- PYTHON/dgemm.py | 7 +++++-- PYTHON/nstream-numpy.py | 8 ++++++-- PYTHON/nstream.py | 7 +++++-- PYTHON/p2p-numba.py | 7 ++++++- PYTHON/p2p-numpy.py | 8 ++++++-- PYTHON/p2p.py | 7 +++++-- PYTHON/sparse-numpy.py | 8 ++++++-- PYTHON/sparse-scipy.py | 7 +++++-- PYTHON/sparse.py | 7 +++++-- PYTHON/stencil-numba.py | 8 ++++++-- PYTHON/stencil-numpy.py | 8 ++++++-- PYTHON/stencil.py | 7 +++++-- PYTHON/transpose-numpy.py | 8 ++++++-- PYTHON/transpose.py | 7 +++++-- travis/build-run-prk.sh | 14 +++++++++++--- 17 files changed, 97 insertions(+), 35 deletions(-) diff --git a/.travis.yml b/.travis.yml index b79ed39d5..ac0a9f07c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -145,11 +145,10 @@ matrix: - os: linux env: PRK_TARGET=allfgmpi - os: linux + compiler: clang env: PRK_TARGET=allmpi - os: linux env: PRK_TARGET=allcharm++ - # Sadly, Python is XFAIL because Travis CI's Python 3.4.3 can't find cannot process_time. 
- - env: PRK_TARGET=allpython addons: apt: sources: diff --git a/PYTHON/dgemm-numpy.py b/PYTHON/dgemm-numpy.py index da5bcd197..db6bd9f92 100755 --- a/PYTHON/dgemm-numpy.py +++ b/PYTHON/dgemm-numpy.py @@ -51,9 +51,13 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer import numpy +print('Numpy version = ', numpy.version.version) def main(): @@ -91,7 +95,8 @@ def main(): if k<1: t0 = timer() - C += numpy.matmul(A,B) + #C += numpy.matmul(A,B) # requires Numpy 1.10 or later + C += numpy.dot(A,B) t1 = timer() dgemm_time = t1 - t0 diff --git a/PYTHON/dgemm.py b/PYTHON/dgemm.py index 0aff9405f..9347d494a 100755 --- a/PYTHON/dgemm.py +++ b/PYTHON/dgemm.py @@ -51,8 +51,11 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer def main(): diff --git a/PYTHON/nstream-numpy.py b/PYTHON/nstream-numpy.py index 593d4fae7..fd0808993 100755 --- a/PYTHON/nstream-numpy.py +++ b/PYTHON/nstream-numpy.py @@ -64,9 +64,13 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer import numpy +print('Numpy version = ', 
numpy.version.version) def main(): diff --git a/PYTHON/nstream.py b/PYTHON/nstream.py index b79d979c9..070ec4647 100755 --- a/PYTHON/nstream.py +++ b/PYTHON/nstream.py @@ -64,8 +64,11 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer def main(): diff --git a/PYTHON/p2p-numba.py b/PYTHON/p2p-numba.py index 8d2922c87..c54fe742c 100755 --- a/PYTHON/p2p-numba.py +++ b/PYTHON/p2p-numba.py @@ -52,8 +52,13 @@ # ******************************************************************* import sys -from timeit import default_timer as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer import numpy +print('Numpy version = ', numpy.version.version) import numba @jit diff --git a/PYTHON/p2p-numpy.py b/PYTHON/p2p-numpy.py index 61818c492..9e16d65eb 100755 --- a/PYTHON/p2p-numpy.py +++ b/PYTHON/p2p-numpy.py @@ -52,9 +52,13 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer import numpy +print('Numpy version = ', numpy.version.version) def main(): diff --git a/PYTHON/p2p.py b/PYTHON/p2p.py index e5d605e56..6e724f4f9 100755 --- a/PYTHON/p2p.py +++ b/PYTHON/p2p.py @@ -52,8 +52,11 @@ # ******************************************************************* import sys -#from timeit 
import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer def main(): diff --git a/PYTHON/sparse-numpy.py b/PYTHON/sparse-numpy.py index 261afaa64..b62cdfe52 100755 --- a/PYTHON/sparse-numpy.py +++ b/PYTHON/sparse-numpy.py @@ -54,9 +54,13 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer import numpy +print('Numpy version = ', numpy.version.version) def offset(i,j,lsize): return i+(j<= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer import numpy import scipy diff --git a/PYTHON/sparse.py b/PYTHON/sparse.py index 511f13462..a36e5dc6b 100755 --- a/PYTHON/sparse.py +++ b/PYTHON/sparse.py @@ -54,8 +54,11 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer def offset(i,j,lsize): return i+(j<= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer from numba import jit import numpy +print('Numpy version = ', numpy.version.version) @jit def grid(n,r,W,A,B): diff --git a/PYTHON/stencil-numpy.py b/PYTHON/stencil-numpy.py index b152e1c81..29ac31f29 100755 --- a/PYTHON/stencil-numpy.py +++ 
b/PYTHON/stencil-numpy.py @@ -56,9 +56,13 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer import numpy +print('Numpy version = ', numpy.version.version) def main(): diff --git a/PYTHON/stencil.py b/PYTHON/stencil.py index 9c8b066d0..618425435 100755 --- a/PYTHON/stencil.py +++ b/PYTHON/stencil.py @@ -56,8 +56,11 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer def main(): diff --git a/PYTHON/transpose-numpy.py b/PYTHON/transpose-numpy.py index 58ee58197..a70c4e741 100755 --- a/PYTHON/transpose-numpy.py +++ b/PYTHON/transpose-numpy.py @@ -50,9 +50,13 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer import numpy +print('Numpy version = ', numpy.version.version) def main(): diff --git a/PYTHON/transpose.py b/PYTHON/transpose.py index a67d365b2..43338aaf9 100755 --- a/PYTHON/transpose.py +++ b/PYTHON/transpose.py @@ -50,8 +50,11 @@ # ******************************************************************* import sys -#from timeit import default_timer as timer -from time import process_time as 
timer +print('Python version = ', str(sys.version_info.major)+'.'+str(sys.version_info.minor)) +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer def main(): diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 4a8ea2230..cf197e44e 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -916,7 +916,7 @@ case "$PRK_TARGET" in make allmpishm export PRK_TARGET_PATH=MPISHM export PRK_MPI_PROCS=4 - export PRK_RUN="$PRK_RUN -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-}" + export PRK_RUN="$PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-}" export PRK_MPISHM_RANKS=$(($PRK_MPI_PROCS/2)) $PRK_RUN $PRK_TARGET_PATH/Synch_p2p/p2p 10 1024 1024 $PRK_RUN $PRK_TARGET_PATH/Stencil/stencil $PRK_MPISHM_RANKS 10 1000 @@ -1015,7 +1015,11 @@ case "$PRK_TARGET" in export PRK_TARGET_PATH=CHARM++ export PRK_CHARM_PROCS=4 export PRK_LAUNCHER=$CHARM_ROOT/bin/charmrun - export PRK_LAUNCHER_ARGS="+p$PRK_CHARM_PROCS ++local" + if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then + export PRK_LAUNCHER_ARGS="+autoProvision +isomalloc_sync" + else + export PRK_LAUNCHER_ARGS="+p$PRK_CHARM_PROCS ++local" + fi # For Charm++, the last argument is the overdecomposition factor --> \|/ $PRK_LAUNCHER $PRK_TARGET_PATH/Synch_p2p/p2p $PRK_LAUNCHER_ARGS 10 1024 1024 1 $PRK_LAUNCHER $PRK_TARGET_PATH/Stencil/stencil $PRK_LAUNCHER_ARGS 10 1000 1 @@ -1039,7 +1043,11 @@ case "$PRK_TARGET" in export PRK_TARGET_PATH=AMPI export PRK_CHARM_PROCS=4 export PRK_LAUNCHER=$CHARM_ROOT/bin/charmrun - export PRK_LAUNCHER_ARGS="+p$PRK_CHARM_PROCS +vp$PRK_CHARM_PROCS +isomalloc_sync ++local" + if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then + export PRK_LAUNCHER_ARGS="+autoProvision +isomalloc_sync" + else + export PRK_LAUNCHER_ARGS="+p$PRK_CHARM_PROCS +vp$PRK_CHARM_PROCS +isomalloc_sync ++local" + fi export PRK_LOAD_BALANCER_ARGS="+balancer RefineLB" $PRK_LAUNCHER $PRK_TARGET_PATH/Synch_p2p/p2p $PRK_LAUNCHER_ARGS 10 1024 1024 $PRK_LAUNCHER 
$PRK_TARGET_PATH/Stencil/stencil $PRK_LAUNCHER_ARGS 10 1000 From 4ffac0bcbae093e0882b2a252f8a033924cba4b6 Mon Sep 17 00:00:00 2001 From: Ronak Buch Date: Mon, 18 Jun 2018 15:53:48 -0500 Subject: [PATCH 108/245] Fix spacing in Charm++ Stencil Makefile (#362) --- CHARM++/Stencil/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHARM++/Stencil/Makefile b/CHARM++/Stencil/Makefile index c936aec7b..9fbde017d 100644 --- a/CHARM++/Stencil/Makefile +++ b/CHARM++/Stencil/Makefile @@ -64,7 +64,7 @@ RESTRICT_KEYWORD=0/1 disable/enable restrict keyword (aliasing) [0] \n\ STAR=0/1 box/star shaped stencil [1] \n\ VERBOSE=0/1 omit/include verbose run information [0]" -TUNEFLAGS = $(RESTRICTFLAG) $(VERBOSEFLAG)$(USERFLAGS) $(LOOPGENFLAG)\ +TUNEFLAGS = $(RESTRICTFLAG) $(VERBOSEFLAG) $(USERFLAGS) $(LOOPGENFLAG)\ $(DOUBLEFLAG) $(RADIUSFLAG) $(STARFLAG) PROGRAM = stencil OBJS = $(PROGRAM).o $(COMOBJS) From a13c1f7f01d903444094b7f6aa78dc61e193d42c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 19 Jul 2018 21:45:08 -0700 Subject: [PATCH 109/245] format fix --- Cxx11/transpose-vector-pstl.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc index e94172bd6..ac7aefb8a 100644 --- a/Cxx11/transpose-vector-pstl.cc +++ b/Cxx11/transpose-vector-pstl.cc @@ -58,9 +58,9 @@ int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; #if defined(USE_PSTL) - std::cout << "C++17 Parallel STL Matrix transpose: B = A^T" << std::endl; + std::cout << "C++17/PSTL Matrix transpose: B = A^T" << std::endl; #else - std::cout << "C++11 STL Matrix transpose: B = A^T" << std::endl; + std::cout << "C++11/STL Matrix transpose: B = A^T" << std::endl; #endif ////////////////////////////////////////////////////////////////////// @@ -93,8 +93,8 @@ int main(int argc, char * argv[]) return 1; } - std::cout << "Number of iterations = " << iterations 
<< std::endl; - std::cout << "Matrix order = " << order << std::endl; + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; ////////////////////////////////////////////////////////////////////// /// Allocate space for the input and transpose matrix From c3c4fca280751d679c8d8b3ec122e3892707113f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 19 Jul 2018 22:05:13 -0700 Subject: [PATCH 110/245] format fix --- Cxx11/transpose-vector-raja.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Cxx11/transpose-vector-raja.cc b/Cxx11/transpose-vector-raja.cc index d40cefc65..621068d43 100644 --- a/Cxx11/transpose-vector-raja.cc +++ b/Cxx11/transpose-vector-raja.cc @@ -260,14 +260,14 @@ int main(int argc, char * argv[]) if (use_for=="tbb") for_name = "TBB (static)"; if (use_for=="tbbdyn") for_name = "TBB (dynamic)"; - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Matrix order = " << order << std::endl; - std::cout << "Tile size = " << tile_size << "(compile-time constant, unlike other impls)" << std::endl; - std::cout << "RAJA threading = " << for_name << std::endl; - std::cout << "RAJA forallN = " << (use_nested ? "yes" : "no") << std::endl; - std::cout << "RAJA use tiling = " << (use_tiled ? "yes" : "no") << std::endl; - std::cout << "RAJA use permute = " << use_permute << std::endl; - std::cout << "RAJA use simd = " << (use_simd ? "yes" : "no") << std::endl; + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Tile size = " << tile_size << "(compile-time constant, unlike other impls)" << std::endl; + std::cout << "RAJA threading = " << for_name << std::endl; + std::cout << "RAJA forallN = " << (use_nested ? "yes" : "no") << std::endl; + std::cout << "RAJA use tiling = " << (use_tiled ? 
"yes" : "no") << std::endl; + std::cout << "RAJA use permute = " << use_permute << std::endl; + std::cout << "RAJA use simd = " << (use_simd ? "yes" : "no") << std::endl; ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation From b6f76ef80f93d75b40e19e7af1debe25108fffe0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 21 Jul 2018 21:42:34 -0700 Subject: [PATCH 111/245] format fix --- Cxx11/nstream-vector-pstl.cc | 4 ++-- Cxx11/p2p-hyperplane-vector-pstl.cc | 4 ++-- Cxx11/stencil-vector-pstl.cc | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc index 0bab633b0..21b5e0b45 100644 --- a/Cxx11/nstream-vector-pstl.cc +++ b/Cxx11/nstream-vector-pstl.cc @@ -71,9 +71,9 @@ int main(int argc, char * argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; #if defined(USE_PSTL) - std::cout << "C++17 Parallel STL STREAM triad: A = B + scalar * C" << std::endl; + std::cout << "C++17/PSTL STREAM triad: A = B + scalar * C" << std::endl; #else - std::cout << "C++11 STL STREAM triad: A = B + scalar * C" << std::endl; + std::cout << "C++11/STL STREAM triad: A = B + scalar * C" << std::endl; #endif ////////////////////////////////////////////////////////////////////// diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc index 132b26a45..c64757e6d 100644 --- a/Cxx11/p2p-hyperplane-vector-pstl.cc +++ b/Cxx11/p2p-hyperplane-vector-pstl.cc @@ -67,9 +67,9 @@ int main(int argc, char* argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; #if defined(USE_PSTL) - std::cout << "C++17 PSTL HYPERPLANE pipeline execution on 2D grid" << std::endl; + std::cout << "C++17/PSTL HYPERPLANE pipeline execution on 2D grid" << std::endl; #else - std::cout << "C++11 STL HYPERPLANE pipeline execution on 2D grid" << std::endl; + std::cout << "C++11/STL HYPERPLANE 
pipeline execution on 2D grid" << std::endl; #endif ////////////////////////////////////////////////////////////////////// diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc index a328b1420..ca3c83ec0 100644 --- a/Cxx11/stencil-vector-pstl.cc +++ b/Cxx11/stencil-vector-pstl.cc @@ -86,7 +86,7 @@ int main(int argc, char* argv[]) { std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; #if defined(USE_PSTL) - std::cout << "C++17/Parallel STL Stencil execution on 2D grid" << std::endl; + std::cout << "C++17/PSTL Stencil execution on 2D grid" << std::endl; #else std::cout << "C++11/STL Stencil execution on 2D grid" << std::endl; #endif From 46741c0fc6f271bcef61c91505432eea8d864c60 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 22 Jul 2018 15:54:15 -0700 Subject: [PATCH 112/245] remove STL usage from OpenMP codes (#363) --- .gitignore | 5 ++ Cxx11/Makefile | 12 ++--- Cxx11/generate-cxx-stencil.py | 2 +- ...eam-vector-openmp.cc => nstream-openmp.cc} | 6 +-- ...ector-openmp.cc => p2p-doacross-openmp.cc} | 2 +- ...tor-openmp.cc => p2p-hyperplane-openmp.cc} | 2 +- ...cil-vector-openmp.cc => stencil-openmp.cc} | 9 ++-- Cxx11/stencil_openmp.hpp | 20 ++++---- ...e-vector-openmp.cc => transpose-openmp.cc} | 4 +- travis/build-run-prk.sh | 48 +++++++++---------- 10 files changed, 57 insertions(+), 53 deletions(-) rename Cxx11/{nstream-vector-openmp.cc => nstream-openmp.cc} (97%) rename Cxx11/{p2p-doacross-vector-openmp.cc => p2p-doacross-openmp.cc} (99%) rename Cxx11/{p2p-hyperplane-vector-openmp.cc => p2p-hyperplane-openmp.cc} (99%) rename Cxx11/{stencil-vector-openmp.cc => stencil-openmp.cc} (97%) rename Cxx11/{transpose-vector-openmp.cc => transpose-openmp.cc} (98%) diff --git a/.gitignore b/.gitignore index b6b88b3b2..d4a60c93f 100644 --- a/.gitignore +++ b/.gitignore @@ -254,3 +254,8 @@ FORTRAN/transpose-ornlacc RUST/p2p/Cargo.lock RUST/stencil/Cargo.lock RUST/transpose/Cargo.lock +nstream-openmp +p2p-doacross-openmp 
+p2p-hyperplane-openmp +stencil-openmp +transpose-openmp diff --git a/Cxx11/Makefile b/Cxx11/Makefile index d1223b894..54873e41d 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -68,20 +68,20 @@ endif all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA) -p2p: p2p-vector p2p-doacross-vector-openmp p2p-hyperplane-vector-openmp p2p-tasks-openmp p2p-openmp-target \ +p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc p2p-tasks-tbb -stencil: stencil-valarray stencil-vector stencil-vector-async stencil-vector-openmp stencil-openmp-target \ +stencil: stencil-valarray stencil-vector stencil-vector-async stencil-openmp stencil-openmp-target \ stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \ stencil-vector-rangefor stencil-vector-tbb stencil-vector-thread stencil-kokkos stencil-opencl \ stencil-cuda -transpose: transpose-valarray transpose-vector transpose-vector-async transpose-vector-openmp transpose-openmp-target \ +transpose: transpose-valarray transpose-vector transpose-vector-async transpose-openmp transpose-openmp-target \ transpose-vector-taskloop transpose-vector-stl transpose-vector-pstl transpose-vector-raja \ transpose-vector-rangefor transpose-vector-tbb transpose-vector-thread transpose-kokkos transpose-opencl -nstream: nstream-valarray nstream-vector nstream-vector-openmp nstream-openmp-target \ +nstream: nstream-valarray nstream-vector nstream-openmp nstream-openmp-target \ nstream-vector-taskloop nstream-vector-stl nstream-vector-pstl nstream-vector-raja \ nstream-vector-rangefor nstream-vector-tbb nstream-kokkos nstream-opencl @@ -92,7 +92,7 @@ vector: p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream valarray: transpose-valarray nstream-valarray 
-openmp: p2p-hyperplane-vector-openmp p2p-tasks-openmp stencil-vector-openmp transpose-vector-openmp nstream-vector-openmp +openmp: p2p-hyperplane-openmp p2p-tasks-openmp stencil-openmp transpose-openmp nstream-openmp target: stencil-openmp-target transpose-openmp-target nstream-openmp-target @@ -135,7 +135,7 @@ boost-compute: nstream-vector-boost-compute # busted #nstream-valarray-boost-compute -p2p-hyperplane-vector: p2p-hyperplane-vector-openmp.cc prk_util.h +p2p-hyperplane-vector: p2p-hyperplane-openmp.cc prk_util.h $(CXX) $(CXXFLAGS) $< -o $@ transpose-opencl: transpose-opencl.cc transpose.cl prk_util.h prk_opencl.h diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index 286d0dfb3..b6d7a2d72 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -7,7 +7,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): if (model=='openmp'): - src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') + src.write('void '+pattern+str(radius)+'(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) {\n') src.write(' OMP_FOR(collapse(2))\n') src.write(' for (auto it='+str(radius)+'; it A(length); - std::vector B(length); - std::vector C(length); + double * RESTRICT A = new double[length]; + double * RESTRICT B = new double[length]; + double * RESTRICT C = new double[length]; double scalar = 3.0; diff --git a/Cxx11/p2p-doacross-vector-openmp.cc b/Cxx11/p2p-doacross-openmp.cc similarity index 99% rename from Cxx11/p2p-doacross-vector-openmp.cc rename to Cxx11/p2p-doacross-openmp.cc index 37b9802f0..4b10f9fbe 100644 --- a/Cxx11/p2p-doacross-vector-openmp.cc +++ b/Cxx11/p2p-doacross-openmp.cc @@ -125,7 +125,7 @@ int main(int argc, char* argv[]) auto pipeline_time = 0.0; // silence compiler warning - std::vector grid(m*n);; + double * RESTRICT grid = new double[m*n]; OMP_PARALLEL() { diff --git a/Cxx11/p2p-hyperplane-vector-openmp.cc 
b/Cxx11/p2p-hyperplane-openmp.cc similarity index 99% rename from Cxx11/p2p-hyperplane-vector-openmp.cc rename to Cxx11/p2p-hyperplane-openmp.cc index 471ce336c..4a49584c9 100644 --- a/Cxx11/p2p-hyperplane-vector-openmp.cc +++ b/Cxx11/p2p-hyperplane-openmp.cc @@ -127,7 +127,7 @@ int main(int argc, char* argv[]) auto pipeline_time = 0.0; // silence compiler warning - double * grid = new double[n*n]; + double * RESTRICT grid = new double[n*n]; OMP_PARALLEL() { diff --git a/Cxx11/stencil-vector-openmp.cc b/Cxx11/stencil-openmp.cc similarity index 97% rename from Cxx11/stencil-vector-openmp.cc rename to Cxx11/stencil-openmp.cc index 5f5e59f42..14d299c58 100644 --- a/Cxx11/stencil-vector-openmp.cc +++ b/Cxx11/stencil-openmp.cc @@ -67,13 +67,13 @@ #include "stencil_seq.hpp" #endif -void nothing(const int n, const int t, std::vector & in, std::vector & out) +void nothing(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { std::cout << "You are trying to use a stencil that does not exist.\n"; std::cout << "Please generate the new stencil using the code generator\n"; std::cout << "and add it to the case-switch in the driver." << std::endl; // n will never be zero - this is to silence compiler warnings. 
- if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl; + if (n==0 || t==0) std::cout << in << out << std::endl; std::abort(); } @@ -175,8 +175,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in(n*n); - std::vector out(n*n); + double * RESTRICT in = new double[n*n]; + double * RESTRICT out = new double[n*n]; OMP_PARALLEL() { @@ -227,7 +227,6 @@ int main(int argc, char* argv[]) // interior of grid with respect to stencil size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); - // compute L1 norm in parallel double norm = 0.0; OMP_PARALLEL_FOR_REDUCE( +:norm ) diff --git a/Cxx11/stencil_openmp.hpp b/Cxx11/stencil_openmp.hpp index 42edf4570..2a42b437c 100644 --- a/Cxx11/stencil_openmp.hpp +++ b/Cxx11/stencil_openmp.hpp @@ -1,4 +1,4 @@ -void star1(const int n, const int t, std::vector & in, std::vector & out) { +void star1(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { OMP_FOR(collapse(2)) for (auto it=1; it & in, std::vector & in, std::vector & out) { +void star2(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { OMP_FOR(collapse(2)) for (auto it=2; it & in, std::vector & in, std::vector & out) { +void star3(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { OMP_FOR(collapse(2)) for (auto it=3; it & in, std::vector & in, std::vector & out) { +void star4(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { OMP_FOR(collapse(2)) for (auto it=4; it & in, std::vector & in, std::vector & out) { +void star5(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { OMP_FOR(collapse(2)) for (auto it=5; it & in, std::vector & in, std::vector & out) { +void grid1(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { OMP_FOR(collapse(2)) for (auto it=1; it & in, std::vector & in, std::vector & out) { +void grid2(const int n, const int t, const double * RESTRICT 
in, double * RESTRICT out) { OMP_FOR(collapse(2)) for (auto it=2; it & in, std::vector & in, std::vector & out) { +void grid3(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { OMP_FOR(collapse(2)) for (auto it=3; it & in, std::vector & in, std::vector & out) { +void grid4(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { OMP_FOR(collapse(2)) for (auto it=4; it & in, std::vector & in, std::vector & out) { +void grid5(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { OMP_FOR(collapse(2)) for (auto it=5; it A(order*order); - std::vector B(order*order); + double * RESTRICT A = new double[order*order]; + double * RESTRICT B = new double[order*order]; OMP_PARALLEL() { diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index cf197e44e..e4b25c3cb 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -359,18 +359,18 @@ case "$PRK_TARGET" in gcc) # Host echo "OPENMPFLAG=-fopenmp" >> common/make.defs - make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-vector-openmp stencil-vector-openmp \ - transpose-vector-openmp nstream-vector-openmp + make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \ + transpose-openmp nstream-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 - $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp 10 1024 - $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp 10 1024 64 - $PRK_TARGET_PATH/stencil-vector-openmp 10 1000 - $PRK_TARGET_PATH/transpose-vector-openmp 10 1024 32 - $PRK_TARGET_PATH/nstream-vector-openmp 10 16777216 32 + $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 64 + $PRK_TARGET_PATH/stencil-openmp 10 1000 + $PRK_TARGET_PATH/transpose-openmp 10 1024 32 + $PRK_TARGET_PATH/nstream-openmp 10 16777216 32 #echo "Test stencil code generator" for s in star grid ; do for r in 1 2 3 4 5 ; do - $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r + 
$PRK_TARGET_PATH/stencil-openmp 10 200 20 $s $r done done # Offload @@ -381,7 +381,7 @@ case "$PRK_TARGET" in #echo "Test stencil code generator" for s in star grid ; do for r in 1 2 3 4 5 ; do - $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r + $PRK_TARGET_PATH/stencil-openmp 10 200 20 $s $r done done # ORNL-ACC @@ -394,18 +394,18 @@ case "$PRK_TARGET" in if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then # Host echo "OPENMPFLAG=-fopenmp" >> common/make.defs - make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-vector-openmp stencil-vector-openmp \ - transpose-vector-openmp nstream-vector-openmp + make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \ + transpose-openmp nstream-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 - $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp 10 1024 - $PRK_TARGET_PATH/p2p-hyperplane-vector-openmp 10 1024 64 - $PRK_TARGET_PATH/stencil-vector-openmp 10 1000 - $PRK_TARGET_PATH/transpose-vector-openmp 10 1024 32 - $PRK_TARGET_PATH/nstream-vector-openmp 10 16777216 32 + $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 64 + $PRK_TARGET_PATH/stencil-openmp 10 1000 + $PRK_TARGET_PATH/transpose-openmp 10 1024 32 + $PRK_TARGET_PATH/nstream-openmp 10 16777216 32 #echo "Test stencil code generator" for s in star grid ; do for r in 1 2 3 4 5 ; do - $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r + $PRK_TARGET_PATH/stencil-openmp 10 200 20 $s $r done done # Offload @@ -416,7 +416,7 @@ case "$PRK_TARGET" in ##echo "Test stencil code generator" #for s in star grid ; do # for r in 1 2 3 4 5 ; do - # $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r + # $PRK_TARGET_PATH/stencil-openmp 10 200 20 $s $r # done #done else @@ -426,17 +426,17 @@ case "$PRK_TARGET" in icc) # Host echo "OPENMPFLAG=-qopenmp" >> common/make.defs - make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-vector-openmp \ - transpose-vector-openmp 
nstream-vector-openmp + make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp \ + transpose-openmp nstream-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 $PRK_TARGET_PATH/p2p-innerloop-openmp 10 1024 1024 - $PRK_TARGET_PATH/stencil-vector-openmp 10 1000 - $PRK_TARGET_PATH/transpose-vector-openmp 10 1024 32 - $PRK_TARGET_PATH/nstream-vector-openmp 10 16777216 32 + $PRK_TARGET_PATH/stencil-openmp 10 1000 + $PRK_TARGET_PATH/transpose-openmp 10 1024 32 + $PRK_TARGET_PATH/nstream-openmp 10 16777216 32 #echo "Test stencil code generator" for s in star grid ; do for r in 1 2 3 4 5 ; do - $PRK_TARGET_PATH/stencil-vector-openmp 10 200 20 $s $r + $PRK_TARGET_PATH/stencil-openmp 10 200 20 $s $r done done # Offload - not supported on MacOS From f1968f1eb8156335dbdc5d8111fe0fdb89750053 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 14 Sep 2018 16:19:03 -0700 Subject: [PATCH 113/245] improve stencil code generator (#364) * improve stencil code generator * try C++17 --- Cxx11/generate-cxx-stencil.py | 112 ++++--- Cxx11/stencil_cuda.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_kokkos.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_openmp.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_pgnu.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_pstl.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_raja.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_rajaview.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_rangefor.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_seq.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_stl.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_target.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_taskloop.hpp | 570 +++++++++++++++++----------------- Cxx11/stencil_tbb.hpp | 570 +++++++++++++++++----------------- travis/build-run-prk.sh | 4 +- 15 files changed, 3781 insertions(+), 3745 deletions(-) diff 
--git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index b6d7a2d72..18d826acd 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -5,6 +5,39 @@ import string import os +def bodygen(src,pattern,stencil_size,radius,W,model): + if (model=='kokkos' or model=='rajaview'): + src.write(' out(i,j) += ') + else: + src.write(' out[i*n+j] += ') + k = 0 + kmax = stencil_size-1; + for j in range(0,2*radius+1): + if (j-radius)<0: + jr=str(j-radius) + elif (j-radius)==0: + jr='' + else: + jr='+'+str(j-radius) + + for i in range(0,2*radius+1): + if (i-radius)<0: + ir=str(i-radius) + elif (i-radius)==0: + ir='' + else: + ir='+'+str(i-radius) + + if ( W[j][i] != 0.0): + k+=1 + if (model=='kokkos' or model=='rajaview'): + src.write('+in(i'+ir+',j'+jr+') * '+str(W[j][i])) + else: + src.write('+in[(i'+ir+')*n+(j'+jr+')] * '+str(W[j][i])) + if (k0 and k & in, std::vector & out, const int gs) {\n') src.write(' OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )\n') @@ -22,33 +60,55 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' for (auto i=it; i & in, std::vector & out) {\n') src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' for (auto i : inside) {\n') src.write(' PRAGMA_SIMD\n') src.write(' for (auto j : inside) {\n') + bodygen(src,pattern,stencil_size,radius,W,model) + src.write(' }\n') + src.write(' }\n') + src.write(' }\n') + src.write(' }\n') elif (model=='stl'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' std::for_each( std::begin(inside), std::end(inside), [&] (int i) {\n') #src.write(' PRAGMA_SIMD\n') src.write(' std::for_each( std::begin(inside), std::end(inside), [&] (int j) {\n') + bodygen(src,pattern,stencil_size,radius,W,model) + src.write(' });\n') + src.write(' });\n') elif (model=='pgnu'): 
src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' __gnu_parallel::for_each( std::begin(inside), std::end(inside), [&] (int i) {\n') src.write(' std::for_each( std::begin(inside), std::end(inside), [&] (int j) {\n') + bodygen(src,pattern,stencil_size,radius,W,model) + src.write(' });\n') + src.write(' });\n') elif (model=='pstl'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' std::for_each( exec::par, std::begin(inside), std::end(inside), [&] (int i) {\n') src.write(' std::for_each( exec::unseq, std::begin(inside), std::end(inside), [&] (int j) {\n') + bodygen(src,pattern,stencil_size,radius,W,model) + src.write(' });\n') + src.write(' });\n') elif (model=='raja'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') #src.write(' RAJA::forallN>>\n') @@ -57,11 +117,16 @@ def codegen(src,pattern,stencil_size,radius,W,model): #src.write(' [&](RAJA::Index_type i, RAJA::Index_type j) {\n') src.write(' RAJA::forall(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type i) {\n') src.write(' RAJA::forall(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type j) {\n') + bodygen(src,pattern,stencil_size,radius,W,model) + src.write(' });\n') + src.write(' });\n') elif (model=='rajaview'): src.write('void '+pattern+str(radius)+'(const int n, const int t, matrix & in, matrix & out) {\n') src.write(' RAJA::RangeSegment inner1('+str(radius)+',n-'+str(radius)+');\n') src.write(' auto inner2 = RAJA::make_tuple(inner1, inner1);\n') src.write(' RAJA::kernel(inner2, [=](int i, int j) {\n') + bodygen(src,pattern,stencil_size,radius,W,model) + src.write(' });\n') 
elif (model=='tbb'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' tbb::blocked_range2d range('+str(radius)+', n-'+str(radius)+', t, '+str(radius)+', n-'+str(radius)+', t);\n') @@ -69,15 +134,23 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) {\n') src.write(' PRAGMA_SIMD\n') src.write(' for (auto j=r.cols().begin(); j!=r.cols().end(); ++j ) {\n') + bodygen(src,pattern,stencil_size,radius,W,model) + src.write(' }\n') + src.write(' }\n') + src.write(' }, tbb_partitioner );\n') elif (model=='kokkos'): src.write('void '+pattern+str(radius)+'(const int n, const int t, matrix & in, matrix & out) {\n') src.write(' auto inside = Kokkos::MDRangePolicy>({'+str(radius)+','+str(radius)+'},{n-'+str(radius)+',n-'+str(radius)+'},{t,t});\n') src.write(' Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) {\n') + bodygen(src,pattern,stencil_size,radius,W,model) + src.write(' });\n') elif (model=='cuda'): src.write('__global__ void '+pattern+str(radius)+'(const int n, const prk_float * in, prk_float * out) {\n') src.write(' const int i = blockIdx.x * blockDim.x + threadIdx.x;\n') src.write(' const int j = blockIdx.y * blockDim.y + threadIdx.y;\n') src.write(' if ( ('+str(radius)+' <= i) && (i < n-'+str(radius)+') && ('+str(radius)+' <= j) && (j < n-'+str(radius)+') ) {\n') + bodygen(src,pattern,stencil_size,radius,W,model) + src.write(' }\n') else: src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' for (auto it='+str(radius)+'; it0 and k>({1,1},{n-1,n-1},{t,t}); Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { - out(i,j) += +in(i+-1,j+0) * -0.5 - +in(i+0,j+-1) * -0.5 - +in(i+0,j+1) * 0.5 - +in(i+1,j+0) * 0.5; + out(i,j) += +in(i,j-1) * -0.5 + +in(i-1,j) * -0.5 + +in(i+1,j) * 0.5 + +in(i,j+1) * 0.5; }); } void star2(const int n, const int t, matrix & 
in, matrix & out) { auto inside = Kokkos::MDRangePolicy>({2,2},{n-2,n-2},{t,t}); Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { - out(i,j) += +in(i+-2,j+0) * -0.125 - +in(i+-1,j+0) * -0.25 - +in(i+0,j+-2) * -0.125 - +in(i+0,j+-1) * -0.25 - +in(i+0,j+1) * 0.25 - +in(i+0,j+2) * 0.125 - +in(i+1,j+0) * 0.25 - +in(i+2,j+0) * 0.125; + out(i,j) += +in(i,j-2) * -0.125 + +in(i,j-1) * -0.25 + +in(i-2,j) * -0.125 + +in(i-1,j) * -0.25 + +in(i+1,j) * 0.25 + +in(i+2,j) * 0.125 + +in(i,j+1) * 0.25 + +in(i,j+2) * 0.125; }); } void star3(const int n, const int t, matrix & in, matrix & out) { auto inside = Kokkos::MDRangePolicy>({3,3},{n-3,n-3},{t,t}); Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { - out(i,j) += +in(i+-3,j+0) * -0.0555555555556 - +in(i+-2,j+0) * -0.0833333333333 - +in(i+-1,j+0) * -0.166666666667 - +in(i+0,j+-3) * -0.0555555555556 - +in(i+0,j+-2) * -0.0833333333333 - +in(i+0,j+-1) * -0.166666666667 - +in(i+0,j+1) * 0.166666666667 - +in(i+0,j+2) * 0.0833333333333 - +in(i+0,j+3) * 0.0555555555556 - +in(i+1,j+0) * 0.166666666667 - +in(i+2,j+0) * 0.0833333333333 - +in(i+3,j+0) * 0.0555555555556; + out(i,j) += +in(i,j-3) * -0.0555555555556 + +in(i,j-2) * -0.0833333333333 + +in(i,j-1) * -0.166666666667 + +in(i-3,j) * -0.0555555555556 + +in(i-2,j) * -0.0833333333333 + +in(i-1,j) * -0.166666666667 + +in(i+1,j) * 0.166666666667 + +in(i+2,j) * 0.0833333333333 + +in(i+3,j) * 0.0555555555556 + +in(i,j+1) * 0.166666666667 + +in(i,j+2) * 0.0833333333333 + +in(i,j+3) * 0.0555555555556; }); } void star4(const int n, const int t, matrix & in, matrix & out) { auto inside = Kokkos::MDRangePolicy>({4,4},{n-4,n-4},{t,t}); Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { - out(i,j) += +in(i+-4,j+0) * -0.03125 - +in(i+-3,j+0) * -0.0416666666667 - +in(i+-2,j+0) * -0.0625 - +in(i+-1,j+0) * -0.125 - +in(i+0,j+-4) * -0.03125 - +in(i+0,j+-3) * -0.0416666666667 - +in(i+0,j+-2) * -0.0625 - +in(i+0,j+-1) * -0.125 - +in(i+0,j+1) * 0.125 - +in(i+0,j+2) * 0.0625 - 
+in(i+0,j+3) * 0.0416666666667 - +in(i+0,j+4) * 0.03125 - +in(i+1,j+0) * 0.125 - +in(i+2,j+0) * 0.0625 - +in(i+3,j+0) * 0.0416666666667 - +in(i+4,j+0) * 0.03125; + out(i,j) += +in(i,j-4) * -0.03125 + +in(i,j-3) * -0.0416666666667 + +in(i,j-2) * -0.0625 + +in(i,j-1) * -0.125 + +in(i-4,j) * -0.03125 + +in(i-3,j) * -0.0416666666667 + +in(i-2,j) * -0.0625 + +in(i-1,j) * -0.125 + +in(i+1,j) * 0.125 + +in(i+2,j) * 0.0625 + +in(i+3,j) * 0.0416666666667 + +in(i+4,j) * 0.03125 + +in(i,j+1) * 0.125 + +in(i,j+2) * 0.0625 + +in(i,j+3) * 0.0416666666667 + +in(i,j+4) * 0.03125; }); } void star5(const int n, const int t, matrix & in, matrix & out) { auto inside = Kokkos::MDRangePolicy>({5,5},{n-5,n-5},{t,t}); Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { - out(i,j) += +in(i+-5,j+0) * -0.02 - +in(i+-4,j+0) * -0.025 - +in(i+-3,j+0) * -0.0333333333333 - +in(i+-2,j+0) * -0.05 - +in(i+-1,j+0) * -0.1 - +in(i+0,j+-5) * -0.02 - +in(i+0,j+-4) * -0.025 - +in(i+0,j+-3) * -0.0333333333333 - +in(i+0,j+-2) * -0.05 - +in(i+0,j+-1) * -0.1 - +in(i+0,j+1) * 0.1 - +in(i+0,j+2) * 0.05 - +in(i+0,j+3) * 0.0333333333333 - +in(i+0,j+4) * 0.025 - +in(i+0,j+5) * 0.02 - +in(i+1,j+0) * 0.1 - +in(i+2,j+0) * 0.05 - +in(i+3,j+0) * 0.0333333333333 - +in(i+4,j+0) * 0.025 - +in(i+5,j+0) * 0.02; + out(i,j) += +in(i,j-5) * -0.02 + +in(i,j-4) * -0.025 + +in(i,j-3) * -0.0333333333333 + +in(i,j-2) * -0.05 + +in(i,j-1) * -0.1 + +in(i-5,j) * -0.02 + +in(i-4,j) * -0.025 + +in(i-3,j) * -0.0333333333333 + +in(i-2,j) * -0.05 + +in(i-1,j) * -0.1 + +in(i+1,j) * 0.1 + +in(i+2,j) * 0.05 + +in(i+3,j) * 0.0333333333333 + +in(i+4,j) * 0.025 + +in(i+5,j) * 0.02 + +in(i,j+1) * 0.1 + +in(i,j+2) * 0.05 + +in(i,j+3) * 0.0333333333333 + +in(i,j+4) * 0.025 + +in(i,j+5) * 0.02; }); } void grid1(const int n, const int t, matrix & in, matrix & out) { auto inside = Kokkos::MDRangePolicy>({1,1},{n-1,n-1},{t,t}); Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { - out(i,j) += +in(i+-1,j+-1) * -0.25 - +in(i+-1,j+0) * -0.25 
- +in(i+0,j+-1) * -0.25 - +in(i+0,j+1) * 0.25 - +in(i+1,j+0) * 0.25 + out(i,j) += +in(i-1,j-1) * -0.25 + +in(i,j-1) * -0.25 + +in(i-1,j) * -0.25 + +in(i+1,j) * 0.25 + +in(i,j+1) * 0.25 +in(i+1,j+1) * 0.25 ; }); @@ -104,25 +104,25 @@ void grid1(const int n, const int t, matrix & in, matrix & out) { void grid2(const int n, const int t, matrix & in, matrix & out) { auto inside = Kokkos::MDRangePolicy>({2,2},{n-2,n-2},{t,t}); Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { - out(i,j) += +in(i+-2,j+-2) * -0.0625 - +in(i+-2,j+-1) * -0.0208333333333 - +in(i+-2,j+0) * -0.0208333333333 - +in(i+-2,j+1) * -0.0208333333333 - +in(i+-1,j+-2) * -0.0208333333333 - +in(i+-1,j+-1) * -0.125 - +in(i+-1,j+0) * -0.125 - +in(i+-1,j+2) * 0.0208333333333 - +in(i+0,j+-2) * -0.0208333333333 - +in(i+0,j+-1) * -0.125 - +in(i+0,j+1) * 0.125 - +in(i+0,j+2) * 0.0208333333333 - +in(i+1,j+-2) * -0.0208333333333 - +in(i+1,j+0) * 0.125 + out(i,j) += +in(i-2,j-2) * -0.0625 + +in(i-1,j-2) * -0.0208333333333 + +in(i,j-2) * -0.0208333333333 + +in(i+1,j-2) * -0.0208333333333 + +in(i-2,j-1) * -0.0208333333333 + +in(i-1,j-1) * -0.125 + +in(i,j-1) * -0.125 + +in(i+2,j-1) * 0.0208333333333 + +in(i-2,j) * -0.0208333333333 + +in(i-1,j) * -0.125 + +in(i+1,j) * 0.125 + +in(i+2,j) * 0.0208333333333 + +in(i-2,j+1) * -0.0208333333333 + +in(i,j+1) * 0.125 +in(i+1,j+1) * 0.125 - +in(i+1,j+2) * 0.0208333333333 - +in(i+2,j+-1) * 0.0208333333333 - +in(i+2,j+0) * 0.0208333333333 +in(i+2,j+1) * 0.0208333333333 + +in(i-1,j+2) * 0.0208333333333 + +in(i,j+2) * 0.0208333333333 + +in(i+1,j+2) * 0.0208333333333 +in(i+2,j+2) * 0.0625 ; }); @@ -131,47 +131,47 @@ void grid2(const int n, const int t, matrix & in, matrix & out) { void grid3(const int n, const int t, matrix & in, matrix & out) { auto inside = Kokkos::MDRangePolicy>({3,3},{n-3,n-3},{t,t}); Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { - out(i,j) += +in(i+-3,j+-3) * -0.0277777777778 - +in(i+-3,j+-2) * -0.00555555555556 - +in(i+-3,j+-1) * 
-0.00555555555556 - +in(i+-3,j+0) * -0.00555555555556 - +in(i+-3,j+1) * -0.00555555555556 - +in(i+-3,j+2) * -0.00555555555556 - +in(i+-2,j+-3) * -0.00555555555556 - +in(i+-2,j+-2) * -0.0416666666667 - +in(i+-2,j+-1) * -0.0138888888889 - +in(i+-2,j+0) * -0.0138888888889 - +in(i+-2,j+1) * -0.0138888888889 - +in(i+-2,j+3) * 0.00555555555556 - +in(i+-1,j+-3) * -0.00555555555556 - +in(i+-1,j+-2) * -0.0138888888889 - +in(i+-1,j+-1) * -0.0833333333333 - +in(i+-1,j+0) * -0.0833333333333 - +in(i+-1,j+2) * 0.0138888888889 - +in(i+-1,j+3) * 0.00555555555556 - +in(i+0,j+-3) * -0.00555555555556 - +in(i+0,j+-2) * -0.0138888888889 - +in(i+0,j+-1) * -0.0833333333333 - +in(i+0,j+1) * 0.0833333333333 - +in(i+0,j+2) * 0.0138888888889 - +in(i+0,j+3) * 0.00555555555556 - +in(i+1,j+-3) * -0.00555555555556 - +in(i+1,j+-2) * -0.0138888888889 - +in(i+1,j+0) * 0.0833333333333 + out(i,j) += +in(i-3,j-3) * -0.0277777777778 + +in(i-2,j-3) * -0.00555555555556 + +in(i-1,j-3) * -0.00555555555556 + +in(i,j-3) * -0.00555555555556 + +in(i+1,j-3) * -0.00555555555556 + +in(i+2,j-3) * -0.00555555555556 + +in(i-3,j-2) * -0.00555555555556 + +in(i-2,j-2) * -0.0416666666667 + +in(i-1,j-2) * -0.0138888888889 + +in(i,j-2) * -0.0138888888889 + +in(i+1,j-2) * -0.0138888888889 + +in(i+3,j-2) * 0.00555555555556 + +in(i-3,j-1) * -0.00555555555556 + +in(i-2,j-1) * -0.0138888888889 + +in(i-1,j-1) * -0.0833333333333 + +in(i,j-1) * -0.0833333333333 + +in(i+2,j-1) * 0.0138888888889 + +in(i+3,j-1) * 0.00555555555556 + +in(i-3,j) * -0.00555555555556 + +in(i-2,j) * -0.0138888888889 + +in(i-1,j) * -0.0833333333333 + +in(i+1,j) * 0.0833333333333 + +in(i+2,j) * 0.0138888888889 + +in(i+3,j) * 0.00555555555556 + +in(i-3,j+1) * -0.00555555555556 + +in(i-2,j+1) * -0.0138888888889 + +in(i,j+1) * 0.0833333333333 +in(i+1,j+1) * 0.0833333333333 - +in(i+1,j+2) * 0.0138888888889 - +in(i+1,j+3) * 0.00555555555556 - +in(i+2,j+-3) * -0.00555555555556 - +in(i+2,j+-1) * 0.0138888888889 - +in(i+2,j+0) * 0.0138888888889 +in(i+2,j+1) * 
0.0138888888889 - +in(i+2,j+2) * 0.0416666666667 - +in(i+2,j+3) * 0.00555555555556 - +in(i+3,j+-2) * 0.00555555555556 - +in(i+3,j+-1) * 0.00555555555556 - +in(i+3,j+0) * 0.00555555555556 +in(i+3,j+1) * 0.00555555555556 + +in(i-3,j+2) * -0.00555555555556 + +in(i-1,j+2) * 0.0138888888889 + +in(i,j+2) * 0.0138888888889 + +in(i+1,j+2) * 0.0138888888889 + +in(i+2,j+2) * 0.0416666666667 +in(i+3,j+2) * 0.00555555555556 + +in(i-2,j+3) * 0.00555555555556 + +in(i-1,j+3) * 0.00555555555556 + +in(i,j+3) * 0.00555555555556 + +in(i+1,j+3) * 0.00555555555556 + +in(i+2,j+3) * 0.00555555555556 +in(i+3,j+3) * 0.0277777777778 ; }); @@ -180,77 +180,77 @@ void grid3(const int n, const int t, matrix & in, matrix & out) { void grid4(const int n, const int t, matrix & in, matrix & out) { auto inside = Kokkos::MDRangePolicy>({4,4},{n-4,n-4},{t,t}); Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { - out(i,j) += +in(i+-4,j+-4) * -0.015625 - +in(i+-4,j+-3) * -0.00223214285714 - +in(i+-4,j+-2) * -0.00223214285714 - +in(i+-4,j+-1) * -0.00223214285714 - +in(i+-4,j+0) * -0.00223214285714 - +in(i+-4,j+1) * -0.00223214285714 - +in(i+-4,j+2) * -0.00223214285714 - +in(i+-4,j+3) * -0.00223214285714 - +in(i+-3,j+-4) * -0.00223214285714 - +in(i+-3,j+-3) * -0.0208333333333 - +in(i+-3,j+-2) * -0.00416666666667 - +in(i+-3,j+-1) * -0.00416666666667 - +in(i+-3,j+0) * -0.00416666666667 - +in(i+-3,j+1) * -0.00416666666667 - +in(i+-3,j+2) * -0.00416666666667 - +in(i+-3,j+4) * 0.00223214285714 - +in(i+-2,j+-4) * -0.00223214285714 - +in(i+-2,j+-3) * -0.00416666666667 - +in(i+-2,j+-2) * -0.03125 - +in(i+-2,j+-1) * -0.0104166666667 - +in(i+-2,j+0) * -0.0104166666667 - +in(i+-2,j+1) * -0.0104166666667 - +in(i+-2,j+3) * 0.00416666666667 - +in(i+-2,j+4) * 0.00223214285714 - +in(i+-1,j+-4) * -0.00223214285714 - +in(i+-1,j+-3) * -0.00416666666667 - +in(i+-1,j+-2) * -0.0104166666667 - +in(i+-1,j+-1) * -0.0625 - +in(i+-1,j+0) * -0.0625 - +in(i+-1,j+2) * 0.0104166666667 - +in(i+-1,j+3) * 0.00416666666667 - 
+in(i+-1,j+4) * 0.00223214285714 - +in(i+0,j+-4) * -0.00223214285714 - +in(i+0,j+-3) * -0.00416666666667 - +in(i+0,j+-2) * -0.0104166666667 - +in(i+0,j+-1) * -0.0625 - +in(i+0,j+1) * 0.0625 - +in(i+0,j+2) * 0.0104166666667 - +in(i+0,j+3) * 0.00416666666667 - +in(i+0,j+4) * 0.00223214285714 - +in(i+1,j+-4) * -0.00223214285714 - +in(i+1,j+-3) * -0.00416666666667 - +in(i+1,j+-2) * -0.0104166666667 - +in(i+1,j+0) * 0.0625 + out(i,j) += +in(i-4,j-4) * -0.015625 + +in(i-3,j-4) * -0.00223214285714 + +in(i-2,j-4) * -0.00223214285714 + +in(i-1,j-4) * -0.00223214285714 + +in(i,j-4) * -0.00223214285714 + +in(i+1,j-4) * -0.00223214285714 + +in(i+2,j-4) * -0.00223214285714 + +in(i+3,j-4) * -0.00223214285714 + +in(i-4,j-3) * -0.00223214285714 + +in(i-3,j-3) * -0.0208333333333 + +in(i-2,j-3) * -0.00416666666667 + +in(i-1,j-3) * -0.00416666666667 + +in(i,j-3) * -0.00416666666667 + +in(i+1,j-3) * -0.00416666666667 + +in(i+2,j-3) * -0.00416666666667 + +in(i+4,j-3) * 0.00223214285714 + +in(i-4,j-2) * -0.00223214285714 + +in(i-3,j-2) * -0.00416666666667 + +in(i-2,j-2) * -0.03125 + +in(i-1,j-2) * -0.0104166666667 + +in(i,j-2) * -0.0104166666667 + +in(i+1,j-2) * -0.0104166666667 + +in(i+3,j-2) * 0.00416666666667 + +in(i+4,j-2) * 0.00223214285714 + +in(i-4,j-1) * -0.00223214285714 + +in(i-3,j-1) * -0.00416666666667 + +in(i-2,j-1) * -0.0104166666667 + +in(i-1,j-1) * -0.0625 + +in(i,j-1) * -0.0625 + +in(i+2,j-1) * 0.0104166666667 + +in(i+3,j-1) * 0.00416666666667 + +in(i+4,j-1) * 0.00223214285714 + +in(i-4,j) * -0.00223214285714 + +in(i-3,j) * -0.00416666666667 + +in(i-2,j) * -0.0104166666667 + +in(i-1,j) * -0.0625 + +in(i+1,j) * 0.0625 + +in(i+2,j) * 0.0104166666667 + +in(i+3,j) * 0.00416666666667 + +in(i+4,j) * 0.00223214285714 + +in(i-4,j+1) * -0.00223214285714 + +in(i-3,j+1) * -0.00416666666667 + +in(i-2,j+1) * -0.0104166666667 + +in(i,j+1) * 0.0625 +in(i+1,j+1) * 0.0625 - +in(i+1,j+2) * 0.0104166666667 - +in(i+1,j+3) * 0.00416666666667 - +in(i+1,j+4) * 0.00223214285714 - +in(i+2,j+-4) 
* -0.00223214285714 - +in(i+2,j+-3) * -0.00416666666667 - +in(i+2,j+-1) * 0.0104166666667 - +in(i+2,j+0) * 0.0104166666667 +in(i+2,j+1) * 0.0104166666667 - +in(i+2,j+2) * 0.03125 - +in(i+2,j+3) * 0.00416666666667 - +in(i+2,j+4) * 0.00223214285714 - +in(i+3,j+-4) * -0.00223214285714 - +in(i+3,j+-2) * 0.00416666666667 - +in(i+3,j+-1) * 0.00416666666667 - +in(i+3,j+0) * 0.00416666666667 +in(i+3,j+1) * 0.00416666666667 - +in(i+3,j+2) * 0.00416666666667 - +in(i+3,j+3) * 0.0208333333333 - +in(i+3,j+4) * 0.00223214285714 - +in(i+4,j+-3) * 0.00223214285714 - +in(i+4,j+-2) * 0.00223214285714 - +in(i+4,j+-1) * 0.00223214285714 - +in(i+4,j+0) * 0.00223214285714 +in(i+4,j+1) * 0.00223214285714 + +in(i-4,j+2) * -0.00223214285714 + +in(i-3,j+2) * -0.00416666666667 + +in(i-1,j+2) * 0.0104166666667 + +in(i,j+2) * 0.0104166666667 + +in(i+1,j+2) * 0.0104166666667 + +in(i+2,j+2) * 0.03125 + +in(i+3,j+2) * 0.00416666666667 +in(i+4,j+2) * 0.00223214285714 + +in(i-4,j+3) * -0.00223214285714 + +in(i-2,j+3) * 0.00416666666667 + +in(i-1,j+3) * 0.00416666666667 + +in(i,j+3) * 0.00416666666667 + +in(i+1,j+3) * 0.00416666666667 + +in(i+2,j+3) * 0.00416666666667 + +in(i+3,j+3) * 0.0208333333333 +in(i+4,j+3) * 0.00223214285714 + +in(i-3,j+4) * 0.00223214285714 + +in(i-2,j+4) * 0.00223214285714 + +in(i-1,j+4) * 0.00223214285714 + +in(i,j+4) * 0.00223214285714 + +in(i+1,j+4) * 0.00223214285714 + +in(i+2,j+4) * 0.00223214285714 + +in(i+3,j+4) * 0.00223214285714 +in(i+4,j+4) * 0.015625 ; }); @@ -259,115 +259,115 @@ void grid4(const int n, const int t, matrix & in, matrix & out) { void grid5(const int n, const int t, matrix & in, matrix & out) { auto inside = Kokkos::MDRangePolicy>({5,5},{n-5,n-5},{t,t}); Kokkos::parallel_for(inside, KOKKOS_LAMBDA(int i, int j) { - out(i,j) += +in(i+-5,j+-5) * -0.01 - +in(i+-5,j+-4) * -0.00111111111111 - +in(i+-5,j+-3) * -0.00111111111111 - +in(i+-5,j+-2) * -0.00111111111111 - +in(i+-5,j+-1) * -0.00111111111111 - +in(i+-5,j+0) * -0.00111111111111 - +in(i+-5,j+1) * 
-0.00111111111111 - +in(i+-5,j+2) * -0.00111111111111 - +in(i+-5,j+3) * -0.00111111111111 - +in(i+-5,j+4) * -0.00111111111111 - +in(i+-4,j+-5) * -0.00111111111111 - +in(i+-4,j+-4) * -0.0125 - +in(i+-4,j+-3) * -0.00178571428571 - +in(i+-4,j+-2) * -0.00178571428571 - +in(i+-4,j+-1) * -0.00178571428571 - +in(i+-4,j+0) * -0.00178571428571 - +in(i+-4,j+1) * -0.00178571428571 - +in(i+-4,j+2) * -0.00178571428571 - +in(i+-4,j+3) * -0.00178571428571 - +in(i+-4,j+5) * 0.00111111111111 - +in(i+-3,j+-5) * -0.00111111111111 - +in(i+-3,j+-4) * -0.00178571428571 - +in(i+-3,j+-3) * -0.0166666666667 - +in(i+-3,j+-2) * -0.00333333333333 - +in(i+-3,j+-1) * -0.00333333333333 - +in(i+-3,j+0) * -0.00333333333333 - +in(i+-3,j+1) * -0.00333333333333 - +in(i+-3,j+2) * -0.00333333333333 - +in(i+-3,j+4) * 0.00178571428571 - +in(i+-3,j+5) * 0.00111111111111 - +in(i+-2,j+-5) * -0.00111111111111 - +in(i+-2,j+-4) * -0.00178571428571 - +in(i+-2,j+-3) * -0.00333333333333 - +in(i+-2,j+-2) * -0.025 - +in(i+-2,j+-1) * -0.00833333333333 - +in(i+-2,j+0) * -0.00833333333333 - +in(i+-2,j+1) * -0.00833333333333 - +in(i+-2,j+3) * 0.00333333333333 - +in(i+-2,j+4) * 0.00178571428571 - +in(i+-2,j+5) * 0.00111111111111 - +in(i+-1,j+-5) * -0.00111111111111 - +in(i+-1,j+-4) * -0.00178571428571 - +in(i+-1,j+-3) * -0.00333333333333 - +in(i+-1,j+-2) * -0.00833333333333 - +in(i+-1,j+-1) * -0.05 - +in(i+-1,j+0) * -0.05 - +in(i+-1,j+2) * 0.00833333333333 - +in(i+-1,j+3) * 0.00333333333333 - +in(i+-1,j+4) * 0.00178571428571 - +in(i+-1,j+5) * 0.00111111111111 - +in(i+0,j+-5) * -0.00111111111111 - +in(i+0,j+-4) * -0.00178571428571 - +in(i+0,j+-3) * -0.00333333333333 - +in(i+0,j+-2) * -0.00833333333333 - +in(i+0,j+-1) * -0.05 - +in(i+0,j+1) * 0.05 - +in(i+0,j+2) * 0.00833333333333 - +in(i+0,j+3) * 0.00333333333333 - +in(i+0,j+4) * 0.00178571428571 - +in(i+0,j+5) * 0.00111111111111 - +in(i+1,j+-5) * -0.00111111111111 - +in(i+1,j+-4) * -0.00178571428571 - +in(i+1,j+-3) * -0.00333333333333 - +in(i+1,j+-2) * -0.00833333333333 
- +in(i+1,j+0) * 0.05 + out(i,j) += +in(i-5,j-5) * -0.01 + +in(i-4,j-5) * -0.00111111111111 + +in(i-3,j-5) * -0.00111111111111 + +in(i-2,j-5) * -0.00111111111111 + +in(i-1,j-5) * -0.00111111111111 + +in(i,j-5) * -0.00111111111111 + +in(i+1,j-5) * -0.00111111111111 + +in(i+2,j-5) * -0.00111111111111 + +in(i+3,j-5) * -0.00111111111111 + +in(i+4,j-5) * -0.00111111111111 + +in(i-5,j-4) * -0.00111111111111 + +in(i-4,j-4) * -0.0125 + +in(i-3,j-4) * -0.00178571428571 + +in(i-2,j-4) * -0.00178571428571 + +in(i-1,j-4) * -0.00178571428571 + +in(i,j-4) * -0.00178571428571 + +in(i+1,j-4) * -0.00178571428571 + +in(i+2,j-4) * -0.00178571428571 + +in(i+3,j-4) * -0.00178571428571 + +in(i+5,j-4) * 0.00111111111111 + +in(i-5,j-3) * -0.00111111111111 + +in(i-4,j-3) * -0.00178571428571 + +in(i-3,j-3) * -0.0166666666667 + +in(i-2,j-3) * -0.00333333333333 + +in(i-1,j-3) * -0.00333333333333 + +in(i,j-3) * -0.00333333333333 + +in(i+1,j-3) * -0.00333333333333 + +in(i+2,j-3) * -0.00333333333333 + +in(i+4,j-3) * 0.00178571428571 + +in(i+5,j-3) * 0.00111111111111 + +in(i-5,j-2) * -0.00111111111111 + +in(i-4,j-2) * -0.00178571428571 + +in(i-3,j-2) * -0.00333333333333 + +in(i-2,j-2) * -0.025 + +in(i-1,j-2) * -0.00833333333333 + +in(i,j-2) * -0.00833333333333 + +in(i+1,j-2) * -0.00833333333333 + +in(i+3,j-2) * 0.00333333333333 + +in(i+4,j-2) * 0.00178571428571 + +in(i+5,j-2) * 0.00111111111111 + +in(i-5,j-1) * -0.00111111111111 + +in(i-4,j-1) * -0.00178571428571 + +in(i-3,j-1) * -0.00333333333333 + +in(i-2,j-1) * -0.00833333333333 + +in(i-1,j-1) * -0.05 + +in(i,j-1) * -0.05 + +in(i+2,j-1) * 0.00833333333333 + +in(i+3,j-1) * 0.00333333333333 + +in(i+4,j-1) * 0.00178571428571 + +in(i+5,j-1) * 0.00111111111111 + +in(i-5,j) * -0.00111111111111 + +in(i-4,j) * -0.00178571428571 + +in(i-3,j) * -0.00333333333333 + +in(i-2,j) * -0.00833333333333 + +in(i-1,j) * -0.05 + +in(i+1,j) * 0.05 + +in(i+2,j) * 0.00833333333333 + +in(i+3,j) * 0.00333333333333 + +in(i+4,j) * 0.00178571428571 + +in(i+5,j) * 
0.00111111111111 + +in(i-5,j+1) * -0.00111111111111 + +in(i-4,j+1) * -0.00178571428571 + +in(i-3,j+1) * -0.00333333333333 + +in(i-2,j+1) * -0.00833333333333 + +in(i,j+1) * 0.05 +in(i+1,j+1) * 0.05 - +in(i+1,j+2) * 0.00833333333333 - +in(i+1,j+3) * 0.00333333333333 - +in(i+1,j+4) * 0.00178571428571 - +in(i+1,j+5) * 0.00111111111111 - +in(i+2,j+-5) * -0.00111111111111 - +in(i+2,j+-4) * -0.00178571428571 - +in(i+2,j+-3) * -0.00333333333333 - +in(i+2,j+-1) * 0.00833333333333 - +in(i+2,j+0) * 0.00833333333333 +in(i+2,j+1) * 0.00833333333333 - +in(i+2,j+2) * 0.025 - +in(i+2,j+3) * 0.00333333333333 - +in(i+2,j+4) * 0.00178571428571 - +in(i+2,j+5) * 0.00111111111111 - +in(i+3,j+-5) * -0.00111111111111 - +in(i+3,j+-4) * -0.00178571428571 - +in(i+3,j+-2) * 0.00333333333333 - +in(i+3,j+-1) * 0.00333333333333 - +in(i+3,j+0) * 0.00333333333333 +in(i+3,j+1) * 0.00333333333333 - +in(i+3,j+2) * 0.00333333333333 - +in(i+3,j+3) * 0.0166666666667 - +in(i+3,j+4) * 0.00178571428571 - +in(i+3,j+5) * 0.00111111111111 - +in(i+4,j+-5) * -0.00111111111111 - +in(i+4,j+-3) * 0.00178571428571 - +in(i+4,j+-2) * 0.00178571428571 - +in(i+4,j+-1) * 0.00178571428571 - +in(i+4,j+0) * 0.00178571428571 +in(i+4,j+1) * 0.00178571428571 - +in(i+4,j+2) * 0.00178571428571 - +in(i+4,j+3) * 0.00178571428571 - +in(i+4,j+4) * 0.0125 - +in(i+4,j+5) * 0.00111111111111 - +in(i+5,j+-4) * 0.00111111111111 - +in(i+5,j+-3) * 0.00111111111111 - +in(i+5,j+-2) * 0.00111111111111 - +in(i+5,j+-1) * 0.00111111111111 - +in(i+5,j+0) * 0.00111111111111 +in(i+5,j+1) * 0.00111111111111 + +in(i-5,j+2) * -0.00111111111111 + +in(i-4,j+2) * -0.00178571428571 + +in(i-3,j+2) * -0.00333333333333 + +in(i-1,j+2) * 0.00833333333333 + +in(i,j+2) * 0.00833333333333 + +in(i+1,j+2) * 0.00833333333333 + +in(i+2,j+2) * 0.025 + +in(i+3,j+2) * 0.00333333333333 + +in(i+4,j+2) * 0.00178571428571 +in(i+5,j+2) * 0.00111111111111 + +in(i-5,j+3) * -0.00111111111111 + +in(i-4,j+3) * -0.00178571428571 + +in(i-2,j+3) * 0.00333333333333 + +in(i-1,j+3) * 
0.00333333333333 + +in(i,j+3) * 0.00333333333333 + +in(i+1,j+3) * 0.00333333333333 + +in(i+2,j+3) * 0.00333333333333 + +in(i+3,j+3) * 0.0166666666667 + +in(i+4,j+3) * 0.00178571428571 +in(i+5,j+3) * 0.00111111111111 + +in(i-5,j+4) * -0.00111111111111 + +in(i-3,j+4) * 0.00178571428571 + +in(i-2,j+4) * 0.00178571428571 + +in(i-1,j+4) * 0.00178571428571 + +in(i,j+4) * 0.00178571428571 + +in(i+1,j+4) * 0.00178571428571 + +in(i+2,j+4) * 0.00178571428571 + +in(i+3,j+4) * 0.00178571428571 + +in(i+4,j+4) * 0.0125 +in(i+5,j+4) * 0.00111111111111 + +in(i-4,j+5) * 0.00111111111111 + +in(i-3,j+5) * 0.00111111111111 + +in(i-2,j+5) * 0.00111111111111 + +in(i-1,j+5) * 0.00111111111111 + +in(i,j+5) * 0.00111111111111 + +in(i+1,j+5) * 0.00111111111111 + +in(i+2,j+5) * 0.00111111111111 + +in(i+3,j+5) * 0.00111111111111 + +in(i+4,j+5) * 0.00111111111111 +in(i+5,j+5) * 0.01 ; }); diff --git a/Cxx11/stencil_openmp.hpp b/Cxx11/stencil_openmp.hpp index 2a42b437c..009a32bc2 100644 --- a/Cxx11/stencil_openmp.hpp +++ b/Cxx11/stencil_openmp.hpp @@ -5,10 +5,10 @@ void star1(const int n, const int t, const double * RESTRICT in, double * RESTRI for (auto i=it; i & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-1)*n+(j+0)] * -0.5 - +in[(i+0)*n+(j+-1)] * -0.5 - +in[(i+0)*n+(j+1)] * 0.5 - +in[(i+1)*n+(j+0)] * 0.5; + out[i*n+j] += +in[(i)*n+(j-1)] * -0.5 + +in[(i-1)*n+(j)] * -0.5 + +in[(i+1)*n+(j)] * 0.5 + +in[(i)*n+(j+1)] * 0.5; }); }); } @@ -12,14 +12,14 @@ void star1(const int 
n, const int t, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-2)*n+(j+0)] * -0.125 - +in[(i+-1)*n+(j+0)] * -0.25 - +in[(i+0)*n+(j+-2)] * -0.125 - +in[(i+0)*n+(j+-1)] * -0.25 - +in[(i+0)*n+(j+1)] * 0.25 - +in[(i+0)*n+(j+2)] * 0.125 - +in[(i+1)*n+(j+0)] * 0.25 - +in[(i+2)*n+(j+0)] * 0.125; + out[i*n+j] += +in[(i)*n+(j-2)] * -0.125 + +in[(i)*n+(j-1)] * -0.25 + +in[(i-2)*n+(j)] * -0.125 + +in[(i-1)*n+(j)] * -0.25 + +in[(i+1)*n+(j)] * 0.25 + +in[(i+2)*n+(j)] * 0.125 + +in[(i)*n+(j+1)] * 0.25 + +in[(i)*n+(j+2)] * 0.125; }); }); } @@ -27,18 +27,18 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-3)*n+(j+0)] * -0.0555555555556 - +in[(i+-2)*n+(j+0)] * -0.0833333333333 - +in[(i+-1)*n+(j+0)] * -0.166666666667 - +in[(i+0)*n+(j+-3)] * -0.0555555555556 - +in[(i+0)*n+(j+-2)] * -0.0833333333333 - +in[(i+0)*n+(j+-1)] * -0.166666666667 - +in[(i+0)*n+(j+1)] * 0.166666666667 - +in[(i+0)*n+(j+2)] * 0.0833333333333 - +in[(i+0)*n+(j+3)] * 0.0555555555556 - +in[(i+1)*n+(j+0)] * 0.166666666667 - +in[(i+2)*n+(j+0)] * 0.0833333333333 - +in[(i+3)*n+(j+0)] * 0.0555555555556; + out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556 + +in[(i)*n+(j-2)] * -0.0833333333333 + +in[(i)*n+(j-1)] * -0.166666666667 + +in[(i-3)*n+(j)] * -0.0555555555556 + +in[(i-2)*n+(j)] * -0.0833333333333 + +in[(i-1)*n+(j)] * -0.166666666667 + +in[(i+1)*n+(j)] * 0.166666666667 + +in[(i+2)*n+(j)] * 0.0833333333333 + +in[(i+3)*n+(j)] * 0.0555555555556 + +in[(i)*n+(j+1)] * 0.166666666667 + +in[(i)*n+(j+2)] * 0.0833333333333 + +in[(i)*n+(j+3)] * 0.0555555555556; }); }); } @@ -46,22 +46,22 @@ void 
star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-4)*n+(j+0)] * -0.03125 - +in[(i+-3)*n+(j+0)] * -0.0416666666667 - +in[(i+-2)*n+(j+0)] * -0.0625 - +in[(i+-1)*n+(j+0)] * -0.125 - +in[(i+0)*n+(j+-4)] * -0.03125 - +in[(i+0)*n+(j+-3)] * -0.0416666666667 - +in[(i+0)*n+(j+-2)] * -0.0625 - +in[(i+0)*n+(j+-1)] * -0.125 - +in[(i+0)*n+(j+1)] * 0.125 - +in[(i+0)*n+(j+2)] * 0.0625 - +in[(i+0)*n+(j+3)] * 0.0416666666667 - +in[(i+0)*n+(j+4)] * 0.03125 - +in[(i+1)*n+(j+0)] * 0.125 - +in[(i+2)*n+(j+0)] * 0.0625 - +in[(i+3)*n+(j+0)] * 0.0416666666667 - +in[(i+4)*n+(j+0)] * 0.03125; + out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125 + +in[(i)*n+(j-3)] * -0.0416666666667 + +in[(i)*n+(j-2)] * -0.0625 + +in[(i)*n+(j-1)] * -0.125 + +in[(i-4)*n+(j)] * -0.03125 + +in[(i-3)*n+(j)] * -0.0416666666667 + +in[(i-2)*n+(j)] * -0.0625 + +in[(i-1)*n+(j)] * -0.125 + +in[(i+1)*n+(j)] * 0.125 + +in[(i+2)*n+(j)] * 0.0625 + +in[(i+3)*n+(j)] * 0.0416666666667 + +in[(i+4)*n+(j)] * 0.03125 + +in[(i)*n+(j+1)] * 0.125 + +in[(i)*n+(j+2)] * 0.0625 + +in[(i)*n+(j+3)] * 0.0416666666667 + +in[(i)*n+(j+4)] * 0.03125; }); }); } @@ -69,26 +69,26 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-5)*n+(j+0)] * -0.02 - +in[(i+-4)*n+(j+0)] * -0.025 - +in[(i+-3)*n+(j+0)] * -0.0333333333333 - +in[(i+-2)*n+(j+0)] * -0.05 - +in[(i+-1)*n+(j+0)] * -0.1 - +in[(i+0)*n+(j+-5)] * -0.02 - +in[(i+0)*n+(j+-4)] * -0.025 - +in[(i+0)*n+(j+-3)] * -0.0333333333333 - +in[(i+0)*n+(j+-2)] * -0.05 - +in[(i+0)*n+(j+-1)] * -0.1 - +in[(i+0)*n+(j+1)] * 0.1 - +in[(i+0)*n+(j+2)] * 0.05 - 
+in[(i+0)*n+(j+3)] * 0.0333333333333 - +in[(i+0)*n+(j+4)] * 0.025 - +in[(i+0)*n+(j+5)] * 0.02 - +in[(i+1)*n+(j+0)] * 0.1 - +in[(i+2)*n+(j+0)] * 0.05 - +in[(i+3)*n+(j+0)] * 0.0333333333333 - +in[(i+4)*n+(j+0)] * 0.025 - +in[(i+5)*n+(j+0)] * 0.02; + out[i*n+j] += +in[(i)*n+(j-5)] * -0.02 + +in[(i)*n+(j-4)] * -0.025 + +in[(i)*n+(j-3)] * -0.0333333333333 + +in[(i)*n+(j-2)] * -0.05 + +in[(i)*n+(j-1)] * -0.1 + +in[(i-5)*n+(j)] * -0.02 + +in[(i-4)*n+(j)] * -0.025 + +in[(i-3)*n+(j)] * -0.0333333333333 + +in[(i-2)*n+(j)] * -0.05 + +in[(i-1)*n+(j)] * -0.1 + +in[(i+1)*n+(j)] * 0.1 + +in[(i+2)*n+(j)] * 0.05 + +in[(i+3)*n+(j)] * 0.0333333333333 + +in[(i+4)*n+(j)] * 0.025 + +in[(i+5)*n+(j)] * 0.02 + +in[(i)*n+(j+1)] * 0.1 + +in[(i)*n+(j+2)] * 0.05 + +in[(i)*n+(j+3)] * 0.0333333333333 + +in[(i)*n+(j+4)] * 0.025 + +in[(i)*n+(j+5)] * 0.02; }); }); } @@ -96,11 +96,11 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-1)*n+(j+-1)] * -0.25 - +in[(i+-1)*n+(j+0)] * -0.25 - +in[(i+0)*n+(j+-1)] * -0.25 - +in[(i+0)*n+(j+1)] * 0.25 - +in[(i+1)*n+(j+0)] * 0.25 + out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25 + +in[(i)*n+(j-1)] * -0.25 + +in[(i-1)*n+(j)] * -0.25 + +in[(i+1)*n+(j)] * 0.25 + +in[(i)*n+(j+1)] * 0.25 +in[(i+1)*n+(j+1)] * 0.25 ; }); @@ -110,25 +110,25 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-2)*n+(j+-2)] * -0.0625 - +in[(i+-2)*n+(j+-1)] * -0.0208333333333 - +in[(i+-2)*n+(j+0)] * -0.0208333333333 - +in[(i+-2)*n+(j+1)] * -0.0208333333333 - +in[(i+-1)*n+(j+-2)] * -0.0208333333333 - +in[(i+-1)*n+(j+-1)] * -0.125 - 
+in[(i+-1)*n+(j+0)] * -0.125 - +in[(i+-1)*n+(j+2)] * 0.0208333333333 - +in[(i+0)*n+(j+-2)] * -0.0208333333333 - +in[(i+0)*n+(j+-1)] * -0.125 - +in[(i+0)*n+(j+1)] * 0.125 - +in[(i+0)*n+(j+2)] * 0.0208333333333 - +in[(i+1)*n+(j+-2)] * -0.0208333333333 - +in[(i+1)*n+(j+0)] * 0.125 + out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625 + +in[(i-1)*n+(j-2)] * -0.0208333333333 + +in[(i)*n+(j-2)] * -0.0208333333333 + +in[(i+1)*n+(j-2)] * -0.0208333333333 + +in[(i-2)*n+(j-1)] * -0.0208333333333 + +in[(i-1)*n+(j-1)] * -0.125 + +in[(i)*n+(j-1)] * -0.125 + +in[(i+2)*n+(j-1)] * 0.0208333333333 + +in[(i-2)*n+(j)] * -0.0208333333333 + +in[(i-1)*n+(j)] * -0.125 + +in[(i+1)*n+(j)] * 0.125 + +in[(i+2)*n+(j)] * 0.0208333333333 + +in[(i-2)*n+(j+1)] * -0.0208333333333 + +in[(i)*n+(j+1)] * 0.125 +in[(i+1)*n+(j+1)] * 0.125 - +in[(i+1)*n+(j+2)] * 0.0208333333333 - +in[(i+2)*n+(j+-1)] * 0.0208333333333 - +in[(i+2)*n+(j+0)] * 0.0208333333333 +in[(i+2)*n+(j+1)] * 0.0208333333333 + +in[(i-1)*n+(j+2)] * 0.0208333333333 + +in[(i)*n+(j+2)] * 0.0208333333333 + +in[(i+1)*n+(j+2)] * 0.0208333333333 +in[(i+2)*n+(j+2)] * 0.0625 ; }); @@ -138,47 +138,47 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-3)*n+(j+-3)] * -0.0277777777778 - +in[(i+-3)*n+(j+-2)] * -0.00555555555556 - +in[(i+-3)*n+(j+-1)] * -0.00555555555556 - +in[(i+-3)*n+(j+0)] * -0.00555555555556 - +in[(i+-3)*n+(j+1)] * -0.00555555555556 - +in[(i+-3)*n+(j+2)] * -0.00555555555556 - +in[(i+-2)*n+(j+-3)] * -0.00555555555556 - +in[(i+-2)*n+(j+-2)] * -0.0416666666667 - +in[(i+-2)*n+(j+-1)] * -0.0138888888889 - +in[(i+-2)*n+(j+0)] * -0.0138888888889 - +in[(i+-2)*n+(j+1)] * -0.0138888888889 - +in[(i+-2)*n+(j+3)] * 0.00555555555556 - +in[(i+-1)*n+(j+-3)] * -0.00555555555556 - +in[(i+-1)*n+(j+-2)] * -0.0138888888889 - 
+in[(i+-1)*n+(j+-1)] * -0.0833333333333 - +in[(i+-1)*n+(j+0)] * -0.0833333333333 - +in[(i+-1)*n+(j+2)] * 0.0138888888889 - +in[(i+-1)*n+(j+3)] * 0.00555555555556 - +in[(i+0)*n+(j+-3)] * -0.00555555555556 - +in[(i+0)*n+(j+-2)] * -0.0138888888889 - +in[(i+0)*n+(j+-1)] * -0.0833333333333 - +in[(i+0)*n+(j+1)] * 0.0833333333333 - +in[(i+0)*n+(j+2)] * 0.0138888888889 - +in[(i+0)*n+(j+3)] * 0.00555555555556 - +in[(i+1)*n+(j+-3)] * -0.00555555555556 - +in[(i+1)*n+(j+-2)] * -0.0138888888889 - +in[(i+1)*n+(j+0)] * 0.0833333333333 + out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778 + +in[(i-2)*n+(j-3)] * -0.00555555555556 + +in[(i-1)*n+(j-3)] * -0.00555555555556 + +in[(i)*n+(j-3)] * -0.00555555555556 + +in[(i+1)*n+(j-3)] * -0.00555555555556 + +in[(i+2)*n+(j-3)] * -0.00555555555556 + +in[(i-3)*n+(j-2)] * -0.00555555555556 + +in[(i-2)*n+(j-2)] * -0.0416666666667 + +in[(i-1)*n+(j-2)] * -0.0138888888889 + +in[(i)*n+(j-2)] * -0.0138888888889 + +in[(i+1)*n+(j-2)] * -0.0138888888889 + +in[(i+3)*n+(j-2)] * 0.00555555555556 + +in[(i-3)*n+(j-1)] * -0.00555555555556 + +in[(i-2)*n+(j-1)] * -0.0138888888889 + +in[(i-1)*n+(j-1)] * -0.0833333333333 + +in[(i)*n+(j-1)] * -0.0833333333333 + +in[(i+2)*n+(j-1)] * 0.0138888888889 + +in[(i+3)*n+(j-1)] * 0.00555555555556 + +in[(i-3)*n+(j)] * -0.00555555555556 + +in[(i-2)*n+(j)] * -0.0138888888889 + +in[(i-1)*n+(j)] * -0.0833333333333 + +in[(i+1)*n+(j)] * 0.0833333333333 + +in[(i+2)*n+(j)] * 0.0138888888889 + +in[(i+3)*n+(j)] * 0.00555555555556 + +in[(i-3)*n+(j+1)] * -0.00555555555556 + +in[(i-2)*n+(j+1)] * -0.0138888888889 + +in[(i)*n+(j+1)] * 0.0833333333333 +in[(i+1)*n+(j+1)] * 0.0833333333333 - +in[(i+1)*n+(j+2)] * 0.0138888888889 - +in[(i+1)*n+(j+3)] * 0.00555555555556 - +in[(i+2)*n+(j+-3)] * -0.00555555555556 - +in[(i+2)*n+(j+-1)] * 0.0138888888889 - +in[(i+2)*n+(j+0)] * 0.0138888888889 +in[(i+2)*n+(j+1)] * 0.0138888888889 - +in[(i+2)*n+(j+2)] * 0.0416666666667 - +in[(i+2)*n+(j+3)] * 0.00555555555556 - +in[(i+3)*n+(j+-2)] * 0.00555555555556 
- +in[(i+3)*n+(j+-1)] * 0.00555555555556 - +in[(i+3)*n+(j+0)] * 0.00555555555556 +in[(i+3)*n+(j+1)] * 0.00555555555556 + +in[(i-3)*n+(j+2)] * -0.00555555555556 + +in[(i-1)*n+(j+2)] * 0.0138888888889 + +in[(i)*n+(j+2)] * 0.0138888888889 + +in[(i+1)*n+(j+2)] * 0.0138888888889 + +in[(i+2)*n+(j+2)] * 0.0416666666667 +in[(i+3)*n+(j+2)] * 0.00555555555556 + +in[(i-2)*n+(j+3)] * 0.00555555555556 + +in[(i-1)*n+(j+3)] * 0.00555555555556 + +in[(i)*n+(j+3)] * 0.00555555555556 + +in[(i+1)*n+(j+3)] * 0.00555555555556 + +in[(i+2)*n+(j+3)] * 0.00555555555556 +in[(i+3)*n+(j+3)] * 0.0277777777778 ; }); @@ -188,77 +188,77 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-4)*n+(j+-4)] * -0.015625 - +in[(i+-4)*n+(j+-3)] * -0.00223214285714 - +in[(i+-4)*n+(j+-2)] * -0.00223214285714 - +in[(i+-4)*n+(j+-1)] * -0.00223214285714 - +in[(i+-4)*n+(j+0)] * -0.00223214285714 - +in[(i+-4)*n+(j+1)] * -0.00223214285714 - +in[(i+-4)*n+(j+2)] * -0.00223214285714 - +in[(i+-4)*n+(j+3)] * -0.00223214285714 - +in[(i+-3)*n+(j+-4)] * -0.00223214285714 - +in[(i+-3)*n+(j+-3)] * -0.0208333333333 - +in[(i+-3)*n+(j+-2)] * -0.00416666666667 - +in[(i+-3)*n+(j+-1)] * -0.00416666666667 - +in[(i+-3)*n+(j+0)] * -0.00416666666667 - +in[(i+-3)*n+(j+1)] * -0.00416666666667 - +in[(i+-3)*n+(j+2)] * -0.00416666666667 - +in[(i+-3)*n+(j+4)] * 0.00223214285714 - +in[(i+-2)*n+(j+-4)] * -0.00223214285714 - +in[(i+-2)*n+(j+-3)] * -0.00416666666667 - +in[(i+-2)*n+(j+-2)] * -0.03125 - +in[(i+-2)*n+(j+-1)] * -0.0104166666667 - +in[(i+-2)*n+(j+0)] * -0.0104166666667 - +in[(i+-2)*n+(j+1)] * -0.0104166666667 - +in[(i+-2)*n+(j+3)] * 0.00416666666667 - +in[(i+-2)*n+(j+4)] * 0.00223214285714 - +in[(i+-1)*n+(j+-4)] * -0.00223214285714 - +in[(i+-1)*n+(j+-3)] * -0.00416666666667 - +in[(i+-1)*n+(j+-2)] * 
-0.0104166666667 - +in[(i+-1)*n+(j+-1)] * -0.0625 - +in[(i+-1)*n+(j+0)] * -0.0625 - +in[(i+-1)*n+(j+2)] * 0.0104166666667 - +in[(i+-1)*n+(j+3)] * 0.00416666666667 - +in[(i+-1)*n+(j+4)] * 0.00223214285714 - +in[(i+0)*n+(j+-4)] * -0.00223214285714 - +in[(i+0)*n+(j+-3)] * -0.00416666666667 - +in[(i+0)*n+(j+-2)] * -0.0104166666667 - +in[(i+0)*n+(j+-1)] * -0.0625 - +in[(i+0)*n+(j+1)] * 0.0625 - +in[(i+0)*n+(j+2)] * 0.0104166666667 - +in[(i+0)*n+(j+3)] * 0.00416666666667 - +in[(i+0)*n+(j+4)] * 0.00223214285714 - +in[(i+1)*n+(j+-4)] * -0.00223214285714 - +in[(i+1)*n+(j+-3)] * -0.00416666666667 - +in[(i+1)*n+(j+-2)] * -0.0104166666667 - +in[(i+1)*n+(j+0)] * 0.0625 + out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625 + +in[(i-3)*n+(j-4)] * -0.00223214285714 + +in[(i-2)*n+(j-4)] * -0.00223214285714 + +in[(i-1)*n+(j-4)] * -0.00223214285714 + +in[(i)*n+(j-4)] * -0.00223214285714 + +in[(i+1)*n+(j-4)] * -0.00223214285714 + +in[(i+2)*n+(j-4)] * -0.00223214285714 + +in[(i+3)*n+(j-4)] * -0.00223214285714 + +in[(i-4)*n+(j-3)] * -0.00223214285714 + +in[(i-3)*n+(j-3)] * -0.0208333333333 + +in[(i-2)*n+(j-3)] * -0.00416666666667 + +in[(i-1)*n+(j-3)] * -0.00416666666667 + +in[(i)*n+(j-3)] * -0.00416666666667 + +in[(i+1)*n+(j-3)] * -0.00416666666667 + +in[(i+2)*n+(j-3)] * -0.00416666666667 + +in[(i+4)*n+(j-3)] * 0.00223214285714 + +in[(i-4)*n+(j-2)] * -0.00223214285714 + +in[(i-3)*n+(j-2)] * -0.00416666666667 + +in[(i-2)*n+(j-2)] * -0.03125 + +in[(i-1)*n+(j-2)] * -0.0104166666667 + +in[(i)*n+(j-2)] * -0.0104166666667 + +in[(i+1)*n+(j-2)] * -0.0104166666667 + +in[(i+3)*n+(j-2)] * 0.00416666666667 + +in[(i+4)*n+(j-2)] * 0.00223214285714 + +in[(i-4)*n+(j-1)] * -0.00223214285714 + +in[(i-3)*n+(j-1)] * -0.00416666666667 + +in[(i-2)*n+(j-1)] * -0.0104166666667 + +in[(i-1)*n+(j-1)] * -0.0625 + +in[(i)*n+(j-1)] * -0.0625 + +in[(i+2)*n+(j-1)] * 0.0104166666667 + +in[(i+3)*n+(j-1)] * 0.00416666666667 + +in[(i+4)*n+(j-1)] * 0.00223214285714 + +in[(i-4)*n+(j)] * -0.00223214285714 + +in[(i-3)*n+(j)] * 
-0.00416666666667 + +in[(i-2)*n+(j)] * -0.0104166666667 + +in[(i-1)*n+(j)] * -0.0625 + +in[(i+1)*n+(j)] * 0.0625 + +in[(i+2)*n+(j)] * 0.0104166666667 + +in[(i+3)*n+(j)] * 0.00416666666667 + +in[(i+4)*n+(j)] * 0.00223214285714 + +in[(i-4)*n+(j+1)] * -0.00223214285714 + +in[(i-3)*n+(j+1)] * -0.00416666666667 + +in[(i-2)*n+(j+1)] * -0.0104166666667 + +in[(i)*n+(j+1)] * 0.0625 +in[(i+1)*n+(j+1)] * 0.0625 - +in[(i+1)*n+(j+2)] * 0.0104166666667 - +in[(i+1)*n+(j+3)] * 0.00416666666667 - +in[(i+1)*n+(j+4)] * 0.00223214285714 - +in[(i+2)*n+(j+-4)] * -0.00223214285714 - +in[(i+2)*n+(j+-3)] * -0.00416666666667 - +in[(i+2)*n+(j+-1)] * 0.0104166666667 - +in[(i+2)*n+(j+0)] * 0.0104166666667 +in[(i+2)*n+(j+1)] * 0.0104166666667 - +in[(i+2)*n+(j+2)] * 0.03125 - +in[(i+2)*n+(j+3)] * 0.00416666666667 - +in[(i+2)*n+(j+4)] * 0.00223214285714 - +in[(i+3)*n+(j+-4)] * -0.00223214285714 - +in[(i+3)*n+(j+-2)] * 0.00416666666667 - +in[(i+3)*n+(j+-1)] * 0.00416666666667 - +in[(i+3)*n+(j+0)] * 0.00416666666667 +in[(i+3)*n+(j+1)] * 0.00416666666667 - +in[(i+3)*n+(j+2)] * 0.00416666666667 - +in[(i+3)*n+(j+3)] * 0.0208333333333 - +in[(i+3)*n+(j+4)] * 0.00223214285714 - +in[(i+4)*n+(j+-3)] * 0.00223214285714 - +in[(i+4)*n+(j+-2)] * 0.00223214285714 - +in[(i+4)*n+(j+-1)] * 0.00223214285714 - +in[(i+4)*n+(j+0)] * 0.00223214285714 +in[(i+4)*n+(j+1)] * 0.00223214285714 + +in[(i-4)*n+(j+2)] * -0.00223214285714 + +in[(i-3)*n+(j+2)] * -0.00416666666667 + +in[(i-1)*n+(j+2)] * 0.0104166666667 + +in[(i)*n+(j+2)] * 0.0104166666667 + +in[(i+1)*n+(j+2)] * 0.0104166666667 + +in[(i+2)*n+(j+2)] * 0.03125 + +in[(i+3)*n+(j+2)] * 0.00416666666667 +in[(i+4)*n+(j+2)] * 0.00223214285714 + +in[(i-4)*n+(j+3)] * -0.00223214285714 + +in[(i-2)*n+(j+3)] * 0.00416666666667 + +in[(i-1)*n+(j+3)] * 0.00416666666667 + +in[(i)*n+(j+3)] * 0.00416666666667 + +in[(i+1)*n+(j+3)] * 0.00416666666667 + +in[(i+2)*n+(j+3)] * 0.00416666666667 + +in[(i+3)*n+(j+3)] * 0.0208333333333 +in[(i+4)*n+(j+3)] * 0.00223214285714 + +in[(i-3)*n+(j+4)] 
* 0.00223214285714 + +in[(i-2)*n+(j+4)] * 0.00223214285714 + +in[(i-1)*n+(j+4)] * 0.00223214285714 + +in[(i)*n+(j+4)] * 0.00223214285714 + +in[(i+1)*n+(j+4)] * 0.00223214285714 + +in[(i+2)*n+(j+4)] * 0.00223214285714 + +in[(i+3)*n+(j+4)] * 0.00223214285714 +in[(i+4)*n+(j+4)] * 0.015625 ; }); @@ -268,115 +268,115 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { RAJA::forall(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type i) { RAJA::forall(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) { - out[i*n+j] += +in[(i+-5)*n+(j+-5)] * -0.01 - +in[(i+-5)*n+(j+-4)] * -0.00111111111111 - +in[(i+-5)*n+(j+-3)] * -0.00111111111111 - +in[(i+-5)*n+(j+-2)] * -0.00111111111111 - +in[(i+-5)*n+(j+-1)] * -0.00111111111111 - +in[(i+-5)*n+(j+0)] * -0.00111111111111 - +in[(i+-5)*n+(j+1)] * -0.00111111111111 - +in[(i+-5)*n+(j+2)] * -0.00111111111111 - +in[(i+-5)*n+(j+3)] * -0.00111111111111 - +in[(i+-5)*n+(j+4)] * -0.00111111111111 - +in[(i+-4)*n+(j+-5)] * -0.00111111111111 - +in[(i+-4)*n+(j+-4)] * -0.0125 - +in[(i+-4)*n+(j+-3)] * -0.00178571428571 - +in[(i+-4)*n+(j+-2)] * -0.00178571428571 - +in[(i+-4)*n+(j+-1)] * -0.00178571428571 - +in[(i+-4)*n+(j+0)] * -0.00178571428571 - +in[(i+-4)*n+(j+1)] * -0.00178571428571 - +in[(i+-4)*n+(j+2)] * -0.00178571428571 - +in[(i+-4)*n+(j+3)] * -0.00178571428571 - +in[(i+-4)*n+(j+5)] * 0.00111111111111 - +in[(i+-3)*n+(j+-5)] * -0.00111111111111 - +in[(i+-3)*n+(j+-4)] * -0.00178571428571 - +in[(i+-3)*n+(j+-3)] * -0.0166666666667 - +in[(i+-3)*n+(j+-2)] * -0.00333333333333 - +in[(i+-3)*n+(j+-1)] * -0.00333333333333 - +in[(i+-3)*n+(j+0)] * -0.00333333333333 - +in[(i+-3)*n+(j+1)] * -0.00333333333333 - +in[(i+-3)*n+(j+2)] * -0.00333333333333 - +in[(i+-3)*n+(j+4)] * 0.00178571428571 - +in[(i+-3)*n+(j+5)] * 0.00111111111111 - +in[(i+-2)*n+(j+-5)] * -0.00111111111111 - +in[(i+-2)*n+(j+-4)] * -0.00178571428571 - +in[(i+-2)*n+(j+-3)] * -0.00333333333333 - +in[(i+-2)*n+(j+-2)] * 
-0.025 - +in[(i+-2)*n+(j+-1)] * -0.00833333333333 - +in[(i+-2)*n+(j+0)] * -0.00833333333333 - +in[(i+-2)*n+(j+1)] * -0.00833333333333 - +in[(i+-2)*n+(j+3)] * 0.00333333333333 - +in[(i+-2)*n+(j+4)] * 0.00178571428571 - +in[(i+-2)*n+(j+5)] * 0.00111111111111 - +in[(i+-1)*n+(j+-5)] * -0.00111111111111 - +in[(i+-1)*n+(j+-4)] * -0.00178571428571 - +in[(i+-1)*n+(j+-3)] * -0.00333333333333 - +in[(i+-1)*n+(j+-2)] * -0.00833333333333 - +in[(i+-1)*n+(j+-1)] * -0.05 - +in[(i+-1)*n+(j+0)] * -0.05 - +in[(i+-1)*n+(j+2)] * 0.00833333333333 - +in[(i+-1)*n+(j+3)] * 0.00333333333333 - +in[(i+-1)*n+(j+4)] * 0.00178571428571 - +in[(i+-1)*n+(j+5)] * 0.00111111111111 - +in[(i+0)*n+(j+-5)] * -0.00111111111111 - +in[(i+0)*n+(j+-4)] * -0.00178571428571 - +in[(i+0)*n+(j+-3)] * -0.00333333333333 - +in[(i+0)*n+(j+-2)] * -0.00833333333333 - +in[(i+0)*n+(j+-1)] * -0.05 - +in[(i+0)*n+(j+1)] * 0.05 - +in[(i+0)*n+(j+2)] * 0.00833333333333 - +in[(i+0)*n+(j+3)] * 0.00333333333333 - +in[(i+0)*n+(j+4)] * 0.00178571428571 - +in[(i+0)*n+(j+5)] * 0.00111111111111 - +in[(i+1)*n+(j+-5)] * -0.00111111111111 - +in[(i+1)*n+(j+-4)] * -0.00178571428571 - +in[(i+1)*n+(j+-3)] * -0.00333333333333 - +in[(i+1)*n+(j+-2)] * -0.00833333333333 - +in[(i+1)*n+(j+0)] * 0.05 + out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01 + +in[(i-4)*n+(j-5)] * -0.00111111111111 + +in[(i-3)*n+(j-5)] * -0.00111111111111 + +in[(i-2)*n+(j-5)] * -0.00111111111111 + +in[(i-1)*n+(j-5)] * -0.00111111111111 + +in[(i)*n+(j-5)] * -0.00111111111111 + +in[(i+1)*n+(j-5)] * -0.00111111111111 + +in[(i+2)*n+(j-5)] * -0.00111111111111 + +in[(i+3)*n+(j-5)] * -0.00111111111111 + +in[(i+4)*n+(j-5)] * -0.00111111111111 + +in[(i-5)*n+(j-4)] * -0.00111111111111 + +in[(i-4)*n+(j-4)] * -0.0125 + +in[(i-3)*n+(j-4)] * -0.00178571428571 + +in[(i-2)*n+(j-4)] * -0.00178571428571 + +in[(i-1)*n+(j-4)] * -0.00178571428571 + +in[(i)*n+(j-4)] * -0.00178571428571 + +in[(i+1)*n+(j-4)] * -0.00178571428571 + +in[(i+2)*n+(j-4)] * -0.00178571428571 + +in[(i+3)*n+(j-4)] * 
-0.00178571428571 + +in[(i+5)*n+(j-4)] * 0.00111111111111 + +in[(i-5)*n+(j-3)] * -0.00111111111111 + +in[(i-4)*n+(j-3)] * -0.00178571428571 + +in[(i-3)*n+(j-3)] * -0.0166666666667 + +in[(i-2)*n+(j-3)] * -0.00333333333333 + +in[(i-1)*n+(j-3)] * -0.00333333333333 + +in[(i)*n+(j-3)] * -0.00333333333333 + +in[(i+1)*n+(j-3)] * -0.00333333333333 + +in[(i+2)*n+(j-3)] * -0.00333333333333 + +in[(i+4)*n+(j-3)] * 0.00178571428571 + +in[(i+5)*n+(j-3)] * 0.00111111111111 + +in[(i-5)*n+(j-2)] * -0.00111111111111 + +in[(i-4)*n+(j-2)] * -0.00178571428571 + +in[(i-3)*n+(j-2)] * -0.00333333333333 + +in[(i-2)*n+(j-2)] * -0.025 + +in[(i-1)*n+(j-2)] * -0.00833333333333 + +in[(i)*n+(j-2)] * -0.00833333333333 + +in[(i+1)*n+(j-2)] * -0.00833333333333 + +in[(i+3)*n+(j-2)] * 0.00333333333333 + +in[(i+4)*n+(j-2)] * 0.00178571428571 + +in[(i+5)*n+(j-2)] * 0.00111111111111 + +in[(i-5)*n+(j-1)] * -0.00111111111111 + +in[(i-4)*n+(j-1)] * -0.00178571428571 + +in[(i-3)*n+(j-1)] * -0.00333333333333 + +in[(i-2)*n+(j-1)] * -0.00833333333333 + +in[(i-1)*n+(j-1)] * -0.05 + +in[(i)*n+(j-1)] * -0.05 + +in[(i+2)*n+(j-1)] * 0.00833333333333 + +in[(i+3)*n+(j-1)] * 0.00333333333333 + +in[(i+4)*n+(j-1)] * 0.00178571428571 + +in[(i+5)*n+(j-1)] * 0.00111111111111 + +in[(i-5)*n+(j)] * -0.00111111111111 + +in[(i-4)*n+(j)] * -0.00178571428571 + +in[(i-3)*n+(j)] * -0.00333333333333 + +in[(i-2)*n+(j)] * -0.00833333333333 + +in[(i-1)*n+(j)] * -0.05 + +in[(i+1)*n+(j)] * 0.05 + +in[(i+2)*n+(j)] * 0.00833333333333 + +in[(i+3)*n+(j)] * 0.00333333333333 + +in[(i+4)*n+(j)] * 0.00178571428571 + +in[(i+5)*n+(j)] * 0.00111111111111 + +in[(i-5)*n+(j+1)] * -0.00111111111111 + +in[(i-4)*n+(j+1)] * -0.00178571428571 + +in[(i-3)*n+(j+1)] * -0.00333333333333 + +in[(i-2)*n+(j+1)] * -0.00833333333333 + +in[(i)*n+(j+1)] * 0.05 +in[(i+1)*n+(j+1)] * 0.05 - +in[(i+1)*n+(j+2)] * 0.00833333333333 - +in[(i+1)*n+(j+3)] * 0.00333333333333 - +in[(i+1)*n+(j+4)] * 0.00178571428571 - +in[(i+1)*n+(j+5)] * 0.00111111111111 - +in[(i+2)*n+(j+-5)] * 
-0.00111111111111 - +in[(i+2)*n+(j+-4)] * -0.00178571428571 - +in[(i+2)*n+(j+-3)] * -0.00333333333333 - +in[(i+2)*n+(j+-1)] * 0.00833333333333 - +in[(i+2)*n+(j+0)] * 0.00833333333333 +in[(i+2)*n+(j+1)] * 0.00833333333333 - +in[(i+2)*n+(j+2)] * 0.025 - +in[(i+2)*n+(j+3)] * 0.00333333333333 - +in[(i+2)*n+(j+4)] * 0.00178571428571 - +in[(i+2)*n+(j+5)] * 0.00111111111111 - +in[(i+3)*n+(j+-5)] * -0.00111111111111 - +in[(i+3)*n+(j+-4)] * -0.00178571428571 - +in[(i+3)*n+(j+-2)] * 0.00333333333333 - +in[(i+3)*n+(j+-1)] * 0.00333333333333 - +in[(i+3)*n+(j+0)] * 0.00333333333333 +in[(i+3)*n+(j+1)] * 0.00333333333333 - +in[(i+3)*n+(j+2)] * 0.00333333333333 - +in[(i+3)*n+(j+3)] * 0.0166666666667 - +in[(i+3)*n+(j+4)] * 0.00178571428571 - +in[(i+3)*n+(j+5)] * 0.00111111111111 - +in[(i+4)*n+(j+-5)] * -0.00111111111111 - +in[(i+4)*n+(j+-3)] * 0.00178571428571 - +in[(i+4)*n+(j+-2)] * 0.00178571428571 - +in[(i+4)*n+(j+-1)] * 0.00178571428571 - +in[(i+4)*n+(j+0)] * 0.00178571428571 +in[(i+4)*n+(j+1)] * 0.00178571428571 - +in[(i+4)*n+(j+2)] * 0.00178571428571 - +in[(i+4)*n+(j+3)] * 0.00178571428571 - +in[(i+4)*n+(j+4)] * 0.0125 - +in[(i+4)*n+(j+5)] * 0.00111111111111 - +in[(i+5)*n+(j+-4)] * 0.00111111111111 - +in[(i+5)*n+(j+-3)] * 0.00111111111111 - +in[(i+5)*n+(j+-2)] * 0.00111111111111 - +in[(i+5)*n+(j+-1)] * 0.00111111111111 - +in[(i+5)*n+(j+0)] * 0.00111111111111 +in[(i+5)*n+(j+1)] * 0.00111111111111 + +in[(i-5)*n+(j+2)] * -0.00111111111111 + +in[(i-4)*n+(j+2)] * -0.00178571428571 + +in[(i-3)*n+(j+2)] * -0.00333333333333 + +in[(i-1)*n+(j+2)] * 0.00833333333333 + +in[(i)*n+(j+2)] * 0.00833333333333 + +in[(i+1)*n+(j+2)] * 0.00833333333333 + +in[(i+2)*n+(j+2)] * 0.025 + +in[(i+3)*n+(j+2)] * 0.00333333333333 + +in[(i+4)*n+(j+2)] * 0.00178571428571 +in[(i+5)*n+(j+2)] * 0.00111111111111 + +in[(i-5)*n+(j+3)] * -0.00111111111111 + +in[(i-4)*n+(j+3)] * -0.00178571428571 + +in[(i-2)*n+(j+3)] * 0.00333333333333 + +in[(i-1)*n+(j+3)] * 0.00333333333333 + +in[(i)*n+(j+3)] * 0.00333333333333 + 
+in[(i+1)*n+(j+3)] * 0.00333333333333 + +in[(i+2)*n+(j+3)] * 0.00333333333333 + +in[(i+3)*n+(j+3)] * 0.0166666666667 + +in[(i+4)*n+(j+3)] * 0.00178571428571 +in[(i+5)*n+(j+3)] * 0.00111111111111 + +in[(i-5)*n+(j+4)] * -0.00111111111111 + +in[(i-3)*n+(j+4)] * 0.00178571428571 + +in[(i-2)*n+(j+4)] * 0.00178571428571 + +in[(i-1)*n+(j+4)] * 0.00178571428571 + +in[(i)*n+(j+4)] * 0.00178571428571 + +in[(i+1)*n+(j+4)] * 0.00178571428571 + +in[(i+2)*n+(j+4)] * 0.00178571428571 + +in[(i+3)*n+(j+4)] * 0.00178571428571 + +in[(i+4)*n+(j+4)] * 0.0125 +in[(i+5)*n+(j+4)] * 0.00111111111111 + +in[(i-4)*n+(j+5)] * 0.00111111111111 + +in[(i-3)*n+(j+5)] * 0.00111111111111 + +in[(i-2)*n+(j+5)] * 0.00111111111111 + +in[(i-1)*n+(j+5)] * 0.00111111111111 + +in[(i)*n+(j+5)] * 0.00111111111111 + +in[(i+1)*n+(j+5)] * 0.00111111111111 + +in[(i+2)*n+(j+5)] * 0.00111111111111 + +in[(i+3)*n+(j+5)] * 0.00111111111111 + +in[(i+4)*n+(j+5)] * 0.00111111111111 +in[(i+5)*n+(j+5)] * 0.01 ; }); diff --git a/Cxx11/stencil_rajaview.hpp b/Cxx11/stencil_rajaview.hpp index 4a521770f..e6c9c6565 100644 --- a/Cxx11/stencil_rajaview.hpp +++ b/Cxx11/stencil_rajaview.hpp @@ -6,10 +6,10 @@ void star1(const int n, const int t, matrix & in, matrix & out) { RAJA::RangeSegment inner1(1,n-1); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { - out(i,j) += +in(i+-1,j+0) * -0.5 - +in(i+0,j+-1) * -0.5 - +in(i+0,j+1) * 0.5 - +in(i+1,j+0) * 0.5; + out(i,j) += +in(i,j-1) * -0.5 + +in(i-1,j) * -0.5 + +in(i+1,j) * 0.5 + +in(i,j+1) * 0.5; }); } @@ -17,14 +17,14 @@ void star2(const int n, const int t, matrix & in, matrix & out) { RAJA::RangeSegment inner1(2,n-2); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { - out(i,j) += +in(i+-2,j+0) * -0.125 - +in(i+-1,j+0) * -0.25 - +in(i+0,j+-2) * -0.125 - +in(i+0,j+-1) * -0.25 - +in(i+0,j+1) * 0.25 - +in(i+0,j+2) * 0.125 - +in(i+1,j+0) * 0.25 - +in(i+2,j+0) * 0.125; + out(i,j) += +in(i,j-2) * -0.125 + 
+in(i,j-1) * -0.25 + +in(i-2,j) * -0.125 + +in(i-1,j) * -0.25 + +in(i+1,j) * 0.25 + +in(i+2,j) * 0.125 + +in(i,j+1) * 0.25 + +in(i,j+2) * 0.125; }); } @@ -32,18 +32,18 @@ void star3(const int n, const int t, matrix & in, matrix & out) { RAJA::RangeSegment inner1(3,n-3); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { - out(i,j) += +in(i+-3,j+0) * -0.0555555555556 - +in(i+-2,j+0) * -0.0833333333333 - +in(i+-1,j+0) * -0.166666666667 - +in(i+0,j+-3) * -0.0555555555556 - +in(i+0,j+-2) * -0.0833333333333 - +in(i+0,j+-1) * -0.166666666667 - +in(i+0,j+1) * 0.166666666667 - +in(i+0,j+2) * 0.0833333333333 - +in(i+0,j+3) * 0.0555555555556 - +in(i+1,j+0) * 0.166666666667 - +in(i+2,j+0) * 0.0833333333333 - +in(i+3,j+0) * 0.0555555555556; + out(i,j) += +in(i,j-3) * -0.0555555555556 + +in(i,j-2) * -0.0833333333333 + +in(i,j-1) * -0.166666666667 + +in(i-3,j) * -0.0555555555556 + +in(i-2,j) * -0.0833333333333 + +in(i-1,j) * -0.166666666667 + +in(i+1,j) * 0.166666666667 + +in(i+2,j) * 0.0833333333333 + +in(i+3,j) * 0.0555555555556 + +in(i,j+1) * 0.166666666667 + +in(i,j+2) * 0.0833333333333 + +in(i,j+3) * 0.0555555555556; }); } @@ -51,22 +51,22 @@ void star4(const int n, const int t, matrix & in, matrix & out) { RAJA::RangeSegment inner1(4,n-4); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { - out(i,j) += +in(i+-4,j+0) * -0.03125 - +in(i+-3,j+0) * -0.0416666666667 - +in(i+-2,j+0) * -0.0625 - +in(i+-1,j+0) * -0.125 - +in(i+0,j+-4) * -0.03125 - +in(i+0,j+-3) * -0.0416666666667 - +in(i+0,j+-2) * -0.0625 - +in(i+0,j+-1) * -0.125 - +in(i+0,j+1) * 0.125 - +in(i+0,j+2) * 0.0625 - +in(i+0,j+3) * 0.0416666666667 - +in(i+0,j+4) * 0.03125 - +in(i+1,j+0) * 0.125 - +in(i+2,j+0) * 0.0625 - +in(i+3,j+0) * 0.0416666666667 - +in(i+4,j+0) * 0.03125; + out(i,j) += +in(i,j-4) * -0.03125 + +in(i,j-3) * -0.0416666666667 + +in(i,j-2) * -0.0625 + +in(i,j-1) * -0.125 + +in(i-4,j) * -0.03125 + +in(i-3,j) * -0.0416666666667 + 
+in(i-2,j) * -0.0625 + +in(i-1,j) * -0.125 + +in(i+1,j) * 0.125 + +in(i+2,j) * 0.0625 + +in(i+3,j) * 0.0416666666667 + +in(i+4,j) * 0.03125 + +in(i,j+1) * 0.125 + +in(i,j+2) * 0.0625 + +in(i,j+3) * 0.0416666666667 + +in(i,j+4) * 0.03125; }); } @@ -74,26 +74,26 @@ void star5(const int n, const int t, matrix & in, matrix & out) { RAJA::RangeSegment inner1(5,n-5); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { - out(i,j) += +in(i+-5,j+0) * -0.02 - +in(i+-4,j+0) * -0.025 - +in(i+-3,j+0) * -0.0333333333333 - +in(i+-2,j+0) * -0.05 - +in(i+-1,j+0) * -0.1 - +in(i+0,j+-5) * -0.02 - +in(i+0,j+-4) * -0.025 - +in(i+0,j+-3) * -0.0333333333333 - +in(i+0,j+-2) * -0.05 - +in(i+0,j+-1) * -0.1 - +in(i+0,j+1) * 0.1 - +in(i+0,j+2) * 0.05 - +in(i+0,j+3) * 0.0333333333333 - +in(i+0,j+4) * 0.025 - +in(i+0,j+5) * 0.02 - +in(i+1,j+0) * 0.1 - +in(i+2,j+0) * 0.05 - +in(i+3,j+0) * 0.0333333333333 - +in(i+4,j+0) * 0.025 - +in(i+5,j+0) * 0.02; + out(i,j) += +in(i,j-5) * -0.02 + +in(i,j-4) * -0.025 + +in(i,j-3) * -0.0333333333333 + +in(i,j-2) * -0.05 + +in(i,j-1) * -0.1 + +in(i-5,j) * -0.02 + +in(i-4,j) * -0.025 + +in(i-3,j) * -0.0333333333333 + +in(i-2,j) * -0.05 + +in(i-1,j) * -0.1 + +in(i+1,j) * 0.1 + +in(i+2,j) * 0.05 + +in(i+3,j) * 0.0333333333333 + +in(i+4,j) * 0.025 + +in(i+5,j) * 0.02 + +in(i,j+1) * 0.1 + +in(i,j+2) * 0.05 + +in(i,j+3) * 0.0333333333333 + +in(i,j+4) * 0.025 + +in(i,j+5) * 0.02; }); } @@ -101,11 +101,11 @@ void grid1(const int n, const int t, matrix & in, matrix & out) { RAJA::RangeSegment inner1(1,n-1); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { - out(i,j) += +in(i+-1,j+-1) * -0.25 - +in(i+-1,j+0) * -0.25 - +in(i+0,j+-1) * -0.25 - +in(i+0,j+1) * 0.25 - +in(i+1,j+0) * 0.25 + out(i,j) += +in(i-1,j-1) * -0.25 + +in(i,j-1) * -0.25 + +in(i-1,j) * -0.25 + +in(i+1,j) * 0.25 + +in(i,j+1) * 0.25 +in(i+1,j+1) * 0.25 ; }); @@ -115,25 +115,25 @@ void grid2(const int n, const int t, matrix & in, 
matrix & out) { RAJA::RangeSegment inner1(2,n-2); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { - out(i,j) += +in(i+-2,j+-2) * -0.0625 - +in(i+-2,j+-1) * -0.0208333333333 - +in(i+-2,j+0) * -0.0208333333333 - +in(i+-2,j+1) * -0.0208333333333 - +in(i+-1,j+-2) * -0.0208333333333 - +in(i+-1,j+-1) * -0.125 - +in(i+-1,j+0) * -0.125 - +in(i+-1,j+2) * 0.0208333333333 - +in(i+0,j+-2) * -0.0208333333333 - +in(i+0,j+-1) * -0.125 - +in(i+0,j+1) * 0.125 - +in(i+0,j+2) * 0.0208333333333 - +in(i+1,j+-2) * -0.0208333333333 - +in(i+1,j+0) * 0.125 + out(i,j) += +in(i-2,j-2) * -0.0625 + +in(i-1,j-2) * -0.0208333333333 + +in(i,j-2) * -0.0208333333333 + +in(i+1,j-2) * -0.0208333333333 + +in(i-2,j-1) * -0.0208333333333 + +in(i-1,j-1) * -0.125 + +in(i,j-1) * -0.125 + +in(i+2,j-1) * 0.0208333333333 + +in(i-2,j) * -0.0208333333333 + +in(i-1,j) * -0.125 + +in(i+1,j) * 0.125 + +in(i+2,j) * 0.0208333333333 + +in(i-2,j+1) * -0.0208333333333 + +in(i,j+1) * 0.125 +in(i+1,j+1) * 0.125 - +in(i+1,j+2) * 0.0208333333333 - +in(i+2,j+-1) * 0.0208333333333 - +in(i+2,j+0) * 0.0208333333333 +in(i+2,j+1) * 0.0208333333333 + +in(i-1,j+2) * 0.0208333333333 + +in(i,j+2) * 0.0208333333333 + +in(i+1,j+2) * 0.0208333333333 +in(i+2,j+2) * 0.0625 ; }); @@ -143,47 +143,47 @@ void grid3(const int n, const int t, matrix & in, matrix & out) { RAJA::RangeSegment inner1(3,n-3); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { - out(i,j) += +in(i+-3,j+-3) * -0.0277777777778 - +in(i+-3,j+-2) * -0.00555555555556 - +in(i+-3,j+-1) * -0.00555555555556 - +in(i+-3,j+0) * -0.00555555555556 - +in(i+-3,j+1) * -0.00555555555556 - +in(i+-3,j+2) * -0.00555555555556 - +in(i+-2,j+-3) * -0.00555555555556 - +in(i+-2,j+-2) * -0.0416666666667 - +in(i+-2,j+-1) * -0.0138888888889 - +in(i+-2,j+0) * -0.0138888888889 - +in(i+-2,j+1) * -0.0138888888889 - +in(i+-2,j+3) * 0.00555555555556 - +in(i+-1,j+-3) * -0.00555555555556 - +in(i+-1,j+-2) * -0.0138888888889 - 
+in(i+-1,j+-1) * -0.0833333333333 - +in(i+-1,j+0) * -0.0833333333333 - +in(i+-1,j+2) * 0.0138888888889 - +in(i+-1,j+3) * 0.00555555555556 - +in(i+0,j+-3) * -0.00555555555556 - +in(i+0,j+-2) * -0.0138888888889 - +in(i+0,j+-1) * -0.0833333333333 - +in(i+0,j+1) * 0.0833333333333 - +in(i+0,j+2) * 0.0138888888889 - +in(i+0,j+3) * 0.00555555555556 - +in(i+1,j+-3) * -0.00555555555556 - +in(i+1,j+-2) * -0.0138888888889 - +in(i+1,j+0) * 0.0833333333333 + out(i,j) += +in(i-3,j-3) * -0.0277777777778 + +in(i-2,j-3) * -0.00555555555556 + +in(i-1,j-3) * -0.00555555555556 + +in(i,j-3) * -0.00555555555556 + +in(i+1,j-3) * -0.00555555555556 + +in(i+2,j-3) * -0.00555555555556 + +in(i-3,j-2) * -0.00555555555556 + +in(i-2,j-2) * -0.0416666666667 + +in(i-1,j-2) * -0.0138888888889 + +in(i,j-2) * -0.0138888888889 + +in(i+1,j-2) * -0.0138888888889 + +in(i+3,j-2) * 0.00555555555556 + +in(i-3,j-1) * -0.00555555555556 + +in(i-2,j-1) * -0.0138888888889 + +in(i-1,j-1) * -0.0833333333333 + +in(i,j-1) * -0.0833333333333 + +in(i+2,j-1) * 0.0138888888889 + +in(i+3,j-1) * 0.00555555555556 + +in(i-3,j) * -0.00555555555556 + +in(i-2,j) * -0.0138888888889 + +in(i-1,j) * -0.0833333333333 + +in(i+1,j) * 0.0833333333333 + +in(i+2,j) * 0.0138888888889 + +in(i+3,j) * 0.00555555555556 + +in(i-3,j+1) * -0.00555555555556 + +in(i-2,j+1) * -0.0138888888889 + +in(i,j+1) * 0.0833333333333 +in(i+1,j+1) * 0.0833333333333 - +in(i+1,j+2) * 0.0138888888889 - +in(i+1,j+3) * 0.00555555555556 - +in(i+2,j+-3) * -0.00555555555556 - +in(i+2,j+-1) * 0.0138888888889 - +in(i+2,j+0) * 0.0138888888889 +in(i+2,j+1) * 0.0138888888889 - +in(i+2,j+2) * 0.0416666666667 - +in(i+2,j+3) * 0.00555555555556 - +in(i+3,j+-2) * 0.00555555555556 - +in(i+3,j+-1) * 0.00555555555556 - +in(i+3,j+0) * 0.00555555555556 +in(i+3,j+1) * 0.00555555555556 + +in(i-3,j+2) * -0.00555555555556 + +in(i-1,j+2) * 0.0138888888889 + +in(i,j+2) * 0.0138888888889 + +in(i+1,j+2) * 0.0138888888889 + +in(i+2,j+2) * 0.0416666666667 +in(i+3,j+2) * 0.00555555555556 + 
+in(i-2,j+3) * 0.00555555555556 + +in(i-1,j+3) * 0.00555555555556 + +in(i,j+3) * 0.00555555555556 + +in(i+1,j+3) * 0.00555555555556 + +in(i+2,j+3) * 0.00555555555556 +in(i+3,j+3) * 0.0277777777778 ; }); @@ -193,77 +193,77 @@ void grid4(const int n, const int t, matrix & in, matrix & out) { RAJA::RangeSegment inner1(4,n-4); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { - out(i,j) += +in(i+-4,j+-4) * -0.015625 - +in(i+-4,j+-3) * -0.00223214285714 - +in(i+-4,j+-2) * -0.00223214285714 - +in(i+-4,j+-1) * -0.00223214285714 - +in(i+-4,j+0) * -0.00223214285714 - +in(i+-4,j+1) * -0.00223214285714 - +in(i+-4,j+2) * -0.00223214285714 - +in(i+-4,j+3) * -0.00223214285714 - +in(i+-3,j+-4) * -0.00223214285714 - +in(i+-3,j+-3) * -0.0208333333333 - +in(i+-3,j+-2) * -0.00416666666667 - +in(i+-3,j+-1) * -0.00416666666667 - +in(i+-3,j+0) * -0.00416666666667 - +in(i+-3,j+1) * -0.00416666666667 - +in(i+-3,j+2) * -0.00416666666667 - +in(i+-3,j+4) * 0.00223214285714 - +in(i+-2,j+-4) * -0.00223214285714 - +in(i+-2,j+-3) * -0.00416666666667 - +in(i+-2,j+-2) * -0.03125 - +in(i+-2,j+-1) * -0.0104166666667 - +in(i+-2,j+0) * -0.0104166666667 - +in(i+-2,j+1) * -0.0104166666667 - +in(i+-2,j+3) * 0.00416666666667 - +in(i+-2,j+4) * 0.00223214285714 - +in(i+-1,j+-4) * -0.00223214285714 - +in(i+-1,j+-3) * -0.00416666666667 - +in(i+-1,j+-2) * -0.0104166666667 - +in(i+-1,j+-1) * -0.0625 - +in(i+-1,j+0) * -0.0625 - +in(i+-1,j+2) * 0.0104166666667 - +in(i+-1,j+3) * 0.00416666666667 - +in(i+-1,j+4) * 0.00223214285714 - +in(i+0,j+-4) * -0.00223214285714 - +in(i+0,j+-3) * -0.00416666666667 - +in(i+0,j+-2) * -0.0104166666667 - +in(i+0,j+-1) * -0.0625 - +in(i+0,j+1) * 0.0625 - +in(i+0,j+2) * 0.0104166666667 - +in(i+0,j+3) * 0.00416666666667 - +in(i+0,j+4) * 0.00223214285714 - +in(i+1,j+-4) * -0.00223214285714 - +in(i+1,j+-3) * -0.00416666666667 - +in(i+1,j+-2) * -0.0104166666667 - +in(i+1,j+0) * 0.0625 + out(i,j) += +in(i-4,j-4) * -0.015625 + +in(i-3,j-4) * 
-0.00223214285714 + +in(i-2,j-4) * -0.00223214285714 + +in(i-1,j-4) * -0.00223214285714 + +in(i,j-4) * -0.00223214285714 + +in(i+1,j-4) * -0.00223214285714 + +in(i+2,j-4) * -0.00223214285714 + +in(i+3,j-4) * -0.00223214285714 + +in(i-4,j-3) * -0.00223214285714 + +in(i-3,j-3) * -0.0208333333333 + +in(i-2,j-3) * -0.00416666666667 + +in(i-1,j-3) * -0.00416666666667 + +in(i,j-3) * -0.00416666666667 + +in(i+1,j-3) * -0.00416666666667 + +in(i+2,j-3) * -0.00416666666667 + +in(i+4,j-3) * 0.00223214285714 + +in(i-4,j-2) * -0.00223214285714 + +in(i-3,j-2) * -0.00416666666667 + +in(i-2,j-2) * -0.03125 + +in(i-1,j-2) * -0.0104166666667 + +in(i,j-2) * -0.0104166666667 + +in(i+1,j-2) * -0.0104166666667 + +in(i+3,j-2) * 0.00416666666667 + +in(i+4,j-2) * 0.00223214285714 + +in(i-4,j-1) * -0.00223214285714 + +in(i-3,j-1) * -0.00416666666667 + +in(i-2,j-1) * -0.0104166666667 + +in(i-1,j-1) * -0.0625 + +in(i,j-1) * -0.0625 + +in(i+2,j-1) * 0.0104166666667 + +in(i+3,j-1) * 0.00416666666667 + +in(i+4,j-1) * 0.00223214285714 + +in(i-4,j) * -0.00223214285714 + +in(i-3,j) * -0.00416666666667 + +in(i-2,j) * -0.0104166666667 + +in(i-1,j) * -0.0625 + +in(i+1,j) * 0.0625 + +in(i+2,j) * 0.0104166666667 + +in(i+3,j) * 0.00416666666667 + +in(i+4,j) * 0.00223214285714 + +in(i-4,j+1) * -0.00223214285714 + +in(i-3,j+1) * -0.00416666666667 + +in(i-2,j+1) * -0.0104166666667 + +in(i,j+1) * 0.0625 +in(i+1,j+1) * 0.0625 - +in(i+1,j+2) * 0.0104166666667 - +in(i+1,j+3) * 0.00416666666667 - +in(i+1,j+4) * 0.00223214285714 - +in(i+2,j+-4) * -0.00223214285714 - +in(i+2,j+-3) * -0.00416666666667 - +in(i+2,j+-1) * 0.0104166666667 - +in(i+2,j+0) * 0.0104166666667 +in(i+2,j+1) * 0.0104166666667 - +in(i+2,j+2) * 0.03125 - +in(i+2,j+3) * 0.00416666666667 - +in(i+2,j+4) * 0.00223214285714 - +in(i+3,j+-4) * -0.00223214285714 - +in(i+3,j+-2) * 0.00416666666667 - +in(i+3,j+-1) * 0.00416666666667 - +in(i+3,j+0) * 0.00416666666667 +in(i+3,j+1) * 0.00416666666667 - +in(i+3,j+2) * 0.00416666666667 - +in(i+3,j+3) * 
0.0208333333333 - +in(i+3,j+4) * 0.00223214285714 - +in(i+4,j+-3) * 0.00223214285714 - +in(i+4,j+-2) * 0.00223214285714 - +in(i+4,j+-1) * 0.00223214285714 - +in(i+4,j+0) * 0.00223214285714 +in(i+4,j+1) * 0.00223214285714 + +in(i-4,j+2) * -0.00223214285714 + +in(i-3,j+2) * -0.00416666666667 + +in(i-1,j+2) * 0.0104166666667 + +in(i,j+2) * 0.0104166666667 + +in(i+1,j+2) * 0.0104166666667 + +in(i+2,j+2) * 0.03125 + +in(i+3,j+2) * 0.00416666666667 +in(i+4,j+2) * 0.00223214285714 + +in(i-4,j+3) * -0.00223214285714 + +in(i-2,j+3) * 0.00416666666667 + +in(i-1,j+3) * 0.00416666666667 + +in(i,j+3) * 0.00416666666667 + +in(i+1,j+3) * 0.00416666666667 + +in(i+2,j+3) * 0.00416666666667 + +in(i+3,j+3) * 0.0208333333333 +in(i+4,j+3) * 0.00223214285714 + +in(i-3,j+4) * 0.00223214285714 + +in(i-2,j+4) * 0.00223214285714 + +in(i-1,j+4) * 0.00223214285714 + +in(i,j+4) * 0.00223214285714 + +in(i+1,j+4) * 0.00223214285714 + +in(i+2,j+4) * 0.00223214285714 + +in(i+3,j+4) * 0.00223214285714 +in(i+4,j+4) * 0.015625 ; }); @@ -273,115 +273,115 @@ void grid5(const int n, const int t, matrix & in, matrix & out) { RAJA::RangeSegment inner1(5,n-5); auto inner2 = RAJA::make_tuple(inner1, inner1); RAJA::kernel(inner2, [=](int i, int j) { - out(i,j) += +in(i+-5,j+-5) * -0.01 - +in(i+-5,j+-4) * -0.00111111111111 - +in(i+-5,j+-3) * -0.00111111111111 - +in(i+-5,j+-2) * -0.00111111111111 - +in(i+-5,j+-1) * -0.00111111111111 - +in(i+-5,j+0) * -0.00111111111111 - +in(i+-5,j+1) * -0.00111111111111 - +in(i+-5,j+2) * -0.00111111111111 - +in(i+-5,j+3) * -0.00111111111111 - +in(i+-5,j+4) * -0.00111111111111 - +in(i+-4,j+-5) * -0.00111111111111 - +in(i+-4,j+-4) * -0.0125 - +in(i+-4,j+-3) * -0.00178571428571 - +in(i+-4,j+-2) * -0.00178571428571 - +in(i+-4,j+-1) * -0.00178571428571 - +in(i+-4,j+0) * -0.00178571428571 - +in(i+-4,j+1) * -0.00178571428571 - +in(i+-4,j+2) * -0.00178571428571 - +in(i+-4,j+3) * -0.00178571428571 - +in(i+-4,j+5) * 0.00111111111111 - +in(i+-3,j+-5) * -0.00111111111111 - +in(i+-3,j+-4) 
* -0.00178571428571 - +in(i+-3,j+-3) * -0.0166666666667 - +in(i+-3,j+-2) * -0.00333333333333 - +in(i+-3,j+-1) * -0.00333333333333 - +in(i+-3,j+0) * -0.00333333333333 - +in(i+-3,j+1) * -0.00333333333333 - +in(i+-3,j+2) * -0.00333333333333 - +in(i+-3,j+4) * 0.00178571428571 - +in(i+-3,j+5) * 0.00111111111111 - +in(i+-2,j+-5) * -0.00111111111111 - +in(i+-2,j+-4) * -0.00178571428571 - +in(i+-2,j+-3) * -0.00333333333333 - +in(i+-2,j+-2) * -0.025 - +in(i+-2,j+-1) * -0.00833333333333 - +in(i+-2,j+0) * -0.00833333333333 - +in(i+-2,j+1) * -0.00833333333333 - +in(i+-2,j+3) * 0.00333333333333 - +in(i+-2,j+4) * 0.00178571428571 - +in(i+-2,j+5) * 0.00111111111111 - +in(i+-1,j+-5) * -0.00111111111111 - +in(i+-1,j+-4) * -0.00178571428571 - +in(i+-1,j+-3) * -0.00333333333333 - +in(i+-1,j+-2) * -0.00833333333333 - +in(i+-1,j+-1) * -0.05 - +in(i+-1,j+0) * -0.05 - +in(i+-1,j+2) * 0.00833333333333 - +in(i+-1,j+3) * 0.00333333333333 - +in(i+-1,j+4) * 0.00178571428571 - +in(i+-1,j+5) * 0.00111111111111 - +in(i+0,j+-5) * -0.00111111111111 - +in(i+0,j+-4) * -0.00178571428571 - +in(i+0,j+-3) * -0.00333333333333 - +in(i+0,j+-2) * -0.00833333333333 - +in(i+0,j+-1) * -0.05 - +in(i+0,j+1) * 0.05 - +in(i+0,j+2) * 0.00833333333333 - +in(i+0,j+3) * 0.00333333333333 - +in(i+0,j+4) * 0.00178571428571 - +in(i+0,j+5) * 0.00111111111111 - +in(i+1,j+-5) * -0.00111111111111 - +in(i+1,j+-4) * -0.00178571428571 - +in(i+1,j+-3) * -0.00333333333333 - +in(i+1,j+-2) * -0.00833333333333 - +in(i+1,j+0) * 0.05 + out(i,j) += +in(i-5,j-5) * -0.01 + +in(i-4,j-5) * -0.00111111111111 + +in(i-3,j-5) * -0.00111111111111 + +in(i-2,j-5) * -0.00111111111111 + +in(i-1,j-5) * -0.00111111111111 + +in(i,j-5) * -0.00111111111111 + +in(i+1,j-5) * -0.00111111111111 + +in(i+2,j-5) * -0.00111111111111 + +in(i+3,j-5) * -0.00111111111111 + +in(i+4,j-5) * -0.00111111111111 + +in(i-5,j-4) * -0.00111111111111 + +in(i-4,j-4) * -0.0125 + +in(i-3,j-4) * -0.00178571428571 + +in(i-2,j-4) * -0.00178571428571 + +in(i-1,j-4) * 
-0.00178571428571 + +in(i,j-4) * -0.00178571428571 + +in(i+1,j-4) * -0.00178571428571 + +in(i+2,j-4) * -0.00178571428571 + +in(i+3,j-4) * -0.00178571428571 + +in(i+5,j-4) * 0.00111111111111 + +in(i-5,j-3) * -0.00111111111111 + +in(i-4,j-3) * -0.00178571428571 + +in(i-3,j-3) * -0.0166666666667 + +in(i-2,j-3) * -0.00333333333333 + +in(i-1,j-3) * -0.00333333333333 + +in(i,j-3) * -0.00333333333333 + +in(i+1,j-3) * -0.00333333333333 + +in(i+2,j-3) * -0.00333333333333 + +in(i+4,j-3) * 0.00178571428571 + +in(i+5,j-3) * 0.00111111111111 + +in(i-5,j-2) * -0.00111111111111 + +in(i-4,j-2) * -0.00178571428571 + +in(i-3,j-2) * -0.00333333333333 + +in(i-2,j-2) * -0.025 + +in(i-1,j-2) * -0.00833333333333 + +in(i,j-2) * -0.00833333333333 + +in(i+1,j-2) * -0.00833333333333 + +in(i+3,j-2) * 0.00333333333333 + +in(i+4,j-2) * 0.00178571428571 + +in(i+5,j-2) * 0.00111111111111 + +in(i-5,j-1) * -0.00111111111111 + +in(i-4,j-1) * -0.00178571428571 + +in(i-3,j-1) * -0.00333333333333 + +in(i-2,j-1) * -0.00833333333333 + +in(i-1,j-1) * -0.05 + +in(i,j-1) * -0.05 + +in(i+2,j-1) * 0.00833333333333 + +in(i+3,j-1) * 0.00333333333333 + +in(i+4,j-1) * 0.00178571428571 + +in(i+5,j-1) * 0.00111111111111 + +in(i-5,j) * -0.00111111111111 + +in(i-4,j) * -0.00178571428571 + +in(i-3,j) * -0.00333333333333 + +in(i-2,j) * -0.00833333333333 + +in(i-1,j) * -0.05 + +in(i+1,j) * 0.05 + +in(i+2,j) * 0.00833333333333 + +in(i+3,j) * 0.00333333333333 + +in(i+4,j) * 0.00178571428571 + +in(i+5,j) * 0.00111111111111 + +in(i-5,j+1) * -0.00111111111111 + +in(i-4,j+1) * -0.00178571428571 + +in(i-3,j+1) * -0.00333333333333 + +in(i-2,j+1) * -0.00833333333333 + +in(i,j+1) * 0.05 +in(i+1,j+1) * 0.05 - +in(i+1,j+2) * 0.00833333333333 - +in(i+1,j+3) * 0.00333333333333 - +in(i+1,j+4) * 0.00178571428571 - +in(i+1,j+5) * 0.00111111111111 - +in(i+2,j+-5) * -0.00111111111111 - +in(i+2,j+-4) * -0.00178571428571 - +in(i+2,j+-3) * -0.00333333333333 - +in(i+2,j+-1) * 0.00833333333333 - +in(i+2,j+0) * 0.00833333333333 +in(i+2,j+1) * 
0.00833333333333 - +in(i+2,j+2) * 0.025 - +in(i+2,j+3) * 0.00333333333333 - +in(i+2,j+4) * 0.00178571428571 - +in(i+2,j+5) * 0.00111111111111 - +in(i+3,j+-5) * -0.00111111111111 - +in(i+3,j+-4) * -0.00178571428571 - +in(i+3,j+-2) * 0.00333333333333 - +in(i+3,j+-1) * 0.00333333333333 - +in(i+3,j+0) * 0.00333333333333 +in(i+3,j+1) * 0.00333333333333 - +in(i+3,j+2) * 0.00333333333333 - +in(i+3,j+3) * 0.0166666666667 - +in(i+3,j+4) * 0.00178571428571 - +in(i+3,j+5) * 0.00111111111111 - +in(i+4,j+-5) * -0.00111111111111 - +in(i+4,j+-3) * 0.00178571428571 - +in(i+4,j+-2) * 0.00178571428571 - +in(i+4,j+-1) * 0.00178571428571 - +in(i+4,j+0) * 0.00178571428571 +in(i+4,j+1) * 0.00178571428571 - +in(i+4,j+2) * 0.00178571428571 - +in(i+4,j+3) * 0.00178571428571 - +in(i+4,j+4) * 0.0125 - +in(i+4,j+5) * 0.00111111111111 - +in(i+5,j+-4) * 0.00111111111111 - +in(i+5,j+-3) * 0.00111111111111 - +in(i+5,j+-2) * 0.00111111111111 - +in(i+5,j+-1) * 0.00111111111111 - +in(i+5,j+0) * 0.00111111111111 +in(i+5,j+1) * 0.00111111111111 + +in(i-5,j+2) * -0.00111111111111 + +in(i-4,j+2) * -0.00178571428571 + +in(i-3,j+2) * -0.00333333333333 + +in(i-1,j+2) * 0.00833333333333 + +in(i,j+2) * 0.00833333333333 + +in(i+1,j+2) * 0.00833333333333 + +in(i+2,j+2) * 0.025 + +in(i+3,j+2) * 0.00333333333333 + +in(i+4,j+2) * 0.00178571428571 +in(i+5,j+2) * 0.00111111111111 + +in(i-5,j+3) * -0.00111111111111 + +in(i-4,j+3) * -0.00178571428571 + +in(i-2,j+3) * 0.00333333333333 + +in(i-1,j+3) * 0.00333333333333 + +in(i,j+3) * 0.00333333333333 + +in(i+1,j+3) * 0.00333333333333 + +in(i+2,j+3) * 0.00333333333333 + +in(i+3,j+3) * 0.0166666666667 + +in(i+4,j+3) * 0.00178571428571 +in(i+5,j+3) * 0.00111111111111 + +in(i-5,j+4) * -0.00111111111111 + +in(i-3,j+4) * 0.00178571428571 + +in(i-2,j+4) * 0.00178571428571 + +in(i-1,j+4) * 0.00178571428571 + +in(i,j+4) * 0.00178571428571 + +in(i+1,j+4) * 0.00178571428571 + +in(i+2,j+4) * 0.00178571428571 + +in(i+3,j+4) * 0.00178571428571 + +in(i+4,j+4) * 0.0125 +in(i+5,j+4) * 
0.00111111111111 + +in(i-4,j+5) * 0.00111111111111 + +in(i-3,j+5) * 0.00111111111111 + +in(i-2,j+5) * 0.00111111111111 + +in(i-1,j+5) * 0.00111111111111 + +in(i,j+5) * 0.00111111111111 + +in(i+1,j+5) * 0.00111111111111 + +in(i+2,j+5) * 0.00111111111111 + +in(i+3,j+5) * 0.00111111111111 + +in(i+4,j+5) * 0.00111111111111 +in(i+5,j+5) * 0.01 ; }); diff --git a/Cxx11/stencil_rangefor.hpp b/Cxx11/stencil_rangefor.hpp index c85964181..f1ecb729e 100644 --- a/Cxx11/stencil_rangefor.hpp +++ b/Cxx11/stencil_rangefor.hpp @@ -3,10 +3,10 @@ void star1(const int n, const int t, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector & in, std::vector> common/make.defs + echo "SYCLCXX=${PRK_CXX} -pthread -std=c++17" >> common/make.defs else - echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++14" >> common/make.defs + echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++17" >> common/make.defs fi echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs make -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl From 1144a490a3bcde89e5d9066343e5e1acf8bba9fd Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 15 Sep 2018 12:32:06 -0700 
Subject: [PATCH 114/245] Fix Travis issues (#365) * fix Julia syntax issue "1./" is a syntax error now. change to "1.0/" * fix issue with array += scalar --- JULIA/stencil.jl | 52 ++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/JULIA/stencil.jl b/JULIA/stencil.jl index 1b5e18bbb..3b53cf7d5 100644 --- a/JULIA/stencil.jl +++ b/JULIA/stencil.jl @@ -61,6 +61,21 @@ # # ******************************************************************* +function do_add(A, n) + for i=1:n + for j=1:n + A[i,j] += 1.0 + end + end +end + +function do_init(A, n) + for i=1:n + for j=1:n + A[i,j] = i+j-2 + end + end +end function do_star(A, W, B, r, n) for j=r:n-r-1 @@ -151,38 +166,37 @@ function main() if pattern == "star" stencil_size = 4*r+1 for i=1:r - W[r+1,r+i+1] = +1./(2*i*r) - W[r+i+1,r+1] = +1./(2*i*r) - W[r+1,r-i+1] = -1./(2*i*r) - W[r-i+1,r+1] = -1./(2*i*r) + W[r+1,r+i+1] = 1.0/(2*i*r) + W[r+i+1,r+1] = 1.0/(2*i*r) + W[r+1,r-i+1] = -1.0/(2*i*r) + W[r-i+1,r+1] = -1.0/(2*i*r) end else stencil_size = (2*r+1)^2 for j=1:r for i=-j+1:j-1 - W[r+i+1,r+j+1] = +1./(4*j*(2*j-1)*r) - W[r+i+1,r-j+1] = -1./(4*j*(2*j-1)*r) - W[r+j+1,r+i+1] = +1./(4*j*(2*j-1)*r) - W[r-j+1,r+i+1] = -1./(4*j*(2*j-1)*r) + W[r+i+1,r+j+1] = 1.0/(4*j*(2*j-1)*r) + W[r+i+1,r-j+1] = -1.0/(4*j*(2*j-1)*r) + W[r+j+1,r+i+1] = 1.0/(4*j*(2*j-1)*r) + W[r-j+1,r+i+1] = -1.0/(4*j*(2*j-1)*r) end - W[r+j+1,r+j+1] = +1./(4*j*r) - W[r-j+1,r-j+1] = -1./(4*j*r) + W[r+j+1,r+j+1] = 1.0/(4*j*r) + W[r-j+1,r-j+1] = -1.0/(4*j*r) end end - A = zeros(Float64,n,n) - for i=1:n - for j=1:n - A[i,j] = i+j-2 - end - end - B = zeros(Float64,n,n) - + precompile(do_init, (Array{Float64,2}, Int64)) if pattern == "star" precompile(do_star, (Array{Float64,2}, Array{Float64,2}, Array{Float64,2}, Int64, Int64)) else precompile(do_stencil, (Array{Float64,2}, Array{Float64,2}, Array{Float64,2}, Int64, Int64)) end + precompile(do_add, (Array{Float64,2}, Int64)) + + A = zeros(Float64,n,n) + B = 
zeros(Float64,n,n) + + do_init(A, n) t0 = time_ns() @@ -192,7 +206,7 @@ function main() else do_stencil(A, W, B, r, n) end - A += 1.0 + do_add(A, n) end t1 = time_ns() From fba9268665d58012cda21a4205a801dc08552925 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 11 Jan 2019 17:25:44 -0800 Subject: [PATCH 115/245] update CMake (#366) * and they say autotools is brittle... * update SOS version * disable SHMEM on Mac altogether * disable RAJA * disable RAJA * ASLR breaks SOS --- .travis.yml | 4 ++++ travis/build-run-prk.sh | 2 ++ travis/install-cmake.sh | 7 ++----- travis/install-deps.sh | 5 +++-- travis/install-sandia-openshmem.sh | 4 ++-- 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index ac0a9f07c..140751724 100644 --- a/.travis.yml +++ b/.travis.yml @@ -135,6 +135,10 @@ matrix: # Mac issue with thread_t (see https://github.com/humairakamal/fgmpi/pull/1) - os: osx env: PRK_TARGET=allfgmpi + # SOS@OFI has not worked on MacOS in a while :-( + - os: osx + compiler: clang + env: PRK_TARGET=allshmem allow_failures: # Travis trusty breaks this - os: linux diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index ab7363dee..de135e01f 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -591,6 +591,7 @@ case "$PRK_TARGET" in ;; esac # RAJA + if [ 0 = 1 ] ; then make -C $PRK_TARGET_PATH p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \ p2p-raja stencil-raja transpose-raja nstream-raja # New (Views) @@ -620,6 +621,7 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/stencil-raja 10 200 20 $s $r done done + fi # Kokkos make -C $PRK_TARGET_PATH stencil-kokkos transpose-kokkos nstream-kokkos $PRK_TARGET_PATH/stencil-kokkos 10 1000 diff --git a/travis/install-cmake.sh b/travis/install-cmake.sh index 0aa5f8daf..f807d8093 100755 --- a/travis/install-cmake.sh +++ b/travis/install-cmake.sh @@ -18,9 +18,6 @@ case "$os" in echo "Linux" if [ ! 
-d "$TRAVIS_ROOT/cmake" ]; then mkdir -p $TRAVIS_ROOT/cmake - # DEBUG - ls -l $TRAVIS_ROOT - ls -l $TRAVIS_ROOT/cmake # from source #wget --no-check-certificate -q https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz #tar -C $TRAVIS_ROOT -xzf cmake-3.4.1.tar.gz @@ -30,8 +27,8 @@ case "$os" in #make -j4 && make install # from binary cd $TRAVIS_ROOT - wget --no-check-certificate -q https://cmake.org/files/v3.4/cmake-3.4.1-Linux-x86_64.sh - sh ./cmake-3.4.1-Linux-x86_64.sh --prefix=$TRAVIS_ROOT/cmake --skip-license --exclude-subdir + wget --no-check-certificate -q https://github.com/Kitware/CMake/releases/download/v3.13.2/cmake-3.13.2-Linux-x86_64.sh + sh ./cmake-3.13.2-Linux-x86_64.sh --prefix=$TRAVIS_ROOT/cmake --skip-license --exclude-subdir else echo "CMake installed..." find $TRAVIS_ROOT/cmake -name cmake diff --git a/travis/install-deps.sh b/travis/install-deps.sh index 89243cd93..cb79e96e9 100755 --- a/travis/install-deps.sh +++ b/travis/install-deps.sh @@ -66,9 +66,9 @@ case "$PRK_TARGET" in if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then sh ./travis/install-boost.sh $TRAVIS_ROOT fi - # CMake 3.3 or higher is required. You are running version 2.8.7. + # CMake 3.10 or higher is required. sh ./travis/install-cmake.sh $TRAVIS_ROOT - sh ./travis/install-raja.sh $TRAVIS_ROOT + #sh ./travis/install-raja.sh $TRAVIS_ROOT sh ./travis/install-kokkos.sh $TRAVIS_ROOT #sh ./travis/install-occa.sh $TRAVIS_ROOT sh ./travis/install-sycl.sh $TRAVIS_ROOT @@ -94,6 +94,7 @@ case "$PRK_TARGET" in brew link --overwrite gcc || true fi if [ "${CC}" = "gcc" ] ; then + sh ./travis/install-cmake.sh $TRAVIS_ROOT sh ./travis/install-opencoarrays.sh $TRAVIS_ROOT fi ;; diff --git a/travis/install-sandia-openshmem.sh b/travis/install-sandia-openshmem.sh index 308c32d3d..0d046d7ef 100755 --- a/travis/install-sandia-openshmem.sh +++ b/travis/install-sandia-openshmem.sh @@ -16,7 +16,7 @@ if [ ! 
-d "$SHMEM_ROOT" ]; then # HEAD #git clone --depth 1 https://github.com/Sandia-OpenSHMEM/SOS.git sandia-shmem #cd sandia-shmem - VERSION=1.4.0 + VERSION=1.4.2 #git clone -b v$VERSION --depth 1 https://github.com/Sandia-OpenSHMEM/SOS.git SOS-$VERSION wget https://github.com/Sandia-OpenSHMEM/SOS/archive/v$VERSION.tar.gz tar -xzf v$VERSION.tar.gz @@ -28,9 +28,9 @@ if [ ! -d "$SHMEM_ROOT" ]; then ../configure --with-libfabric=$TRAVIS_ROOT/libfabric \ --disable-fortran \ --enable-error-checking \ - --enable-remote-virtual-addressing \ --enable-pmi-simple \ --prefix=$SHMEM_ROOT + #--enable-remote-virtual-addressing \ make make check | true make install From f626ce7d51493358c0788f6f72340f8067b3484a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 9 Feb 2019 09:18:06 -0800 Subject: [PATCH 116/245] trivial fixes (#367) * add no-warning flags for cl.hpp * format fix --- FORTRAN/dgemm.f90 | 2 +- common/make.defs.gcc | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/FORTRAN/dgemm.f90 b/FORTRAN/dgemm.f90 index 5f678c981..a68eff104 100644 --- a/FORTRAN/dgemm.f90 +++ b/FORTRAN/dgemm.f90 @@ -173,7 +173,7 @@ program main if (command_argument_count().lt.2) then write(*,'(a17,i1)') 'argument count = ', command_argument_count() - write(*,'(a62)') 'Usage: ./dgemm-pretty <# iterations> []' + write(*,'(a66)') 'Usage: ./dgemm-pretty <# iterations> []' stop 1 endif diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 732083da1..48e7cc115 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -42,6 +42,7 @@ OPENCLFLAG=-framework OpenCL # Linux #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations # # SYCL flags # From 631ba6f3107ecf03ab065c069cdce94463bc4f44 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 9 Feb 2019 09:18:22 -0800 Subject: [PATCH 117/245] fix CBLAS on MacOS (#368) * add no-warning flags for cl.hpp * fix issues 
with dgemm-cblas on Mac --- Cxx11/dgemm-cblas.cc | 6 +++++- Cxx11/prk_util.h | 6 ++++++ common/make.defs.gcc | 2 +- common/make.defs.intel | 1 + common/make.defs.llvm | 2 +- common/make.defs.pgi | 4 ++++ 6 files changed, 18 insertions(+), 3 deletions(-) diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc index 61a9292fb..24ae52bae 100644 --- a/Cxx11/dgemm-cblas.cc +++ b/Cxx11/dgemm-cblas.cc @@ -73,6 +73,10 @@ #include #endif +#ifdef _OPENMP +#include +#endif + #ifdef PRK_DEBUG #include void prk_dgemm_loops(const int order, @@ -146,7 +150,7 @@ void prk_dgemm(const int order, const int batches, const double beta = 1.0; const int group_count = 1; - const int group_size[group_count] = { batches }; + PRK_UNUSED const int group_size[group_count] = { batches }; const CBLAS_TRANSPOSE transa_array[group_count] = { CblasNoTrans }; const CBLAS_TRANSPOSE transb_array[group_count] = { CblasNoTrans }; diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 321f91c8c..b25dccdf6 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -76,6 +76,12 @@ #define RESTRICT __restrict__ +#if (defined(__cplusplus) && (__cplusplus >= 201703L)) +#define PRK_UNUSED [[maybe_unused]] +#else +#define PRK_UNUSED +#endif + namespace prk { template diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 48e7cc115..e1d68dd5a 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -85,7 +85,7 @@ THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP # # CBLAS for C++ DGEMM # -CBLASFLAG=-DACCELERATE -framework Accelerate +CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # # CUDA flags # diff --git a/common/make.defs.intel b/common/make.defs.intel index 00a781cac..bba53d1bb 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -81,6 +81,7 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} # # CBLAS for C++ DGEMM # +#CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions CBLASFLAG=-DMKL 
-mkl # # CUDA flags diff --git a/common/make.defs.llvm b/common/make.defs.llvm index f4a54c4f8..a5c9010d4 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -108,7 +108,7 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} # # CBLAS for C++ DGEMM # -CBLASFLAG=-DACCELERATE -framework Accelerate +CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # # CUDA flags # diff --git a/common/make.defs.pgi b/common/make.defs.pgi index ddaf99a69..1205afff2 100644 --- a/common/make.defs.pgi +++ b/common/make.defs.pgi @@ -40,6 +40,10 @@ KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPE RAJADIR=./raja RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} # +# CBLAS for C++ DGEMM +# +CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +# # CUDA flags # # Linux w/ NVIDIA CUDA From 236947e8f385f1f2a0d3c0cb1c516d227256a146 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 20 Feb 2019 22:19:49 -0800 Subject: [PATCH 118/245] check system return code in stencil-opencl (#371) * silence GCC warning about ignored return code --- Cxx11/stencil-opencl.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Cxx11/stencil-opencl.cc b/Cxx11/stencil-opencl.cc index 89a261cc9..8db6adfa4 100644 --- a/Cxx11/stencil-opencl.cc +++ b/Cxx11/stencil-opencl.cc @@ -83,7 +83,10 @@ void run(cl::Context context, int iterations, int n, int radius, bool star) std::string command("./generate-opencl-stencil.py "); command += ( star ? 
"star " : "grid " ); command += std::to_string(radius); - std::system( command.c_str() ); + int rc = std::system( command.c_str() ); + if (rc != 0) { + std::cerr << command.c_str() << " returned " << rc << std::endl; + } } source = prk::opencl::loadProgram(filename1); cl::Program program1(context, source, true); From 4195262ff1d926bba638e2791773dfb842a2d2b6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 20 Feb 2019 22:20:15 -0800 Subject: [PATCH 119/245] support FreeBSD (#370) * support FreeBSD example * add README for FreeBSD [ci skip] * modify Travis run script to work locally under FreeBSD --- .gitignore | 2 + common/README.freebsd | 26 ++++++++ common/make.defs.freebsd | 102 +++++++++++++++++++++++++++++ travis/build-run-prk.sh | 138 +++++++++++++++++++++++---------------- 4 files changed, 210 insertions(+), 58 deletions(-) create mode 100644 common/README.freebsd create mode 100644 common/make.defs.freebsd diff --git a/.gitignore b/.gitignore index d4a60c93f..7ed1c8b8c 100644 --- a/.gitignore +++ b/.gitignore @@ -117,12 +117,14 @@ Cxx11/compute Cxx11/triSYCL Cxx11/occa Cxx11/pstl +Cxx11/parallelstl Cxx11/range-v3 Cxx11/dgemm-vector Cxx11/dgemm-cblas Cxx11/dgemm-cublas Cxx11/p2p-openmp-target Cxx11/p2p-tasks-openmp +Cxx11/p2p-tasks-tbb Cxx11/p2p-vector Cxx11/p2p-vector-doacross-openmp Cxx11/p2p-vector-raja diff --git a/common/README.freebsd b/common/README.freebsd new file mode 100644 index 000000000..8a52f24da --- /dev/null +++ b/common/README.freebsd @@ -0,0 +1,26 @@ +This is a rather terse summary of what is required to build the PRKs on FreeBSD. + +# Necessary Packages + +BSD make isn't GNU make, which the PRK assumes. + +sudo pkg install gmake + +I assume GCC works fine as it does on Linux but I tested LLVM. +OpenMP target is not supported by LLVM 6.0.1 so those compilations will fail. 
+ +sudo pkg install clang flang libpgmath + +## C++ dependencies + +sudo pkg install opencl-2.2_1 +sudo pkg install devel/clinfo devel/ocl-icd lang/beignet lang/pocl +sudo pkg install tbb +sudo pkg install boost-all + +You will need to acquire triSYCL and Intel Parallel STL via GitHub. +One minor issue with triSYCL was addressed by patching triSYCL. +I suspect this issue disappears with LLVM 7.0 but you can look up +the issue with `std::optional` on GitHub if necessary. + +RAJA and Kokkos were not tested. diff --git a/common/make.defs.freebsd b/common/make.defs.freebsd new file mode 100644 index 000000000..745813eef --- /dev/null +++ b/common/make.defs.freebsd @@ -0,0 +1,102 @@ +# +# This file shows the LLVM toolchain options for PRKs using +# OpenMP, MPI and/or Fortran coarrays only. +# +# Base compilers and language options +# +LLVM_ROOT=/usr/local/llvm60 +LLVM_PATH=${LLVM_ROOT}/bin/ +# C99 is required in some implementations. +CC=${LLVM_PATH}clang -std=c11 -pthread +# All of the Fortran code is written for the 2008 standard and requires preprocessing. +FC=/usr/local/bin/flang -Mpreprocess -Mfreeform -I/usr/local/flang/include -lexecinfo +# C++11 may not be required but does no harm here. +CXX=${LLVM_PATH}clang++ -std=c++14 -pthread +# +# Compiler flags +# +# -march=native is appropriate for most cases. +# -mtune=native is appropriate if you want portable binaries. +# +DEFAULT_OPT_FLAGS=-g -O3 +#DEFAULT_OPT_FLAGS+=-mllvm -polly -mllvm -polly-vectorizer=stripmine +# +# If you want to be specific, get the architecture options from: +# ${LLVM_PATH}llc --version +# and then get the CPU/ISA options from (e.g. 
for x86-64): +# ${LLVM_PATH}llc -march=x86-64 -mcpu=help +# +# These are useful to understand why the compiler does not vectorize loops: +# DEFAULT_OPT_FLAGS+=-Rpass-analysis=loop-vectorize +# DEFAULT_OPT_FLAGS+=-Rpass=loop-vectorize +# +# OpenMP flags +# +OPENMPFLAG=-fopenmp +OPENMPSIMDFLAG=-fopenmp-simd +OFFLOADFLAG=-fopenmp +#ORNLACCFLAG= # Flang does not support OpenACC +# Mac weirdness +OPENMPFLAG+=-L${LLVM_ROOT}/lib +# BSD weirdness +OPENMPFLAG+=-I${LLVM_ROOT}/lib/clang/6.0.1/include +# +# OpenCL flags +# +OPENCLDIR=/usr/local +OPENCLFLAG=-I${OPENCLDIR}/include -L${OPENCLDIR}/lib -lOpenCL +OPENCLFLAG+=-Wno-deprecated-declarations +# +# OCCA +# +#OCCADIR=${HOME}/prk-repo/Cxx11/occa +# +# TBB +# +TBBDIR=/usr/local +TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +# +# Parallel STL, Boost, etc. +# +BOOSTFLAG=-I/usr/local/include +RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} +KOKKOSDIR=/opt/kokkos/clang +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl +RAJADIR=/opt/raja/clang +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +#THRUSTDIR=/opt/nvidia/thrust +#THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP +# +# SYCL flags +# +# triSYCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
+SYCLDIR=./triSYCL +SYCLCXX=${CXX} ${DEFAULT_OPT_FLAGS} ${OPENMPFLAG} +SYCLFLAG=-std=c++17 -I${SYCLDIR}/include ${BOOSTFLAG} +# +# CBLAS for C++ DGEMM +# +CBLASFLAG= +# +# CUDA flags +# +# Mac w/ CUDA emulation via https://github.com/hughperkins/coriander +NVCC=/opt/llvm/cocl/bin/cocl +# Linux w/ NVIDIA CUDA +#NVCC=nvcc -arch=sm_50 +CUDAFLAGS=-g -O3 -std=c++11 +# https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 +CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +# +# ISPC +# +ISPC=ispc +ISPCFLAG=-O3 --target=host --opt=fast-math +# +# MPI +# +# We assume you have installed an implementation of MPI-3 that is in your path. +MPICC=mpicc diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index de135e01f..194e7ca51 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -12,6 +12,15 @@ if [ -f ~/use-intel-compilers ] ; then export FC=ifort fi +case "$os" in + FreeBSD) + MAKE=gmake + ;; + *) + MAKE=make + ;; +esac + case "$os" in Darwin) # Homebrew should put MPI here... @@ -80,7 +89,7 @@ case "$PRK_TARGET" in allserial) echo "Serial" echo "CC=$CC -std=c99" >> common/make.defs - make $PRK_TARGET + ${MAKE} $PRK_TARGET export PRK_TARGET_PATH=SERIAL $PRK_TARGET_PATH/Synch_p2p/p2p 10 1024 1024 $PRK_TARGET_PATH/Stencil/stencil 10 1000 @@ -139,7 +148,7 @@ case "$PRK_TARGET" in echo "EXTRA_CLIBS=-lm -lpthread" >> common/make.defs # C11 without external parallelism - make -C $PRK_TARGET_PATH p2p stencil transpose p2p-innerloop + ${MAKE} -C $PRK_TARGET_PATH p2p stencil transpose p2p-innerloop $PRK_TARGET_PATH/p2p 10 1024 1024 $PRK_TARGET_PATH/p2p 10 1024 1024 100 100 $PRK_TARGET_PATH/p2p-innerloop 10 1024 @@ -153,7 +162,7 @@ case "$PRK_TARGET" in done # C11 with POSIX or C11 thread parallelism - test POSIX here, C11 at the end. 
- make -C $PRK_TARGET_PATH transpose-thread + ${MAKE} -C $PRK_TARGET_PATH transpose-thread $PRK_TARGET_PATH/transpose-thread 10 1024 512 # C11 with OpenMP @@ -162,7 +171,7 @@ case "$PRK_TARGET" in g*) # Host echo "OPENMPFLAG=-fopenmp" >> common/make.defs - make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp + ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 $PRK_TARGET_PATH/p2p-innerloop-openmp 10 1024 $PRK_TARGET_PATH/stencil-openmp 10 1000 @@ -175,7 +184,7 @@ case "$PRK_TARGET" in done # Offload echo "OFFLOADFLAG=-foffload=\"-O3 -v\"" >> common/make.defs - make -C $PRK_TARGET_PATH target + ${MAKE} -C $PRK_TARGET_PATH target $PRK_TARGET_PATH/stencil-target 10 1000 $PRK_TARGET_PATH/transpose-target 10 1024 32 #echo "Test stencil code generator" @@ -189,7 +198,7 @@ case "$PRK_TARGET" in # Host echo "Skipping Clang since OpenMP support probably missing" #echo "OPENMPFLAG=-fopenmp" >> common/make.defs - #make -C $PRK_TARGET_PATH openmp + #${MAKE} -C $PRK_TARGET_PATH openmp #$PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 #$PRK_TARGET_PATH/stencil-openmp 10 1000 #$PRK_TARGET_PATH/transpose-penmp 10 1024 32 @@ -203,7 +212,7 @@ case "$PRK_TARGET" in ic*) # Host echo "OPENMPFLAG=-qopenmp" >> common/make.defs - make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp + ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 $PRK_TARGET_PATH/p2p-innerloop-openmp 10 1024 1024 $PRK_TARGET_PATH/stencil-openmp 10 1000 @@ -217,7 +226,7 @@ case "$PRK_TARGET" in # Offload - not supported on MacOS if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then echo "OFFLOADFLAG=-qopenmp -qopenmp-offload=host" >> common/make.defs - make -C $PRK_TARGET_PATH target + ${MAKE} -C $PRK_TARGET_PATH target 
$PRK_TARGET_PATH/stencil-openmp-target 10 1000 $PRK_TARGET_PATH/transpose-openmp-target 10 1024 32 #echo "Test stencil code generator" @@ -236,7 +245,7 @@ case "$PRK_TARGET" in # C11 with Cilk if [ "${CC}" = "gcc" ] ; then echo "CILKFLAG=-fcilkplus" >> common/make.defs - make -C $PRK_TARGET_PATH stencil-cilk transpose-cilk + ${MAKE} -C $PRK_TARGET_PATH stencil-cilk transpose-cilk $PRK_TARGET_PATH/stencil-cilk 10 1000 $PRK_TARGET_PATH/transpose-cilk 10 1024 32 #echo "Test stencil code generator" @@ -248,12 +257,12 @@ case "$PRK_TARGET" in fi # Use MUSL for GCC+Linux only if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$CC" = "gcc" ] ; then - make -C $PRK_TARGET_PATH clean + ${MAKE} -C $PRK_TARGET_PATH clean ./travis/install-musl.sh ${TRAVIS_ROOT} ${PRK_CC} echo "PRKVERSION=\"'2.16'\"" > common/make.defs echo "CC=${TRAVIS_ROOT}/musl/bin/musl-gcc -static -std=c11 -DUSE_C11_THREADS" >> common/make.defs echo "EXTRA_CLIBS=-lm -lpthread" >> common/make.defs - make -C $PRK_TARGET_PATH transpose-thread + ${MAKE} -C $PRK_TARGET_PATH transpose-thread $PRK_TARGET_PATH/transpose-thread 10 1024 512 fi @@ -316,12 +325,12 @@ case "$PRK_TARGET" in echo "CXX=${PRK_CXX} -std=c++14 -pthread" >> common/make.defs # C++11 without external parallelism - make -C $PRK_TARGET_PATH transpose-valarray nstream-valarray + ${MAKE} -C $PRK_TARGET_PATH transpose-valarray nstream-valarray $PRK_TARGET_PATH/transpose-valarray 10 1024 32 $PRK_TARGET_PATH/nstream-valarray 10 16777216 32 # C++11 without external parallelism - make -C $PRK_TARGET_PATH p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector \ + ${MAKE} -C $PRK_TARGET_PATH p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector \ dgemm-vector sparse-vector $PRK_TARGET_PATH/p2p-vector 10 1024 1024 $PRK_TARGET_PATH/p2p-vector 10 1024 1024 100 100 @@ -343,13 +352,13 @@ case "$PRK_TARGET" in # C++11 with CBLAS if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then echo "CBLASFLAG=-DACCELERATE -framework Accelerate" 
>> common/make.defs - make -C $PRK_TARGET_PATH transpose-cblas dgemm-cblas + ${MAKE} -C $PRK_TARGET_PATH transpose-cblas dgemm-cblas $PRK_TARGET_PATH/transpose-cblas 10 1024 $PRK_TARGET_PATH/dgemm-cblas 10 400 fi # C++11 native parallelism - make -C $PRK_TARGET_PATH transpose-vector-thread transpose-vector-async + ${MAKE} -C $PRK_TARGET_PATH transpose-vector-thread transpose-vector-async $PRK_TARGET_PATH/transpose-vector-thread 10 1024 512 32 $PRK_TARGET_PATH/transpose-vector-async 10 1024 512 32 @@ -359,7 +368,7 @@ case "$PRK_TARGET" in gcc) # Host echo "OPENMPFLAG=-fopenmp" >> common/make.defs - make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \ + ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \ transpose-openmp nstream-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 @@ -375,7 +384,7 @@ case "$PRK_TARGET" in done # Offload echo "OFFLOADFLAG=-foffload=\"-O3 -v\"" >> common/make.defs - make -C $PRK_TARGET_PATH target + ${MAKE} -C $PRK_TARGET_PATH target $PRK_TARGET_PATH/stencil-openmp-target 10 1000 $PRK_TARGET_PATH/transpose-openmp-target 10 1024 32 #echo "Test stencil code generator" @@ -386,7 +395,7 @@ case "$PRK_TARGET" in done # ORNL-ACC echo "ORNLACCFLAG=-fopenacc" >> common/make.defs - make -C $PRK_TARGET_PATH p2p-hyperplane-vector-ornlacc + ${MAKE} -C $PRK_TARGET_PATH p2p-hyperplane-vector-ornlacc $PRK_TARGET_PATH/p2p-hyperplane-vector-ornlacc 10 1024 $PRK_TARGET_PATH/p2p-hyperplane-vector-ornlacc 10 1024 64 ;; @@ -394,7 +403,7 @@ case "$PRK_TARGET" in if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then # Host echo "OPENMPFLAG=-fopenmp" >> common/make.defs - make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \ + ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp \ transpose-openmp nstream-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 
$PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 @@ -410,7 +419,7 @@ case "$PRK_TARGET" in done # Offload #echo "OFFLOADFLAG=-foffload=\"-O3 -v\"" >> common/make.defs - #make -C $PRK_TARGET_PATH target + #${MAKE} -C $PRK_TARGET_PATH target #$PRK_TARGET_PATH/stencil-openmp-target 10 1000 #$PRK_TARGET_PATH/transpose-openmp-target 10 1024 32 ##echo "Test stencil code generator" @@ -426,7 +435,7 @@ case "$PRK_TARGET" in icc) # Host echo "OPENMPFLAG=-qopenmp" >> common/make.defs - make -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp \ + ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp \ transpose-openmp nstream-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 $PRK_TARGET_PATH/p2p-innerloop-openmp 10 1024 1024 @@ -442,7 +451,7 @@ case "$PRK_TARGET" in # Offload - not supported on MacOS if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then echo "OFFLOADFLAG=-qopenmp -qopenmp-offload=host" >> common/make.defs - make -C $PRK_TARGET_PATH target + ${MAKE} -C $PRK_TARGET_PATH target $PRK_TARGET_PATH/stencil-openmp-target 10 1000 $PRK_TARGET_PATH/transpose-openmp-target 10 1024 32 #echo "Test stencil code generator" @@ -458,14 +467,20 @@ case "$PRK_TARGET" in ;; esac - # Boost.Compute found after OpenCL, and only available in Travis with MacOS. - echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs - - #echo "RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG}" >> common/make.defs - echo "RANGEFLAG=-DUSE_RANGES_TS -I${TRAVIS_ROOT}/range-v3/include" >> common/make.defs + # Boost.Compute runs after OpenCL, and only available in Travis with MacOS. 
+ case "$os" in + FreeBSD) + echo "BOOSTFLAG=-DUSE_BOOST -I/usr/local/include" >> common/make.defs + echo "RANGEFLAG=-DUSE_BOOST_IRANGE -I/usr/local/include" >> common/make.defs + ;; + *) + echo "BOOSTFLAG=-DUSE_BOOST" >> common/make.defs + echo "RANGEFLAG=-DUSE_RANGES_TS -I${TRAVIS_ROOT}/range-v3/include" >> common/make.defs + ;; + esac # C++11 with rangefor and Boost.Ranges - make -C $PRK_TARGET_PATH rangefor + ${MAKE} -C $PRK_TARGET_PATH rangefor $PRK_TARGET_PATH/stencil-vector-rangefor 10 1000 $PRK_TARGET_PATH/transpose-vector-rangefor 10 1024 32 $PRK_TARGET_PATH/nstream-vector-rangefor 10 16777216 32 @@ -493,7 +508,7 @@ case "$PRK_TARGET" in export LD_LIBRARY_PATH=${TBBROOT}/lib:${LD_LIBRARY_PATH} ;; esac - make -C $PRK_TARGET_PATH p2p-innerloop-vector-tbb p2p-hyperplane-vector-tbb p2p-tasks-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb + ${MAKE} -C $PRK_TARGET_PATH p2p-innerloop-vector-tbb p2p-hyperplane-vector-tbb p2p-tasks-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb $PRK_TARGET_PATH/p2p-innerloop-vector-tbb 10 1024 $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb 10 1024 1 $PRK_TARGET_PATH/p2p-hyperplane-vector-tbb 10 1024 32 @@ -510,7 +525,7 @@ case "$PRK_TARGET" in fi # C++11 with STL - make -C $PRK_TARGET_PATH p2p-hyperplane-vector-stl stencil-vector-stl transpose-vector-stl nstream-vector-stl + ${MAKE} -C $PRK_TARGET_PATH p2p-hyperplane-vector-stl stencil-vector-stl transpose-vector-stl nstream-vector-stl $PRK_TARGET_PATH/p2p-hyperplane-vector-stl 10 1024 1 $PRK_TARGET_PATH/p2p-hyperplane-vector-stl 10 1024 32 $PRK_TARGET_PATH/stencil-vector-stl 10 1000 @@ -532,7 +547,7 @@ case "$PRK_TARGET" in else echo "PSTLFLAG=-DUSE_PSTL -fopenmp ${TBBFLAG} -DUSE_INTEL_PSTL -I${TRAVIS_ROOT}/pstl/include ${RANGEFLAG}" >> common/make.defs fi - make -C $PRK_TARGET_PATH p2p-hyperplane-vector-pstl stencil-vector-pstl transpose-vector-pstl nstream-vector-pstl + ${MAKE} -C $PRK_TARGET_PATH p2p-hyperplane-vector-pstl stencil-vector-pstl 
transpose-vector-pstl nstream-vector-pstl $PRK_TARGET_PATH/p2p-hyperplane-vector-pstl 10 1024 1 $PRK_TARGET_PATH/p2p-hyperplane-vector-pstl 10 1024 32 $PRK_TARGET_PATH/stencil-vector-pstl 10 1000 @@ -549,7 +564,7 @@ case "$PRK_TARGET" in # C++11 with OpenCL if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then echo "OPENCLFLAG=-framework OpenCL" >> common/make.defs - make -C $PRK_TARGET_PATH opencl + ${MAKE} -C $PRK_TARGET_PATH opencl # must run programs in same directory as OpenCL source files... cd $PRK_TARGET_PATH ./stencil-opencl 10 1000 @@ -572,7 +587,7 @@ case "$PRK_TARGET" in # (2) Boost.Compute is not available from APT. # If we ever address 1, we need to enable the Boost.Compute install for Linux. if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then - make -C $PRK_TARGET_PATH nstream-vector-boost-compute + ${MAKE} -C $PRK_TARGET_PATH nstream-vector-boost-compute $PRK_TARGET_PATH/nstream-vector-boost-compute 10 16777216 32 fi @@ -592,7 +607,7 @@ case "$PRK_TARGET" in esac # RAJA if [ 0 = 1 ] ; then - make -C $PRK_TARGET_PATH p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \ + ${MAKE} -C $PRK_TARGET_PATH p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \ p2p-raja stencil-raja transpose-raja nstream-raja # New (Views) $PRK_TARGET_PATH/p2p-raja 10 1024 1024 @@ -623,7 +638,7 @@ case "$PRK_TARGET" in done fi # Kokkos - make -C $PRK_TARGET_PATH stencil-kokkos transpose-kokkos nstream-kokkos + ${MAKE} -C $PRK_TARGET_PATH stencil-kokkos transpose-kokkos nstream-kokkos $PRK_TARGET_PATH/stencil-kokkos 10 1000 $PRK_TARGET_PATH/transpose-kokkos 10 1024 32 $PRK_TARGET_PATH/nstream-kokkos 10 16777216 32 @@ -638,7 +653,7 @@ case "$PRK_TARGET" in #if [ "${TRAVIS_OS_NAME}" = "linux" ] ; then # echo "OCCADIR=${TRAVIS_ROOT}/occa" >> common/make.defs # export OCCA_CXX=${PRK_CXX} - # make -C $PRK_TARGET_PATH transpose-occa nstream-occa + # ${MAKE} -C $PRK_TARGET_PATH transpose-occa nstream-occa # $PRK_TARGET_PATH/transpose-occa 10 1024 32 # 
$PRK_TARGET_PATH/nstream-occa 10 16777216 32 #fi @@ -654,7 +669,7 @@ case "$PRK_TARGET" in echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++17" >> common/make.defs fi echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs - make -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl + ${MAKE} -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl #$PRK_TARGET_PATH/p2p-hyperplane-sycl 10 50 1 # 100 takes too long :-o $PRK_TARGET_PATH/stencil-sycl 10 1000 $PRK_TARGET_PATH/transpose-sycl 10 1024 32 @@ -698,9 +713,16 @@ case "$PRK_TARGET" in echo "COARRAYFLAG=-fcoarray=single" >> common/make.defs ;; clang) - echo "LLVM Fortran is not supported." - exit 9 - echo "FC=flang" >> common/make.defs + case "$os" in + FreeBSD) + echo "FC=flang -Mpreprocess -Mfreeform -I/usr/local/flang/include -lexecinfo" >> common/make.defs + ;; + *) + # untested + echo "FC=flang -Mpreprocess -Mfreeform" >> common/make.defs + ;; + esac + echo "OPENMPFLAG=-fopenmp" >> common/make.defs ;; icc) # -heap-arrays prevents SEGV in transpose-pretty (?) @@ -713,7 +735,7 @@ case "$PRK_TARGET" in esac # Serial - make -C ${PRK_TARGET_PATH} p2p p2p-innerloop stencil transpose nstream dgemm + ${MAKE} -C ${PRK_TARGET_PATH} p2p p2p-innerloop stencil transpose nstream dgemm $PRK_TARGET_PATH/p2p 10 1024 1024 $PRK_TARGET_PATH/p2p-innerloop 10 1024 $PRK_TARGET_PATH/stencil 10 1000 @@ -724,7 +746,7 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/dgemm 10 400 32 # Pretty - make -C ${PRK_TARGET_PATH} stencil-pretty transpose-pretty nstream-pretty dgemm-pretty + ${MAKE} -C ${PRK_TARGET_PATH} stencil-pretty transpose-pretty nstream-pretty dgemm-pretty #$PRK_TARGET_PATH/p2p-pretty 10 1024 1024 # pretty versions do not support tiling... 
$PRK_TARGET_PATH/stencil-pretty 10 1000 @@ -733,7 +755,7 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/dgemm-pretty 10 400 # OpenMP host - make -C ${PRK_TARGET_PATH} p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp \ + ${MAKE} -C ${PRK_TARGET_PATH} p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp \ nstream-openmp dgemm-openmp export OMP_NUM_THREADS=2 $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 @@ -749,7 +771,7 @@ case "$PRK_TARGET" in # Intel Mac does not support OpenMP target or coarrays if [ "${CC}" = "gcc" ] || [ "${TRAVIS_OS_NAME}" = "linux" ] ; then # OpenMP target - make -C ${PRK_TARGET_PATH} stencil-openmp-target transpose-openmp-target nstream-openmp-target + ${MAKE} -C ${PRK_TARGET_PATH} stencil-openmp-target transpose-openmp-target nstream-openmp-target export OMP_NUM_THREADS=2 #$PRK_TARGET_PATH/p2p-openmp-target 10 1024 1024 # most compilers do not support doacross yet $PRK_TARGET_PATH/stencil-openmp-target 10 1000 @@ -758,7 +780,7 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/nstream-openmp-target 10 16777216 # Fortran coarrays - make -C ${PRK_TARGET_PATH} coarray + ${MAKE} -C ${PRK_TARGET_PATH} coarray export PRK_MPI_PROCS=4 if [ "${CC}" = "gcc" ] ; then if [ "${TRAVIS_OS_NAME}" = "osx" ] ; then @@ -803,7 +825,7 @@ case "$PRK_TARGET" in echo "CC=$CC -std=c99" >> common/make.defs echo "OPENMPFLAG=-fopenmp" >> common/make.defs fi - make $PRK_TARGET + ${MAKE} $PRK_TARGET export PRK_TARGET_PATH=OPENMP export OMP_NUM_THREADS=4 $PRK_TARGET_PATH/Synch_p2p/p2p $OMP_NUM_THREADS 10 1024 1024 @@ -870,7 +892,7 @@ case "$PRK_TARGET" in echo "OPENMPFLAG=-fopenmp" >> common/make.defs echo "MPI-1" - make allmpi1 + ${MAKE} allmpi1 export PRK_TARGET_PATH=MPI1 export PRK_MPI_PROCS=4 export PRK_RUN="$PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-}" @@ -894,7 +916,7 @@ case "$PRK_TARGET" in # MPI+OpenMP is just too much of a pain with Clang right now. 
if [ "${CC}" = "gcc" ] ; then echo "MPI+OpenMP" - make allmpiomp + ${MAKE} allmpiomp export PRK_TARGET_PATH=MPIOPENMP export PRK_MPI_PROCS=2 export OMP_NUM_THREADS=2 @@ -906,7 +928,7 @@ case "$PRK_TARGET" in fi echo "MPI-RMA" - make allmpirma + ${MAKE} allmpirma export PRK_TARGET_PATH=MPIRMA export PRK_MPI_PROCS=4 export PRK_RUN="$PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-}" @@ -915,7 +937,7 @@ case "$PRK_TARGET" in $PRK_RUN $PRK_TARGET_PATH/Transpose/transpose 10 1024 32 echo "MPI+MPI" - make allmpishm + ${MAKE} allmpishm export PRK_TARGET_PATH=MPISHM export PRK_MPI_PROCS=4 export PRK_RUN="$PRK_LAUNCHER -n $PRK_MPI_PROCS ${PRK_OVERSUBSCRIBE:-}" @@ -930,7 +952,7 @@ case "$PRK_TARGET" in export LD_LIBRARY_PATH=${TRAVIS_ROOT}/sandia-openshmem/lib:${TRAVIS_ROOT}/libfabric/lib:$LD_LIBRARY_PATH export SHMEM_ROOT=${TRAVIS_ROOT}/sandia-openshmem echo "SHMEMTOP=$SHMEM_ROOT\nSHMEMCC=$SHMEM_ROOT/bin/oshcc" >> common/make.defs - make $PRK_TARGET + ${MAKE} $PRK_TARGET export PRK_TARGET_PATH=SHMEM export PRK_SHMEM_PROCS=4 export OSHRUN_LAUNCHER=${TRAVIS_ROOT}/hydra/bin/mpirun @@ -960,7 +982,7 @@ case "$PRK_TARGET" in echo "UPCC=$UPC_ROOT/bin/upc" >> common/make.defs export PRK_LAUNCHER="" export PRK_LAUNCHER_ARGS="-n $PRK_UPC_PROCS" - make $PRK_TARGET + ${MAKE} $PRK_TARGET ;; bupc) export UPC_ROOT=${TRAVIS_ROOT}/bupc-$CC @@ -987,7 +1009,7 @@ case "$PRK_TARGET" in export PRK_LAUNCHER="$UPC_ROOT/bin/upcrun -N 1 -n $PRK_UPC_PROCS -c $PRK_UPC_PROCS" ;; esac - make $PRK_TARGET PRK_FLAGS="-Wc,-O3" + ${MAKE} $PRK_TARGET PRK_FLAGS="-Wc,-O3" ;; *) echo "Invalid value of UPC_IMPL ($UPC_IMPL)" @@ -1013,7 +1035,7 @@ case "$PRK_TARGET" in ;; esac echo "CHARMTOP=$CHARM_ROOT" >> common/make.defs - make $PRK_TARGET PRK_FLAGS=-O3 + ${MAKE} $PRK_TARGET PRK_FLAGS=-O3 export PRK_TARGET_PATH=CHARM++ export PRK_CHARM_PROCS=4 export PRK_LAUNCHER=$CHARM_ROOT/bin/charmrun @@ -1041,7 +1063,7 @@ case "$PRK_TARGET" in ;; esac echo "CHARMTOP=$CHARM_ROOT" >> common/make.defs - make $PRK_TARGET 
PRK_FLAGS="-O3 -std=gnu99" + ${MAKE} $PRK_TARGET PRK_FLAGS="-O3 -std=gnu99" export PRK_TARGET_PATH=AMPI export PRK_CHARM_PROCS=4 export PRK_LAUNCHER=$CHARM_ROOT/bin/charmrun @@ -1072,7 +1094,7 @@ case "$PRK_TARGET" in echo "Fine-Grain MPI (FG-MPI)" export FGMPI_ROOT=${TRAVIS_ROOT}/fgmpi echo "FGMPITOP=$FGMPI_ROOT\nFGMPICC=$FGMPI_ROOT/bin/mpicc -std=c99" >> common/make.defs - make $PRK_TARGET + ${MAKE} $PRK_TARGET export PRK_TARGET_PATH=FG_MPI export PRK_MPI_PROCS=2 export PRK_FGMPI_THREADS=2 @@ -1099,7 +1121,7 @@ case "$PRK_TARGET" in export SCRIPT_PATH=${TRAVIS_ROOT}/grappa/bin ######################## echo "GRAPPATOP=${TRAVIS_ROOT}/grappa" >> common/make.defs - make $PRK_TARGET + ${MAKE} $PRK_TARGET export PRK_TARGET_PATH=GRAPPA export PRK_MPI_PROCS=2 export PRK_LAUNCHER=$MPI_ROOT/bin/mpirun @@ -1122,6 +1144,6 @@ case "$PRK_TARGET" in alllegion) echo "Legion" echo "LEGIONTOP=${TRAVIS_ROOT}/legion" > common/make.defs - make $PRK_TARGET -k + ${MAKE} $PRK_TARGET -k ;; esac From 283eca102910ce85794f843c38db21ee14487d00 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 20 Feb 2019 22:27:01 -0800 Subject: [PATCH 120/245] add PRK nstream for C11 (#369) * add PRK nstream for C11 * add MPI version * add MEMKIND nstream * add MEMKIND to example make.defs * add MMAP nstream --- .gitignore | 26 ++++- C1z/Makefile | 37 +++++- C1z/nstream-memkind.c | 228 +++++++++++++++++++++++++++++++++++++ C1z/nstream-mmap.c | 248 +++++++++++++++++++++++++++++++++++++++++ C1z/nstream-mpi.c | 216 +++++++++++++++++++++++++++++++++++ C1z/nstream.c | 183 ++++++++++++++++++++++++++++++ C1z/prk_util.h | 62 +++++++++-- common/make.defs.gcc | 2 + common/make.defs.intel | 2 + common/make.defs.llvm | 3 + 10 files changed, 991 insertions(+), 16 deletions(-) create mode 100644 C1z/nstream-memkind.c create mode 100644 C1z/nstream-mmap.c create mode 100644 C1z/nstream-mpi.c create mode 100644 C1z/nstream.c diff --git a/.gitignore b/.gitignore index 7ed1c8b8c..66948e148 100644 --- a/.gitignore +++ 
b/.gitignore @@ -29,6 +29,8 @@ octave-workspace # Octave crashes */*/*.optrpt *__genmod.* # Intel Fortran compiler */*__genmod.mod +*.patch +*/*.patch common/make.defs scripts/small/runfgmpi @@ -95,6 +97,13 @@ SERIAL/Sparse/sparse SERIAL/Stencil/stencil SERIAL/Synch_p2p/p2p SERIAL/Transpose/transpose +C1z/nstream +C1z/nstream-openmp +C1z/nstream-mpi +C1z/nstream-memkind +C1z/nstream-memkind-openmp +C1z/nstream-mmap +C1z/nstream-mmap-openmp C1z/p2p C1z/p2p-innerloop C1z/p2p-innerloop-openmp @@ -131,9 +140,11 @@ Cxx11/p2p-vector-raja Cxx11/p2p-vector-tbb Cxx11/p2p-innerloop-openmp Cxx11/p2p-doacross-vector-openmp +Cxx11/p2p-doacross-openmp Cxx11/p2p-innerloop-opencl Cxx11/p2p-innerloop-vector Cxx11/p2p-hyperplane-vector +Cxx11/p2p-hyperplane-openmp Cxx11/p2p-hyperplane-vector-openmp Cxx11/p2p-innerloop-vector-tbb Cxx11/p2p-hyperplane-vector-stl @@ -167,6 +178,7 @@ Cxx11/stencil-vector-cilk Cxx11/stencil-vector-stl Cxx11/stencil-vector-pstl Cxx11/stencil-vector-raja +Cxx11/stencil-openmp Cxx11/stencil-raja Cxx11/stencil-vector-rangefor Cxx11/stencil-vector-tbb @@ -174,6 +186,7 @@ Cxx11/stencil-vector-taskloop Cxx11/stencil-kokkos Cxx11/stencil-cuda Cxx11/stencil-sycl +Cxx11/transpose-openmp Cxx11/transpose-opencl Cxx11/transpose-sycl Cxx11/transpose-openmp-target @@ -215,6 +228,14 @@ Cxx11/star6.cl Cxx11/star7.cl Cxx11/star8.cl Cxx11/star9.cl +Cxx11/star10.cl +Cxx11/hipSYCL +Cxx11/cpp-proposals-pub +Cxx11/ornl-mdspan +Cxx11/boost.tgz +Cxx11/boost.tbz +Cxx11/OpenCL-CLHPP +Cxx11/GSL FORTRAN/dgemm-taskloop-openmp FORTRAN/dgemm-pretty FORTRAN/dgemm-openmp @@ -256,8 +277,3 @@ FORTRAN/transpose-ornlacc RUST/p2p/Cargo.lock RUST/stencil/Cargo.lock RUST/transpose/Cargo.lock -nstream-openmp -p2p-doacross-openmp -p2p-hyperplane-openmp -stencil-openmp -transpose-openmp diff --git a/C1z/Makefile b/C1z/Makefile index 0df8225c1..aac123acc 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -10,6 +10,10 @@ ifdef VERBOSE CFLAGS += -DVERBOSE endif +ifdef PRK_USE_MMAP + CFLAGS += 
-DPRK_USE_MMAP +endif + ifeq ($(findstring musl,$(CC)),musl) CFLAGS += -DUSE_C11_THREADS endif @@ -41,11 +45,17 @@ endif all: serial thread openmp taskloop $(EXTRA) -serial: p2p p2p-innerloop stencil transpose +serial: nstream p2p p2p-innerloop stencil transpose thread: transpose-thread -openmp: p2p-simd-openmp p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp +openmp: nstream-openmp p2p-simd-openmp p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp + +mpi: nstream-mpi + +memkind: nstream-memkind nstream-memkind-openmp + +mmap: nstream-mmap nstream-mmap-openmp target: stencil-target transpose-target @@ -58,12 +68,30 @@ ispc: transpose-ispc p2p-innerloop: p2p-innerloop-openmp.c prk_util.h $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ +%-mpi: %-mpi.c prk_util.h + $(MPICC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ + +%-memkind: %-memkind.c prk_util.h + $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) $(MEMKINDFLAGS) -o $@ + +%-memkind-openmp: %-memkind.c prk_util.h + $(CC) $(CFLAGS) $(OMPFLAGS) $< $(EXTRA_CLIBS) $(MEMKINDFLAGS) -o $@ + +%-mmap: %-mmap.c prk_util.h + $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ + +%-mmap-openmp: %-mmap.c prk_util.h + $(CC) $(CFLAGS) $(OMPFLAGS) $< $(EXTRA_CLIBS) -o $@ + %-target: %-target.c prk_util.h $(CC) $(CFLAGS) $< $(OMPFLAGS) $(TARGETFLAGS) $(EXTRA_CLIBS) -o $@ %-taskloop: %-taskloop.c prk_util.h $(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ +nstream-openmp: nstream.c prk_util.h + $(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ + %-openmp: %-openmp.c prk_util.h $(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ @@ -92,13 +120,16 @@ clean: -rm -f *.optrpt -rm -f *.dwarf -rm -rf *.dSYM # Mac - -rm -f p2p p2p-innerloop stencil transpose + -rm -f nstream p2p p2p-innerloop stencil transpose -rm -f *-openmp + -rm -f *-mpi -rm -f *-target -rm -f *-taskloop -rm -f *-cilk -rm -f *-thread -rm -f *-ispc + -rm -f nstream-mmap nstream-memkind + -rm -f nstream-mmap-openmp nstream-memkind-openmp cleancl: -rm -f 
star[123456789].cl diff --git a/C1z/nstream-memkind.c b/C1z/nstream-memkind.c new file mode 100644 index 000000000..465f7a067 --- /dev/null +++ b/C1z/nstream-memkind.c @@ -0,0 +1,228 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <progname> <# iterations> <vector length> <offset> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of bytes read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// Converted to C11 by Jeff Hammond, February 2019.
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#include +#ifndef MEMKIND_PMEM_MIN_SIZE +# define MEMKIND_PMEM_MIN_SIZE (1024 * 1024 * 16) +#endif + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %.2f\n", PRKVERSION ); +#ifdef _OPENMP + printf("C11/OpenMP STREAM triad: A = B + scalar * C\n"); +#else + printf("C11 STREAM triad: A = B + scalar * C\n"); +#endif + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> \n"); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // length of a the matrix + size_t length = atol(argv[2]); + if (length <= 0) { + printf("ERROR: Matrix length must be greater than 0\n"); + return 1; + } + +#ifdef _OPENMP + printf("Number of threads = %d\n", omp_get_max_threads()); +#endif + printf("Number of iterations = %d\n", iterations); + printf("Vector length = %zu\n", length); + //printf("Offset = %d\n", offset); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time = 0.0; + + size_t bytes = length*sizeof(double); + + char * pool_path = getenv("PRK_MEMKIND_POOL_PATH"); + if (pool_path == NULL) { + pool_path = "/pmem"; + } + printf("MEMKIND pool path = %s\n", pool_path); + struct memkind * memkind_handle; + int err = memkind_create_pmem(pool_path, 0, &memkind_handle); + if (err) { + printf("MEMKIND failed to create a memory pool! 
(err=%d, errno=%d)\n", err, errno); + } + + size_t usable_size = 0; + + double * restrict A = memkind_malloc(memkind_handle, bytes); + if (A==NULL) { + printf("MEMKIND failed to allocate A! (errno=%d)\n", errno); + } + usable_size = memkind_malloc_usable_size(memkind_handle, A); + printf("A usage size = %zu\n", usable_size); + + double * restrict B = memkind_malloc(memkind_handle, bytes); + if (B==NULL) { + printf("MEMKIND failed to allocate B! (errno=%d)\n", errno); + } + usable_size = memkind_malloc_usable_size(memkind_handle, B); + printf("B usage size = %zu\n", usable_size); + + double * restrict C = memkind_malloc(memkind_handle, bytes); + if (C==NULL) { + printf("MEMKIND failed to allocate C! (errno=%d)\n", errno); + } + usable_size = memkind_malloc_usable_size(memkind_handle, C); + printf("C usage size = %zu\n", usable_size); + + double scalar = 3.0; + + OMP_PARALLEL() + { + OMP_FOR_SIMD() + for (size_t i=0; i epsilon) { + printf("Failed Validation on output array\n" + " Expected checksum: %lf\n" + " Observed checksum: %lf\n" + "ERROR: solution did not validate\n", ar, asum); + return 1; + } else { + printf("Solution validates\n"); + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime); + } + + memkind_free(memkind_handle, A); + memkind_free(memkind_handle, B); + memkind_free(memkind_handle, C); + + err = memkind_destroy_kind(memkind_handle); + if (err) { + printf("MEMKIND failed to create destroy a memory pool! 
(err=%d, errno=%d)\n", err, errno); + } + + return 0; +} + + diff --git a/C1z/nstream-mmap.c b/C1z/nstream-mmap.c new file mode 100644 index 000000000..fb0942c47 --- /dev/null +++ b/C1z/nstream-mmap.c @@ -0,0 +1,248 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <progname> <# iterations> <vector length> <offset> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// Converted to C11 by Jeff Hammond, February 2019.
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#include +#include +#include +#include +#include +#include + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %.2f\n", PRKVERSION ); +#ifdef _OPENMP + printf("C11/OpenMP STREAM triad: A = B + scalar * C\n"); +#else + printf("C11 STREAM triad: A = B + scalar * C\n"); +#endif + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> \n"); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // length of a the matrix + size_t length = atol(argv[2]); + if (length <= 0) { + printf("ERROR: Matrix length must be greater than 0\n"); + return 1; + } + +#ifdef _OPENMP + printf("Number of threads = %d\n", omp_get_max_threads()); +#endif + printf("Number of iterations = %d\n", iterations); + printf("Vector length = %zu\n", length); + //printf("Offset = %d\n", offset); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time = 0.0; + + size_t bytes = length*sizeof(double); + + char mmap_path[255] = {0}; + char * mmap_env = getenv("PRK_MMAP_PATH"); + fprintf(stderr, "PRK_MMAP_PATH=%s\n", mmap_env); + if (mmap_env==NULL) { + strcpy(mmap_path, "/tmp/prk_mmap"); + } else { + strcpy(mmap_path, mmap_env); + } + + fprintf(stderr, "mmap_path=%s\n", mmap_path); + int fd = open(mmap_path, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + if (fd == -1) { + fprintf(stderr, "open returned %d\n", fd); + char error_name[255] = {0}; + prk_lookup_posix_error(errno, error_name, 255); + printf("error name: 
%s\n", error_name); + abort(); + } + + int rc = ftruncate(fd, 3*bytes); + if (rc == -1) { + fprintf(stderr, "ftruncate returned %d\n", rc); + char error_name[255] = {0}; + prk_lookup_posix_error(errno, error_name, 255); + printf("error name: %s\n", error_name); + abort(); + } + + int flags = 0; + //flags |= MAP_PRIVATE; + flags |= MAP_SHARED; + //flags |= MAP_NORESERVE; + flags |= MAP_POPULATE; + //flags |= MAP_UNINITIALIZED; + //flags |= MAP_HUGETLB; + //flags |= MAP_HUGE_2MB; + //flags |= MAP_SYNC; + + double * ptr = (double*)mmap(NULL, 3*bytes, PROT_READ | PROT_WRITE, flags, fd, 0); + //double * ptr = (double*)mmap(NULL, 3*bytes, PROT_READ | PROT_WRITE, flags | MAP_ANONYMOUS, -1, 0); + if (ptr==MAP_FAILED || ptr==NULL) { + fprintf(stderr, "mmap returned %p, errno=%d\n", ptr, errno); + char error_name[255] = {0}; + prk_lookup_posix_error(errno, error_name, 255); + printf("error name: %s\n", error_name); + abort(); + } + + double * restrict A = &ptr[0]; + double * restrict B = &ptr[length]; + double * restrict C = &ptr[length*2]; + + double scalar = 3.0; + + OMP_PARALLEL() + { + OMP_FOR_SIMD() + for (size_t i=0; i epsilon) { + printf("Failed Validation on output array\n" + " Expected checksum: %lf\n" + " Observed checksum: %lf\n" + "ERROR: solution did not validate\n", ar, asum); + return 1; + } else { + printf("Solution validates\n"); + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime); + } + + int err = munmap(ptr, 3*bytes); + if (err) { + printf("munmap failed! (err=%d, errno=%d)\n", err, errno); + } + err = close(fd); + if (err) { + printf("close failed! 
(err=%d, errno=%d)\n", err, errno); + } + + return 0; +} + + diff --git a/C1z/nstream-mpi.c b/C1z/nstream-mpi.c new file mode 100644 index 000000000..438842859 --- /dev/null +++ b/C1z/nstream-mpi.c @@ -0,0 +1,216 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <progname> <# iterations> <vector length> <offset> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// Converted to C11 by Jeff Hammond, February 2019.
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +#include + +int main(int argc, char * argv[]) +{ + int me, np; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &me); + MPI_Comm_size(MPI_COMM_WORLD, &np); + + if (me==0) { + printf("Parallel Research Kernels version %.2f\n", PRKVERSION ); + printf("C11/MPI STREAM triad: A = B + scalar * C\n"); + } + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + if (me==0) printf("Usage: <# iterations> \n"); + MPI_Finalize(); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + if (me==0) printf("ERROR: iterations must be >= 1\n"); + MPI_Finalize(); + return 1; + } + + // length of a the matrix + size_t length = atol(argv[2]); + if (length <= 0) { + if (me==0) printf("ERROR: Matrix length must be greater than 0\n"); + MPI_Finalize(); + return 1; + } + + if (me==0) { + printf("Number of processes = %d\n", np); + printf("Number of iterations = %d\n", iterations); + printf("Vector length = %zu\n", length); + //printf("Offset = %d\n", offset); + } + + size_t local_length; + if (length % np == 0) { + local_length = length / np; + } else { + double x = (double)length / np; + size_t y = (size_t)ceil(x); + if (me != (np-1)) { + local_length = y; + } else { + local_length = length - y*(np-1); + } + } + //printf("Vector length (%4d) = %zu\n", me, local_length); + fflush(stdout); + MPI_Barrier(MPI_COMM_WORLD); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time = 0.0; + + double * restrict A; + double * restrict B; + double * restrict C; + + MPI_Win wA, wB, wC; + + size_t bytes = 
local_length*sizeof(double); + + MPI_Win_allocate_shared(bytes, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, (void**)&A, &wA); + MPI_Win_allocate_shared(bytes, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, (void**)&B, &wB); + MPI_Win_allocate_shared(bytes, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, (void**)&C, &wC); + + double scalar = 3.0; + + for (size_t i=0; i epsilon) { + printf("Failed Validation on output array\n" + " Expected checksum: %lf\n" + " Observed checksum: %lf\n" + "ERROR: solution did not validate\n", ar, asum); + return 1; + } else { + if (me==0) printf("Solution validates\n"); + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + if (me==0) printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime); + } + + MPI_Win_free(&wA); + MPI_Win_free(&wB); + MPI_Win_free(&wC); + + MPI_Finalize(); + + return 0; +} + + diff --git a/C1z/nstream.c b/C1z/nstream.c new file mode 100644 index 000000000..7661662dc --- /dev/null +++ b/C1z/nstream.c @@ -0,0 +1,183 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <progname> <# iterations> <vector length> <offset> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules.
Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// Converted to C11 by Jeff Hammond, February 2019. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %.2f\n", PRKVERSION ); +#ifdef _OPENMP + printf("C11/OpenMP STREAM triad: A = B + scalar * C\n"); +#else + printf("C11 STREAM triad: A = B + scalar * C\n"); +#endif + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> \n"); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // length of a the matrix + size_t length = atol(argv[2]); + if (length <= 0) { + printf("ERROR: Matrix length must be greater than 0\n"); + return 1; + } + +#ifdef _OPENMP + printf("Number of threads = %d\n", omp_get_max_threads()); +#endif + printf("Number of iterations = %d\n", iterations); + printf("Vector length = %zu\n", length); + //printf("Offset = %d\n", offset); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time = 0.0; + + size_t bytes = length*sizeof(double); + double * restrict A = prk_malloc(bytes); + double * restrict B = prk_malloc(bytes); + double * restrict C = prk_malloc(bytes); + + double scalar = 3.0; + + OMP_PARALLEL() + { + OMP_FOR_SIMD() + for (size_t i=0; i epsilon) { + printf("Failed Validation on output array\n" + " Expected checksum: %lf\n" + " Observed checksum: %lf\n" + "ERROR: solution did not 
validate\n", ar, asum); + return 1; + } else { + printf("Solution validates\n"); + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime); + } + + return 0; +} + + diff --git a/C1z/prk_util.h b/C1z/prk_util.h index 5d0831d34..24e428552 100644 --- a/C1z/prk_util.h +++ b/C1z/prk_util.h @@ -38,13 +38,11 @@ #define PRAGMA(x) _Pragma(#x) -// All of this is to get posix_memalign defined... -// #define _POSIX_C_SOURCE (200112L) -#define _POSIX_C_SOURCE (200809L) -#define _XOPEN_SOURCE 600 - #include // atoi #include // getenv + +int posix_memalign(void **memptr, size_t alignment, size_t size); + #include #if defined(__PGIC__) typedef _Bool bool; @@ -80,7 +78,7 @@ const bool false=0; # define OMP_FOR_REDUCE(x) PRAGMA(omp for reduction (x) ) # if (_OPENMP >= 201300) # define OMP_SIMD PRAGMA(omp simd) -# define OMP_FOR_SIMD() PRAGMA(omp for simd x) +# define OMP_FOR_SIMD(x) PRAGMA(omp for simd x) # define OMP_TASK(x) PRAGMA(omp task x) # define OMP_TASKLOOP(x) PRAGMA(omp taskloop x ) # define OMP_TASKWAIT PRAGMA(omp taskwait) @@ -89,7 +87,7 @@ const bool false=0; # else # warning No OpenMP 4+ features! 
# define OMP_SIMD -# define OMP_FOR_SIMD() PRAGMA(omp for x) +# define OMP_FOR_SIMD(x) PRAGMA(omp for x) # define OMP_TASK(x) # define OMP_TASKLOOP(x) # define OMP_TASKWAIT @@ -105,7 +103,7 @@ const bool false=0; # define OMP_FOR(x) # define OMP_FOR_REDUCE(x) # define OMP_SIMD -# define OMP_FOR_SIMD() +# define OMP_FOR_SIMD(x) # define OMP_TASK(x) # define OMP_TASKLOOP(x) # define OMP_TASKWAIT @@ -298,4 +296,52 @@ static inline void prk_free(void * p) #endif } +static inline void prk_lookup_posix_error(int e, char * n, int l) +{ + switch (e) { + case EACCES: + strncpy(n,"EACCES",l); + break; + case EAGAIN: + strncpy(n,"EAGAIN",l); + break; + case EBADF: + strncpy(n,"EBADF",l); + break; + case EEXIST: + strncpy(n,"EEXIST",l); + break; + case EINVAL: + strncpy(n,"EINVAL",l); + break; + case ENFILE: + strncpy(n,"ENFILE",l); + break; + case ENODEV: + strncpy(n,"ENODEV",l); + break; + case ENOMEM: + strncpy(n,"ENOMEM",l); + break; + case EPERM: + strncpy(n,"EPERM",l); + break; + case ETXTBSY: + strncpy(n,"ETXTBSY",l); + break; + case EOPNOTSUPP: + strncpy(n,"EOPNOTSUPP",l); + break; + /* + case E: + strncpy(n,"E",l); + break; + */ + default: + printf("error code %d unknown\n", e); + strncpy(n,"UNKNOWN",l); + break; + } +} + #endif /* PRK_UTIL_H */ diff --git a/common/make.defs.gcc b/common/make.defs.gcc index e1d68dd5a..0c30fea7f 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -115,3 +115,5 @@ COARRAYFLAG=-fcoarray=single -lcaf_single # multi-node # COARRAYFLAG=-fcoarray=lib -lcaf_mpi +MEMKINDDIR=/home/parallels/PRK/deps +MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib diff --git a/common/make.defs.intel b/common/make.defs.intel index bba53d1bb..17a4c2833 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -116,3 +116,5 @@ COARRAYFLAG=-coarray # multi-node # COARRAYFLAG=-coarray=distributed +MEMKINDDIR=/home/parallels/PRK/deps +MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib 
-lmemkind -Wl,-rpath=${MEMKINDDIR}/lib diff --git a/common/make.defs.llvm b/common/make.defs.llvm index a5c9010d4..c8aa874ea 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -129,3 +129,6 @@ ISPCFLAG=-O3 --target=host --opt=fast-math # # We assume you have installed an implementation of MPI-3 that is in your path. MPICC=mpicc + +MEMKINDDIR=/home/parallels/PRK/deps +MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib From 75bc803dacb1edb3330baaf4826be83fed2411cc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 2 Mar 2019 21:09:14 -0800 Subject: [PATCH 121/245] Sycl multi device and exceptions (#347) * triSYCL needs C++17 * fix Julia syntax issue "1./" is a syntax error now. change to "1.0/" * do to SYCL what we have for OpenCL * fix name mangling issue - thanks Rod@CodePlay! * run 32b for all devices unconditionally * label result with precision * hard-code SYCL to CPU execution only due to GPU issues the bandwidth reported is consistent for elements, not bytes, which means that something is wrong. 64b data should not lead to BW that is 2x 32b data... * add host, catch std exception * c++1z instead of c++17 * fix use of ranges in SYCL * correct sycl ranges fix * better example flags --- Cxx11/Makefile | 2 +- Cxx11/nstream-opencl.cc | 12 ++- Cxx11/nstream-sycl.cc | 178 +++++++++++++++++++++++++++------------- common/make.defs.gcc | 22 ++++- common/make.defs.llvm | 18 ++-- 5 files changed, 154 insertions(+), 78 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 54873e41d..65875fba4 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -36,7 +36,6 @@ TARGETFLAGS = $(OFFLOADFLAG) OPENCLFLAGS = $(OPENCLFLAG) # We do not yet handle all possible exceptions... 
#OPENCLFLAGS += -D__CL_ENABLE_EXCEPTIONS -SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 ORNLACCFLAGS = $(ORNLACCFLAG) TBBFLAGS = $(TBBFLAG) -DUSE_TBB -DPRK_TBB_PARTITIONER=$(PRK_TBB_PARTITIONER) CBLASFLAGS = $(CBLASFLAG) $(OPENMPFLAG) @@ -47,6 +46,7 @@ PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS +SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 $(RANGEFLAGS) ORNLACCFLAGS = $(ORNLACCFLAG) ifdef OCCADIR diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc index 18a5a022c..b0241dd5d 100644 --- a/Cxx11/nstream-opencl.cc +++ b/Cxx11/nstream-opencl.cc @@ -141,7 +141,8 @@ void run(cl::Context context, int iterations, size_t length) std::cout << "Solution validates" << std::endl; double avgtime = nstream_time/iterations; double nbytes = 4.0 * length * sizeof(T); - std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + std::cout << precision << "B " + << "Rate (MB/s): " << 1.e-6*nbytes/avgtime << " Avg time (s): " << avgtime << std::endl; } } @@ -200,9 +201,8 @@ int main(int argc, char* argv[]) if (precision==64) { run(cpu, iterations, length); - } else { - run(cpu, iterations, length); } + run(cpu, iterations, length); } cl::Context gpu(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err); @@ -214,9 +214,8 @@ int main(int argc, char* argv[]) if (precision==64) { run(gpu, iterations, length); - } else { - run(gpu, iterations, length); } + run(gpu, iterations, length); } cl::Context acc(CL_DEVICE_TYPE_ACCELERATOR, NULL, NULL, NULL, &err); @@ -229,9 +228,8 @@ int main(int argc, char* argv[]) if (precision==64) { run(acc, iterations, length); - } else { - run(acc, iterations, length); } + run(acc, iterations, length); } return 0; diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 2193d4811..b4c056990 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -66,66 
+66,32 @@ #include "prk_util.h" -int main(int argc, char * argv[]) -{ - std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl; - - ////////////////////////////////////////////////////////////////////// - /// Read and test input parameters - ////////////////////////////////////////////////////////////////////// - - int iterations, offset; - size_t length; - try { - if (argc < 3) { - throw "Usage: <# iterations> "; - } - - iterations = std::atoi(argv[1]); - if (iterations < 1) { - throw "ERROR: iterations must be >= 1"; - } - - length = std::atol(argv[2]); - if (length <= 0) { - throw "ERROR: vector length must be positive"; - } - - offset = (argc>3) ? std::atoi(argv[3]) : 0; - if (length <= 0) { - throw "ERROR: offset must be nonnegative"; - } - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } - - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Vector length = " << length << std::endl; - std::cout << "Offset = " << offset << std::endl; - - // SYCL device queue - cl::sycl::queue q; +// need to declare kernel class as template +// to prevent name mangling conflict below +template class nstream; +template +void run(cl::sycl::queue & q, int iterations, size_t length) +{ ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// double nstream_time(0); - std::vector h_A(length,0); - std::vector h_B(length,2); - std::vector h_C(length,2); + std::vector h_A(length); + std::vector h_B(length); + std::vector h_C(length); + + auto range = prk::range(static_cast(0), length); - double const scalar(3); + const T scalar(3); - { - // initialize device buffers from host buffers - cl::sycl::buffer d_A { h_A.data(), h_A.size() }; - cl::sycl::buffer d_B { h_B.data(), h_B.size() }; - cl::sycl::buffer 
d_C { h_C.data(), h_C.size() }; + try { + + cl::sycl::buffer d_A { h_A.data(), h_A.size() }; + cl::sycl::buffer d_B { h_B.data(), h_B.size() }; + cl::sycl::buffer d_C { h_C.data(), h_C.size() }; for (int iter = 0; iter<=iterations; ++iter) { @@ -133,12 +99,11 @@ int main(int argc, char * argv[]) q.submit([&](cl::sycl::handler& h) { - // accessor methods - auto A = d_A.get_access(h); - auto B = d_B.get_access(h); - auto C = d_C.get_access(h); + auto A = d_A.template get_access(h); + auto B = d_B.template get_access(h); + auto C = d_C.template get_access(h); - h.parallel_for(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { + h.parallel_for>(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { A[i] += B[i] + scalar * C[i]; }); }); @@ -150,6 +115,10 @@ int main(int argc, char * argv[]) // for other device-oriented programming models. nstream_time = prk::wtime() - nstream_time; } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + return; + } ////////////////////////////////////////////////////////////////////// /// Analyze and output results @@ -175,14 +144,105 @@ int main(int argc, char * argv[]) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; - return 1; } else { std::cout << "Solution validates" << std::endl; double avgtime = nstream_time/iterations; - double nbytes = 4.0 * length * sizeof(double); - std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + double nbytes = 4.0 * length * sizeof(T); + std::cout << 8*sizeof(T) << "B " + << "Rate (MB/s): " << 1.e-6*nbytes/avgtime << " Avg time (s): " << avgtime << std::endl; } +} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + 
////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + + try { + + if (1) { + cl::sycl::queue host(cl::sycl::host_selector{}); + auto device = host.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + + run(host, iterations, length); + run(host, iterations, length); + } + + // CPU requires spir64 target + if (1) { + cl::sycl::queue cpu(cl::sycl::cpu_selector{}); + auto device = cpu.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + + run(cpu, iterations, length); + run(cpu, iterations, length); + } + + // NVIDIA GPU requires ptx64 target and does not work very well + if (0) { + cl::sycl::queue gpu(cl::sycl::gpu_selector{}); + auto device = gpu.get_device(); + auto platform = device.get_platform(); + 
std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + + run(gpu, iterations, length); + run(gpu, iterations, length); + } + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + } return 0; } diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 0c30fea7f..5f3f62f03 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -26,7 +26,9 @@ DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math # See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details. # #DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed -DEFAULT_OPT_FLAGS+=-Wall +DEFAULT_OPT_FLAGS+=-Wall #-Werror +DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations +#DEFAULT_OPT_FLAGS+=-mavx -mfma # # OpenMP flags # @@ -43,6 +45,7 @@ OPENCLFLAG=-framework OpenCL #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations +METALFLAG=-framework MetalPerformanceShaders # # SYCL flags # @@ -67,12 +70,12 @@ SYCLFLAG=-I$(SYCLDIR)/include # # TBB # -TBBDIR=/usr/local/Cellar/tbb/2018_U3_1 +TBBDIR=/usr/local/Cellar/tbb/2019_U3_1 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. 
# -BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include +BOOSTFLAG=-I/usr/local/Cellar/boost/1.68.0_1/include #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} @@ -83,6 +86,19 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP # +# SYCL flags +# +# triSYCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... +SYCLDIR=./triSYCL +SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG} +SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} +# ProGTX +# https://github.com/ProGTX/sycl-gtx +#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx +#SYCLCXX=${CXX} ${OPENMPFLAG} +#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +# # CBLAS for C++ DGEMM # CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions diff --git a/common/make.defs.llvm b/common/make.defs.llvm index c8aa874ea..4929aa0bb 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -4,8 +4,8 @@ # # Base compilers and language options # -LLVM_ROOT=/usr/local/Cellar/llvm/6.0.0 -LLVM_PATH=${LLVM_ROOT}/bin/ +#LLVM_ROOT=/usr/local/Cellar/llvm/6.0.0 +#LLVM_PATH=${LLVM_ROOT}/bin/ #LLVM_PATH=/opt/llvm/HEAD/bin/ # C99 is required in some implementations. CC=${LLVM_PATH}clang -std=c11 -pthread @@ -47,13 +47,13 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib # OpenCL flags # # MacOS -OPENCLFLAG=-framework OpenCL +#OPENCLFLAG=-framework OpenCL # POCL # http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... 
#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL # Linux #OPENCLDIR=/etc/alternatives/opencl-intel-tools -#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations # # SYCL flags # @@ -76,8 +76,8 @@ SYCLFLAG+=-std=c++14 # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... SYCLDIR=./triSYCL -SYCLCXX=${CXX} ${OPENMPFLAG} -SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include $(DEFAULT_OPT_FLAGS) +SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) +SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # ProGTX # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx @@ -92,13 +92,15 @@ OCCADIR=${HOME}/prk-repo/Cxx11/occa # TBBDIR=/usr/local/Cellar/tbb/2018_U3_1 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +#TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb +#TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb # # Parallel STL, Boost, etc. 
# -BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include +#BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl RAJADIR=/opt/raja/clang From 3caffa78582bd47ebb47e1eb5bca51b36d4b52e6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 5 Mar 2019 13:24:21 -0800 Subject: [PATCH 122/245] OpenCL: add No Device errors (#373) * add No Device errors * errno needs to be included unconditionally --- C1z/prk_util.h | 2 +- Cxx11/nstream-opencl.cc | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/C1z/prk_util.h b/C1z/prk_util.h index 24e428552..313cca471 100644 --- a/C1z/prk_util.h +++ b/C1z/prk_util.h @@ -56,6 +56,7 @@ const bool false=0; #include // fabs #include // clock_gettime, timespec_get #include +#include #ifndef MIN #define MIN(x,y) ((x)<(y)?(x):(y)) @@ -142,7 +143,6 @@ int __cilkrts_get_nworkers(void); # include #else # define HAVE_PTHREADS -# include # include #endif diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc index b0241dd5d..9d81f1b8d 100644 --- a/Cxx11/nstream-opencl.cc +++ b/Cxx11/nstream-opencl.cc @@ -203,6 +203,8 @@ int main(int argc, char* argv[]) run(cpu, iterations, length); } run(cpu, iterations, length); + } else { + std::cerr << "No CPU" << std::endl; } cl::Context gpu(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err); @@ -216,6 +218,8 @@ int main(int argc, char* argv[]) run(gpu, iterations, length); } run(gpu, iterations, length); + } else { + std::cerr << "No GPU" << std::endl; } cl::Context acc(CL_DEVICE_TYPE_ACCELERATOR, NULL, NULL, NULL, &err); @@ -230,6 +234,8 @@ int main(int argc, char* argv[]) run(acc, iterations, length); } 
run(acc, iterations, length); + } else { + std::cerr << "No ACC" << std::endl; } return 0; From 8b626e1bd7b14e9c0115a50d6051c314b97656ff Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 2 Mar 2019 21:34:47 -0800 Subject: [PATCH 123/245] fix nstream correctness by initializing host vectors --- Cxx11/nstream-sycl.cc | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index b4c056990..2f7e64253 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -79,9 +79,9 @@ void run(cl::sycl::queue & q, int iterations, size_t length) double nstream_time(0); - std::vector h_A(length); - std::vector h_B(length); - std::vector h_C(length); + std::vector h_A(length,0); + std::vector h_B(length,2); + std::vector h_C(length,2); auto range = prk::range(static_cast(0), length); @@ -124,9 +124,9 @@ void run(cl::sycl::queue & q, int iterations, size_t length) /// Analyze and output results ////////////////////////////////////////////////////////////////////// - double ar(0); - double br(2); - double cr(2); + T ar(0); + T br(2); + T cr(2); for (int i=0; i<=iterations; ++i) { ar += br + scalar * cr; } @@ -138,7 +138,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length) asum += std::fabs(h_A[i]); } - double epsilon(1.e-8); + const double epsilon(1.e-8); if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" << " Expected checksum: " << ar << "\n" @@ -202,10 +202,12 @@ int main(int argc, char * argv[]) if (1) { cl::sycl::queue host(cl::sycl::host_selector{}); +#ifndef TRISYCL auto device = host.get_device(); - auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +#endif run(host, iterations, length); run(host, iterations, length); @@ -214,11 +216,13 @@ int main(int argc, char * 
argv[]) // CPU requires spir64 target if (1) { cl::sycl::queue cpu(cl::sycl::cpu_selector{}); +#ifndef TRISYCL auto device = cpu.get_device(); - auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); std::cout << "SYCL Platform: " << platform.get_info() << std::endl; //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; +#endif run(cpu, iterations, length); run(cpu, iterations, length); @@ -227,11 +231,13 @@ int main(int argc, char * argv[]) // NVIDIA GPU requires ptx64 target and does not work very well if (0) { cl::sycl::queue gpu(cl::sycl::gpu_selector{}); +#ifndef TRISYCL auto device = gpu.get_device(); - auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); std::cout << "SYCL Platform: " << platform.get_info() << std::endl; //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; +#endif run(gpu, iterations, length); run(gpu, iterations, length); From 40fa3819140ff90bc23a4058aecf6475ecc24d40 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 2 Mar 2019 21:34:59 -0800 Subject: [PATCH 124/245] make transpose-sycl multi-device etc --- Cxx11/transpose-sycl.cc | 180 +++++++++++++++++++++++++++------------- 1 file changed, 123 insertions(+), 57 deletions(-) diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index 1c8489806..d7b33e866 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -53,66 +53,33 @@ #include "prk_util.h" -int main(int argc, char * argv[]) -{ - std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl; - - ////////////////////////////////////////////////////////////////////// - /// Read and test input parameters - 
////////////////////////////////////////////////////////////////////// - - int iterations; - size_t order; - try { - if (argc < 3) { - throw "Usage: <# iterations> "; - } - - // number of times to do the transpose - iterations = std::atoi(argv[1]); - if (iterations < 1) { - throw "ERROR: iterations must be >= 1"; - } - - // order of a the matrix - order = std::atoi(argv[2]); - if (order <= 0) { - throw "ERROR: Matrix Order must be greater than 0"; - } else if (order > std::floor(std::sqrt(INT_MAX))) { - throw "ERROR: matrix dimension too large - overflow risk"; - } - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } - - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Matrix order = " << order << std::endl; +// need to declare kernel class as template +// to prevent name mangling conflict below +template class transpose; +template +void run(cl::sycl::queue & q, int iterations, size_t order) +{ ////////////////////////////////////////////////////////////////////// /// Allocate space for the input and transpose matrix ////////////////////////////////////////////////////////////////////// double trans_time(0); - std::vector h_A(order*order); - std::vector h_B(order*order,0.0); + std::vector h_A(order*order); + std::vector h_B(order*order,static_cast(0)); // fill A with the sequence 0 to order^2-1 as doubles - std::iota(h_A.begin(), h_A.end(), 0.0); + std::iota(h_A.begin(), h_A.end(), static_cast(0)); + + try { - // SYCL device queue - cl::sycl::queue q; - { - // initialize device buffers from host buffers #if USE_2D_INDEXING cl::sycl::buffer d_A( h_A.data(), cl::sycl::range<2>{order,order} ); cl::sycl::buffer d_B( h_B.data(), cl::sycl::range<2>{order,order} ); #else - cl::sycl::buffer d_A { h_A.data(), h_A.size() }; - cl::sycl::buffer d_B { h_B.data(), h_B.size() }; + cl::sycl::buffer d_A { h_A.data(), h_A.size() }; + cl::sycl::buffer d_B { h_B.data(), h_B.size() }; #endif for (int iter = 0; iter<=iterations; 
++iter) { @@ -122,19 +89,19 @@ int main(int argc, char * argv[]) q.submit([&](cl::sycl::handler& h) { // accessor methods - auto A = d_A.get_access(h); - auto B = d_B.get_access(h); + auto A = d_A.template get_access(h); + auto B = d_B.template get_access(h); // transpose - h.parallel_for(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { #if USE_2D_INDEXING cl::sycl::id<2> ij{it[0],it[1]}; cl::sycl::id<2> ji{it[1],it[0]}; B[ij] += A[ji]; - A[ji] += 1.0; + A[ji] += static_cast(1); #else B[it[0] * order + it[1]] += A[it[1] * order + it[0]]; - A[it[1] * order + it[0]] += 1.0; + A[it[1] * order + it[0]] += static_cast(1); #endif }); }); @@ -146,19 +113,23 @@ int main(int argc, char * argv[]) // for other device-oriented programming models. trans_time = prk::wtime() - trans_time; } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + return; + } ////////////////////////////////////////////////////////////////////// /// Analyze and output results ////////////////////////////////////////////////////////////////////// // TODO: replace with std::generate, std::accumulate, or similar - double const addit = (iterations+1.) * (iterations/2.); + const T addit = (iterations+1.) 
* (iterations/2.); double abserr(0); for (size_t i=0; i(ij)*(1.+iterations)+addit; + const T reference = static_cast(ij)*(1.+iterations)+addit; abserr += std::fabs(h_B[ji] - reference); } } @@ -167,19 +138,114 @@ int main(int argc, char * argv[]) std::cout << "Sum of absolute differences: " << abserr << std::endl; #endif - double const epsilon(1.0e-8); + const double epsilon(1.0e-8); if (abserr < epsilon) { std::cout << "Solution validates" << std::endl; - auto avgtime = trans_time/iterations; - auto bytes = (size_t)order * (size_t)order * sizeof(double); - std::cout << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime + double avgtime = trans_time/iterations; + double bytes = (size_t)order * (size_t)order * sizeof(T); + std::cout << 8*sizeof(T) << "B " + << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime << " Avg time (s): " << avgtime << std::endl; } else { std::cout << "ERROR: Aggregate squared error " << abserr << " exceeds threshold " << epsilon << std::endl; + } +} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t order; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + // number of times to do the transpose + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // order of a the matrix + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; return 1; } + std::cout << "Number of iterations = " << iterations << std::endl; 
+ std::cout << "Matrix order = " << order << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + + try { + + if (1) { + cl::sycl::queue host(cl::sycl::host_selector{}); +#ifndef TRISYCL + auto device = host.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +#endif + + run(host, iterations, order); + run(host, iterations, order); + } + + // CPU requires spir64 target + if (1) { + cl::sycl::queue cpu(cl::sycl::cpu_selector{}); +#ifndef TRISYCL + auto device = cpu.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; +#endif + + run(cpu, iterations, order); + run(cpu, iterations, order); + } + + // NVIDIA GPU requires ptx64 target and does not work very well + if (0) { + cl::sycl::queue gpu(cl::sycl::gpu_selector{}); +#ifndef TRISYCL + auto device = gpu.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; +#endif + + run(gpu, iterations, order); + run(gpu, iterations, order); + } + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + } + return 0; } From f9425e9c2719e80172d56290c8da0de09154a146 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 2 Mar 2019 21:42:45 -0800 Subject: [PATCH 125/245] templatize stencil 
sycl kernel over type --- Cxx11/generate-sycl-stencil.py | 33 ++-- Cxx11/stencil_sycl.hpp | 270 +++++++++++++++++---------------- 2 files changed, 157 insertions(+), 146 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index 1c71ff03c..6f797dac7 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -6,13 +6,14 @@ import os def codegen(src,pattern,stencil_size,radius,model,dim): + src.write('template \n') src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n, ') if (dim==2): - src.write('cl::sycl::buffer & d_in, ') - src.write('cl::sycl::buffer & d_out)\n') + src.write('cl::sycl::buffer & d_in, ') + src.write('cl::sycl::buffer & d_out)\n') else: - src.write('cl::sycl::buffer & d_in, ') - src.write('cl::sycl::buffer & d_out)\n') + src.write('cl::sycl::buffer & d_in, ') + src.write('cl::sycl::buffer & d_out)\n') src.write('{\n') src.write(' q.submit([&](cl::sycl::handler& h) {\n') src.write(' auto in = d_in.get_access(h);\n') @@ -41,36 +42,36 @@ def codegen(src,pattern,stencil_size,radius,model,dim): if i > 1: src.write('\n') src.write(19*' ') - src.write('+in[xy+dx'+str(i)+'] * '+str(+1./(2.*i*radius))) + src.write('+in[xy+dx'+str(i)+'] * static_cast('+str(+1./(2.*i*radius))+')') src.write('\n'+19*' ') - src.write('+in[xy-dx'+str(i)+'] * '+str(-1./(2.*i*radius))) + src.write('+in[xy-dx'+str(i)+'] * static_cast('+str(-1./(2.*i*radius))+')') src.write('\n'+19*' ') - src.write('+in[xy+dy'+str(i)+'] * '+str(+1./(2.*i*radius))) + src.write('+in[xy+dy'+str(i)+'] * static_cast('+str(+1./(2.*i*radius))+')') src.write('\n'+19*' ') - src.write('+in[xy-dy'+str(i)+'] * '+str(-1./(2.*i*radius))) + src.write('+in[xy-dy'+str(i)+'] * static_cast('+str(-1./(2.*i*radius))+')') else: # 1D indexing the slow way #if i > 1: # src.write('\n') # src.write(22*' ') - #src.write('+in[i*n+(j+'+str(i)+')] * '+str(+1./(2.*i*radius))) + #src.write('+in[i*n+(j+'+str(i)+')] * 
static_cast('+str(+1./(2.*i*radius))+')') #src.write('\n'+22*' ') - #src.write('+in[i*n+(j-'+str(i)+')] * '+str(-1./(2.*i*radius))) + #src.write('+in[i*n+(j-'+str(i)+')] * static_cast('+str(-1./(2.*i*radius))+')') #src.write('\n'+22*' ') - #src.write('+in[(i+'+str(i)+')*n+j] * '+str(+1./(2.*i*radius))) + #src.write('+in[(i+'+str(i)+')*n+j] * static_cast('+str(+1./(2.*i*radius))+')') #src.write('\n'+22*' ') - #src.write('+in[(i-'+str(i)+')*n+j] * '+str(-1./(2.*i*radius))) + #src.write('+in[(i-'+str(i)+')*n+j] * static_cast('+str(-1./(2.*i*radius))+')') # 1D indexing the fast way if i > 1: src.write('\n') src.write(30*' ') - src.write('+in[it[0]*n+(it[1]+'+str(i)+')] * '+str(+1./(2.*i*radius))) + src.write('+in[it[0]*n+(it[1]+'+str(i)+')] * static_cast('+str(+1./(2.*i*radius))+')') src.write('\n'+30*' ') - src.write('+in[it[0]*n+(it[1]-'+str(i)+')] * '+str(-1./(2.*i*radius))) + src.write('+in[it[0]*n+(it[1]-'+str(i)+')] * static_cast('+str(-1./(2.*i*radius))+')') src.write('\n'+30*' ') - src.write('+in[(it[0]+'+str(i)+')*n+it[1]] * '+str(+1./(2.*i*radius))) + src.write('+in[(it[0]+'+str(i)+')*n+it[1]] * static_cast('+str(+1./(2.*i*radius))+')') src.write('\n'+30*' ') - src.write('+in[(it[0]-'+str(i)+')*n+it[1]] * '+str(-1./(2.*i*radius))) + src.write('+in[(it[0]-'+str(i)+')*n+it[1]] * static_cast('+str(-1./(2.*i*radius))+')') if i == radius: src.write(';\n') else: diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 6fbf8d9f7..c8a9d0a5b 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -1,18 +1,20 @@ -void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); h.parallel_for(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += 
+in[it[0]*n+(it[1]+1)] * 0.5 - +in[it[0]*n+(it[1]-1)] * -0.5 - +in[(it[0]+1)*n+it[1]] * 0.5 - +in[(it[0]-1)*n+it[1]] * -0.5; + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.5) + +in[it[0]*n+(it[1]-1)] * static_cast(-0.5) + +in[(it[0]+1)*n+it[1]] * static_cast(0.5) + +in[(it[0]-1)*n+it[1]] * static_cast(-0.5); }); }); } -void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); @@ -21,33 +23,35 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); h.parallel_for(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); - out[xy] += +in[xy+dx1] * 0.5 - +in[xy-dx1] * -0.5 - +in[xy+dy1] * 0.5 - +in[xy-dy1] * -0.5; + out[xy] += +in[xy+dx1] * static_cast(0.5) + +in[xy-dx1] * static_cast(-0.5) + +in[xy+dy1] * static_cast(0.5) + +in[xy-dy1] * static_cast(-0.5); }); }); } -void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); h.parallel_for(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.25 - +in[it[0]*n+(it[1]-1)] * -0.25 - +in[(it[0]+1)*n+it[1]] * 0.25 - +in[(it[0]-1)*n+it[1]] * -0.25 - +in[it[0]*n+(it[1]+2)] * 0.125 - +in[it[0]*n+(it[1]-2)] * -0.125 - +in[(it[0]+2)*n+it[1]] * 0.125 - +in[(it[0]-2)*n+it[1]] * -0.125; + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.25) + +in[it[0]*n+(it[1]-1)] * static_cast(-0.25) + +in[(it[0]+1)*n+it[1]] * static_cast(0.25) + 
+in[(it[0]-1)*n+it[1]] * static_cast(-0.25) + +in[it[0]*n+(it[1]+2)] * static_cast(0.125) + +in[it[0]*n+(it[1]-2)] * static_cast(-0.125) + +in[(it[0]+2)*n+it[1]] * static_cast(0.125) + +in[(it[0]-2)*n+it[1]] * static_cast(-0.125); }); }); } -void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); @@ -58,41 +62,43 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); h.parallel_for(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); - out[xy] += +in[xy+dx1] * 0.25 - +in[xy-dx1] * -0.25 - +in[xy+dy1] * 0.25 - +in[xy-dy1] * -0.25 - +in[xy+dx2] * 0.125 - +in[xy-dx2] * -0.125 - +in[xy+dy2] * 0.125 - +in[xy-dy2] * -0.125; + out[xy] += +in[xy+dx1] * static_cast(0.25) + +in[xy-dx1] * static_cast(-0.25) + +in[xy+dy1] * static_cast(0.25) + +in[xy-dy1] * static_cast(-0.25) + +in[xy+dx2] * static_cast(0.125) + +in[xy-dx2] * static_cast(-0.125) + +in[xy+dy2] * static_cast(0.125) + +in[xy-dy2] * static_cast(-0.125); }); }); } -void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); h.parallel_for(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.16666666666666666 - +in[it[0]*n+(it[1]-1)] * -0.16666666666666666 - +in[(it[0]+1)*n+it[1]] * 0.16666666666666666 - +in[(it[0]-1)*n+it[1]] * -0.16666666666666666 - +in[it[0]*n+(it[1]+2)] * 0.08333333333333333 - +in[it[0]*n+(it[1]-2)] * -0.08333333333333333 - 
+in[(it[0]+2)*n+it[1]] * 0.08333333333333333 - +in[(it[0]-2)*n+it[1]] * -0.08333333333333333 - +in[it[0]*n+(it[1]+3)] * 0.05555555555555555 - +in[it[0]*n+(it[1]-3)] * -0.05555555555555555 - +in[(it[0]+3)*n+it[1]] * 0.05555555555555555 - +in[(it[0]-3)*n+it[1]] * -0.05555555555555555; + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.166666666667) + +in[it[0]*n+(it[1]-1)] * static_cast(-0.166666666667) + +in[(it[0]+1)*n+it[1]] * static_cast(0.166666666667) + +in[(it[0]-1)*n+it[1]] * static_cast(-0.166666666667) + +in[it[0]*n+(it[1]+2)] * static_cast(0.0833333333333) + +in[it[0]*n+(it[1]-2)] * static_cast(-0.0833333333333) + +in[(it[0]+2)*n+it[1]] * static_cast(0.0833333333333) + +in[(it[0]-2)*n+it[1]] * static_cast(-0.0833333333333) + +in[it[0]*n+(it[1]+3)] * static_cast(0.0555555555556) + +in[it[0]*n+(it[1]-3)] * static_cast(-0.0555555555556) + +in[(it[0]+3)*n+it[1]] * static_cast(0.0555555555556) + +in[(it[0]-3)*n+it[1]] * static_cast(-0.0555555555556); }); }); } -void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); @@ -105,49 +111,51 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); h.parallel_for(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); - out[xy] += +in[xy+dx1] * 0.16666666666666666 - +in[xy-dx1] * -0.16666666666666666 - +in[xy+dy1] * 0.16666666666666666 - +in[xy-dy1] * -0.16666666666666666 - +in[xy+dx2] * 0.08333333333333333 - +in[xy-dx2] * -0.08333333333333333 - +in[xy+dy2] * 0.08333333333333333 - +in[xy-dy2] * -0.08333333333333333 - +in[xy+dx3] * 0.05555555555555555 - +in[xy-dx3] * -0.05555555555555555 - +in[xy+dy3] * 0.05555555555555555 - +in[xy-dy3] * -0.05555555555555555; + out[xy] 
+= +in[xy+dx1] * static_cast(0.166666666667) + +in[xy-dx1] * static_cast(-0.166666666667) + +in[xy+dy1] * static_cast(0.166666666667) + +in[xy-dy1] * static_cast(-0.166666666667) + +in[xy+dx2] * static_cast(0.0833333333333) + +in[xy-dx2] * static_cast(-0.0833333333333) + +in[xy+dy2] * static_cast(0.0833333333333) + +in[xy-dy2] * static_cast(-0.0833333333333) + +in[xy+dx3] * static_cast(0.0555555555556) + +in[xy-dx3] * static_cast(-0.0555555555556) + +in[xy+dy3] * static_cast(0.0555555555556) + +in[xy-dy3] * static_cast(-0.0555555555556); }); }); } -void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); h.parallel_for(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.125 - +in[it[0]*n+(it[1]-1)] * -0.125 - +in[(it[0]+1)*n+it[1]] * 0.125 - +in[(it[0]-1)*n+it[1]] * -0.125 - +in[it[0]*n+(it[1]+2)] * 0.0625 - +in[it[0]*n+(it[1]-2)] * -0.0625 - +in[(it[0]+2)*n+it[1]] * 0.0625 - +in[(it[0]-2)*n+it[1]] * -0.0625 - +in[it[0]*n+(it[1]+3)] * 0.041666666666666664 - +in[it[0]*n+(it[1]-3)] * -0.041666666666666664 - +in[(it[0]+3)*n+it[1]] * 0.041666666666666664 - +in[(it[0]-3)*n+it[1]] * -0.041666666666666664 - +in[it[0]*n+(it[1]+4)] * 0.03125 - +in[it[0]*n+(it[1]-4)] * -0.03125 - +in[(it[0]+4)*n+it[1]] * 0.03125 - +in[(it[0]-4)*n+it[1]] * -0.03125; + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.125) + +in[it[0]*n+(it[1]-1)] * static_cast(-0.125) + +in[(it[0]+1)*n+it[1]] * static_cast(0.125) + +in[(it[0]-1)*n+it[1]] * static_cast(-0.125) + +in[it[0]*n+(it[1]+2)] * static_cast(0.0625) + +in[it[0]*n+(it[1]-2)] * static_cast(-0.0625) + +in[(it[0]+2)*n+it[1]] * static_cast(0.0625) + +in[(it[0]-2)*n+it[1]] * static_cast(-0.0625) + 
+in[it[0]*n+(it[1]+3)] * static_cast(0.0416666666667) + +in[it[0]*n+(it[1]-3)] * static_cast(-0.0416666666667) + +in[(it[0]+3)*n+it[1]] * static_cast(0.0416666666667) + +in[(it[0]-3)*n+it[1]] * static_cast(-0.0416666666667) + +in[it[0]*n+(it[1]+4)] * static_cast(0.03125) + +in[it[0]*n+(it[1]-4)] * static_cast(-0.03125) + +in[(it[0]+4)*n+it[1]] * static_cast(0.03125) + +in[(it[0]-4)*n+it[1]] * static_cast(-0.03125); }); }); } -void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); @@ -162,57 +170,59 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); h.parallel_for(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); - out[xy] += +in[xy+dx1] * 0.125 - +in[xy-dx1] * -0.125 - +in[xy+dy1] * 0.125 - +in[xy-dy1] * -0.125 - +in[xy+dx2] * 0.0625 - +in[xy-dx2] * -0.0625 - +in[xy+dy2] * 0.0625 - +in[xy-dy2] * -0.0625 - +in[xy+dx3] * 0.041666666666666664 - +in[xy-dx3] * -0.041666666666666664 - +in[xy+dy3] * 0.041666666666666664 - +in[xy-dy3] * -0.041666666666666664 - +in[xy+dx4] * 0.03125 - +in[xy-dx4] * -0.03125 - +in[xy+dy4] * 0.03125 - +in[xy-dy4] * -0.03125; + out[xy] += +in[xy+dx1] * static_cast(0.125) + +in[xy-dx1] * static_cast(-0.125) + +in[xy+dy1] * static_cast(0.125) + +in[xy-dy1] * static_cast(-0.125) + +in[xy+dx2] * static_cast(0.0625) + +in[xy-dx2] * static_cast(-0.0625) + +in[xy+dy2] * static_cast(0.0625) + +in[xy-dy2] * static_cast(-0.0625) + +in[xy+dx3] * static_cast(0.0416666666667) + +in[xy-dx3] * static_cast(-0.0416666666667) + +in[xy+dy3] * static_cast(0.0416666666667) + +in[xy-dy3] * static_cast(-0.0416666666667) + +in[xy+dx4] * static_cast(0.03125) + +in[xy-dx4] * static_cast(-0.03125) + +in[xy+dy4] * 
static_cast(0.03125) + +in[xy-dy4] * static_cast(-0.03125); }); }); } -void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); auto out = d_out.get_access(h); h.parallel_for(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * 0.1 - +in[it[0]*n+(it[1]-1)] * -0.1 - +in[(it[0]+1)*n+it[1]] * 0.1 - +in[(it[0]-1)*n+it[1]] * -0.1 - +in[it[0]*n+(it[1]+2)] * 0.05 - +in[it[0]*n+(it[1]-2)] * -0.05 - +in[(it[0]+2)*n+it[1]] * 0.05 - +in[(it[0]-2)*n+it[1]] * -0.05 - +in[it[0]*n+(it[1]+3)] * 0.03333333333333333 - +in[it[0]*n+(it[1]-3)] * -0.03333333333333333 - +in[(it[0]+3)*n+it[1]] * 0.03333333333333333 - +in[(it[0]-3)*n+it[1]] * -0.03333333333333333 - +in[it[0]*n+(it[1]+4)] * 0.025 - +in[it[0]*n+(it[1]-4)] * -0.025 - +in[(it[0]+4)*n+it[1]] * 0.025 - +in[(it[0]-4)*n+it[1]] * -0.025 - +in[it[0]*n+(it[1]+5)] * 0.02 - +in[it[0]*n+(it[1]-5)] * -0.02 - +in[(it[0]+5)*n+it[1]] * 0.02 - +in[(it[0]-5)*n+it[1]] * -0.02; + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.1) + +in[it[0]*n+(it[1]-1)] * static_cast(-0.1) + +in[(it[0]+1)*n+it[1]] * static_cast(0.1) + +in[(it[0]-1)*n+it[1]] * static_cast(-0.1) + +in[it[0]*n+(it[1]+2)] * static_cast(0.05) + +in[it[0]*n+(it[1]-2)] * static_cast(-0.05) + +in[(it[0]+2)*n+it[1]] * static_cast(0.05) + +in[(it[0]-2)*n+it[1]] * static_cast(-0.05) + +in[it[0]*n+(it[1]+3)] * static_cast(0.0333333333333) + +in[it[0]*n+(it[1]-3)] * static_cast(-0.0333333333333) + +in[(it[0]+3)*n+it[1]] * static_cast(0.0333333333333) + +in[(it[0]-3)*n+it[1]] * static_cast(-0.0333333333333) + +in[it[0]*n+(it[1]+4)] * static_cast(0.025) + +in[it[0]*n+(it[1]-4)] * static_cast(-0.025) + +in[(it[0]+4)*n+it[1]] * static_cast(0.025) + +in[(it[0]-4)*n+it[1]] * 
static_cast(-0.025) + +in[it[0]*n+(it[1]+5)] * static_cast(0.02) + +in[it[0]*n+(it[1]-5)] * static_cast(-0.02) + +in[(it[0]+5)*n+it[1]] * static_cast(0.02) + +in[(it[0]-5)*n+it[1]] * static_cast(-0.02); }); }); } -void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.get_access(h); @@ -229,26 +239,26 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_ cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5}); h.parallel_for(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); - out[xy] += +in[xy+dx1] * 0.1 - +in[xy-dx1] * -0.1 - +in[xy+dy1] * 0.1 - +in[xy-dy1] * -0.1 - +in[xy+dx2] * 0.05 - +in[xy-dx2] * -0.05 - +in[xy+dy2] * 0.05 - +in[xy-dy2] * -0.05 - +in[xy+dx3] * 0.03333333333333333 - +in[xy-dx3] * -0.03333333333333333 - +in[xy+dy3] * 0.03333333333333333 - +in[xy-dy3] * -0.03333333333333333 - +in[xy+dx4] * 0.025 - +in[xy-dx4] * -0.025 - +in[xy+dy4] * 0.025 - +in[xy-dy4] * -0.025 - +in[xy+dx5] * 0.02 - +in[xy-dx5] * -0.02 - +in[xy+dy5] * 0.02 - +in[xy-dy5] * -0.02; + out[xy] += +in[xy+dx1] * static_cast(0.1) + +in[xy-dx1] * static_cast(-0.1) + +in[xy+dy1] * static_cast(0.1) + +in[xy-dy1] * static_cast(-0.1) + +in[xy+dx2] * static_cast(0.05) + +in[xy-dx2] * static_cast(-0.05) + +in[xy+dy2] * static_cast(0.05) + +in[xy-dy2] * static_cast(-0.05) + +in[xy+dx3] * static_cast(0.0333333333333) + +in[xy-dx3] * static_cast(-0.0333333333333) + +in[xy+dy3] * static_cast(0.0333333333333) + +in[xy-dy3] * static_cast(-0.0333333333333) + +in[xy+dx4] * static_cast(0.025) + +in[xy-dx4] * static_cast(-0.025) + +in[xy+dy4] * static_cast(0.025) + +in[xy-dy4] * static_cast(-0.025) + +in[xy+dx5] * static_cast(0.02) + +in[xy-dx5] * static_cast(-0.02) + +in[xy+dy5] * static_cast(0.02) + +in[xy-dy5] * 
static_cast(-0.02); }); }); } From 19d457cf04c17a485897c306aa79db20a708520f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 2 Mar 2019 21:59:31 -0800 Subject: [PATCH 126/245] SYCL stencil now templated --- Cxx11/generate-sycl-stencil.py | 4 +- Cxx11/nstream-sycl.cc | 4 + Cxx11/stencil-sycl.cc | 267 +++++++++++++++++++++------------ Cxx11/stencil_sycl.hpp | 40 ++--- Cxx11/transpose-sycl.cc | 4 + 5 files changed, 201 insertions(+), 118 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index 6f797dac7..e1857e8c2 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -16,8 +16,8 @@ def codegen(src,pattern,stencil_size,radius,model,dim): src.write('cl::sycl::buffer & d_out)\n') src.write('{\n') src.write(' q.submit([&](cl::sycl::handler& h) {\n') - src.write(' auto in = d_in.get_access(h);\n') - src.write(' auto out = d_out.get_access(h);\n') + src.write(' auto in = d_in.template get_access(h);\n') + src.write(' auto out = d_out.template get_access(h);\n') if (dim==2): for r in range(1,radius+1): src.write(' cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n') diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 2f7e64253..bb94391ad 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -119,6 +119,10 @@ void run(cl::sycl::queue & q, int iterations, size_t length) std::cout << e.what() << std::endl; return; } + catch (std::exception e) { + std::cout << e.what() << std::endl; + return; + } ////////////////////////////////////////////////////////////////////// /// Analyze and output results diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index e42eaef50..585fe62e9 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -65,10 +65,15 @@ #include "prk_util.h" #include "stencil_sycl.hpp" +template class init; +template class add; + #if USE_2D_INDEXING -void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, 
cl::sycl::buffer & d_out) +template +void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) #else -void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +template +void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) #endif { std::cout << "You are trying to use a stencil that does not exist.\n"; @@ -77,74 +82,10 @@ void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_i std::abort(); } -int main(int argc, char* argv[]) +template +void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool star, size_t radius) { - std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11/SYCL Stencil execution on 2D grid" << std::endl; - - ////////////////////////////////////////////////////////////////////// - // Process and test input parameters - ////////////////////////////////////////////////////////////////////// - - int iterations; - size_t n, tile_size; - bool star = true; - size_t radius = 2; - try { - if (argc < 3) { - throw "Usage: <# iterations> [ ]"; - } - - // number of times to run the algorithm - iterations = std::atoi(argv[1]); - if (iterations < 1) { - throw "ERROR: iterations must be >= 1"; - } - - // linear grid dimension - n = std::atoi(argv[2]); - if (n < 1) { - throw "ERROR: grid dimension must be positive"; - } else if (n > std::floor(std::sqrt(INT_MAX))) { - throw "ERROR: grid dimension too large - overflow risk"; - } - - // default tile size for tiling of local transpose - tile_size = 32; - if (argc > 3) { - tile_size = std::atoi(argv[3]); - if (tile_size <= 0) tile_size = n; - if (tile_size > n) tile_size = n; - } - - // stencil pattern - if (argc > 4) { - auto stencil = std::string(argv[4]); - auto grid = std::string("grid"); - star = (stencil == grid) ? 
false : true; - } - - // stencil radius - radius = 2; - if (argc > 5) { - radius = std::atoi(argv[5]); - } - - if ( (radius < 1) || (2*radius+1 > n) ) { - throw "ERROR: Stencil radius negative or too large"; - } - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } - - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Grid size = " << n << std::endl; - std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; - std::cout << "Radius of stencil = " << radius << std::endl; - - auto stencil = nothing; + auto stencil = nothing; if (star) { switch (radius) { case 1: stencil = star1; break; @@ -170,40 +111,39 @@ int main(int argc, char* argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - auto stencil_time = 0.0; + double stencil_time(0); - std::vector h_in(n*n,0.0); - std::vector h_out(n*n,0.0); + std::vector h_in(n*n,0); + std::vector h_out(n*n,0); + + try { - // SYCL device queue - cl::sycl::queue q; - { // initialize device buffers from host buffers #if USE_2D_INDEXING - cl::sycl::buffer d_in { cl::sycl::range<2> {n, n} }; - cl::sycl::buffer d_out { h_out.data(), cl::sycl::range<2> {n, n} }; + cl::sycl::buffer d_in { cl::sycl::range<2> {n, n} }; + cl::sycl::buffer d_out { h_out.data(), cl::sycl::range<2> {n, n} }; #else // FIXME: if I don't initialize this buffer from host, the results are wrong. Why? 
- //cl::sycl::buffer d_in { cl::sycl::range<1> {n*n} }; - cl::sycl::buffer d_in { h_in.data(), h_in.size() }; - cl::sycl::buffer d_out { h_out.data(), h_out.size() }; + //cl::sycl::buffer d_in { cl::sycl::range<1> {n*n} }; + cl::sycl::buffer d_in { h_in.data(), h_in.size() }; + cl::sycl::buffer d_out { h_out.data(), h_out.size() }; #endif q.submit([&](cl::sycl::handler& h) { // accessor methods - auto in = d_in.get_access(h); + auto in = d_in.template get_access(h); - h.parallel_for(cl::sycl::range<2> {n, n}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2> {n, n}, [=] (cl::sycl::item<2> it) { #if USE_2D_INDEXING cl::sycl::id<2> xy = it.get_id(); auto i = it[0]; auto j = it[1]; - in[xy] = static_cast(i+j); + in[xy] = static_cast(i+j); #else auto i = it[0]; auto j = it[1]; - in[i*n+j] = static_cast(i+j); + in[i*n+j] = static_cast(i+j); #endif }); }); @@ -214,27 +154,28 @@ int main(int argc, char* argv[]) if (iter==1) stencil_time = prk::wtime(); stencil(q, n, d_in, d_out); - // This is only necessary with triSYCL +#ifdef TRISYCL q.wait(); +#endif q.submit([&](cl::sycl::handler& h) { // accessor methods - auto in = d_in.get_access(h); + auto in = d_in.template get_access(h); // Add constant to solution to force refresh of neighbor data, if any - h.parallel_for(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0}, + h.parallel_for>(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0}, [=] (cl::sycl::item<2> it) { #if USE_2D_INDEXING cl::sycl::id<2> xy = it.get_id(); - in[xy] += 1.0; + in[xy] += static_cast(1); #else #if 0 // This is noticeably slower :-( auto i = it[0]; auto j = it[1]; in[i*n+j] += 1.0; #else - in[it[0]*n+it[1]] += 1.0; + in[it[0]*n+it[1]] += static_cast(1); #endif #endif }); @@ -243,6 +184,14 @@ int main(int argc, char* argv[]) } stencil_time = prk::wtime() - stencil_time; } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + return; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + return; + } 
#if 0 for (auto i=0; i epsilon) { std::cout << "ERROR: L1 norm = " << norm << " Reference L1 norm = " << reference_norm << std::endl; - return 1; } else { std::cout << "Solution validates" << std::endl; #ifdef VERBOSE @@ -283,10 +231,137 @@ int main(int argc, char* argv[]) #endif const size_t stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); size_t flops = (2L*stencil_size+1L) * active_points; - auto avgtime = stencil_time/iterations; - std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + double avgtime = stencil_time/iterations; + std::cout << 8*sizeof(T) << "B " + << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime << " Avg time (s): " << avgtime << std::endl; } +} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t n, tile_size; + bool star = true; + size_t radius = 2; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? 
false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + + try { + + if (1) { + cl::sycl::queue host(cl::sycl::host_selector{}); +#ifndef TRISYCL + auto device = host.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +#endif + + run(host, iterations, n, tile_size, star, radius); + run(host, iterations, n, tile_size, star, radius); + } + + // CPU requires spir64 target + if (1) { + cl::sycl::queue cpu(cl::sycl::cpu_selector{}); +#ifndef TRISYCL + auto device = cpu.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; +#endif + + run(cpu, iterations, n, tile_size, star, radius); + run(cpu, iterations, n, tile_size, star, radius); + } + + // NVIDIA GPU requires ptx64 target and does not work very well + if (0) { + cl::sycl::queue gpu(cl::sycl::gpu_selector{}); +#ifndef TRISYCL + auto device = gpu.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout 
<< "SYCL Platform: " << platform.get_info() << std::endl; + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; +#endif + + run(gpu, iterations, n, tile_size, star, radius); + run(gpu, iterations, n, tile_size, star, radius); + } + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + } return 0; } + + diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index c8a9d0a5b..587f2adc7 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -2,8 +2,8 @@ template void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); h.parallel_for(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.5) +in[it[0]*n+(it[1]-1)] * static_cast(-0.5) @@ -17,8 +17,8 @@ template void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); h.parallel_for(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { @@ -35,8 +35,8 @@ template void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); h.parallel_for(cl::sycl::range<2> {n-4,n-4}, 
cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.25) +in[it[0]*n+(it[1]-1)] * static_cast(-0.25) @@ -54,8 +54,8 @@ template void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); @@ -78,8 +78,8 @@ template void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); h.parallel_for(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.166666666667) +in[it[0]*n+(it[1]-1)] * static_cast(-0.166666666667) @@ -101,8 +101,8 @@ template void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); @@ -131,8 +131,8 @@ template void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); h.parallel_for(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> 
{4,4}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.125) +in[it[0]*n+(it[1]-1)] * static_cast(-0.125) @@ -158,8 +158,8 @@ template void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); @@ -194,8 +194,8 @@ template void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); h.parallel_for(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.1) +in[it[0]*n+(it[1]-1)] * static_cast(-0.1) @@ -225,8 +225,8 @@ template void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { - auto in = d_in.get_access(h); - auto out = d_out.get_access(h); + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index d7b33e866..97bb8a09c 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -117,6 +117,10 @@ void run(cl::sycl::queue & q, int iterations, size_t order) std::cout << e.what() << std::endl; return; } + catch (std::exception e) { + std::cout << e.what() << std::endl; + return; + } 
////////////////////////////////////////////////////////////////////// /// Analyze and output results From 01ad66d53f7453d5b484db337c1d298714d51c0f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 3 Mar 2019 11:34:55 -0800 Subject: [PATCH 127/245] try to detect working configs better --- Cxx11/nstream-sycl.cc | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index bb94391ad..bebfb5932 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -225,26 +225,39 @@ int main(int argc, char * argv[]) std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); +#else + bool has_spir = true; // ? #endif - - run(cpu, iterations, length); - run(cpu, iterations, length); + if (has_spir) { + run(cpu, iterations, length); + run(cpu, iterations, length); + } } // NVIDIA GPU requires ptx64 target and does not work very well - if (0) { + if (1) { cl::sycl::queue gpu(cl::sycl::gpu_selector{}); #ifndef TRISYCL auto device = gpu.get_device(); std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); +#else + bool has_spir = true; // ? #endif - - run(gpu, iterations, length); - run(gpu, iterations, length); + if (has_spir) { + run(gpu, iterations, length); + run(gpu, iterations, length); + } else { + std::cout << "SYCL GPU device lacks SPIR-V support." 
<< std::endl; +#ifdef __COMPUTECPP__ + std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl; + run(gpu, iterations, length); + run(gpu, iterations, length); +#endif + } } } catch (cl::sycl::exception e) { From 548fe7aecfb259197db4491fc77b07b6c1cc0749 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 4 Mar 2019 11:48:27 -0800 Subject: [PATCH 128/245] forward-declare kernel names in SYCL stencil --- Cxx11/generate-sycl-stencil.py | 7 ++- Cxx11/stencil_sycl.hpp | 111 ++++++++++++++++++--------------- 2 files changed, 67 insertions(+), 51 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index e1857e8c2..f0feb9e3f 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -22,7 +22,7 @@ def codegen(src,pattern,stencil_size,radius,model,dim): for r in range(1,radius+1): src.write(' cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n') src.write(' cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n') - src.write(' h.parallel_for(') + src.write(' h.parallel_for>(') src.write('cl::sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ') src.write('cl::sycl::id<2> {'+str(radius)+','+str(radius)+'}, ') src.write('[=] (cl::sycl::item<2> it) {\n') @@ -91,6 +91,11 @@ def instance(src,model,pattern,r): def main(): for model in ['sycl']: src = open('stencil_'+model+'.hpp','w') + for pattern in ['star']: + for r in range(1,6): + src.write('template class '+pattern+str(r)+'_1d;\n') + src.write('template class '+pattern+str(r)+'_2d;\n') + src.write('\n') #for pattern in ['star','grid']: for pattern in ['star']: for r in range(1,6): diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 587f2adc7..799c86573 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -1,10 +1,21 @@ +template class star1_1d; +template class star1_2d; +template class star2_1d; +template class star2_2d; +template class star3_1d; +template class star3_2d; 
+template class star4_1d; +template class star4_2d; +template class star5_1d; +template class star5_2d; + template void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { q.submit([&](cl::sycl::handler& h) { auto in = d_in.template get_access(h); auto out = d_out.template get_access(h); - h.parallel_for(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.5) +in[it[0]*n+(it[1]-1)] * static_cast(-0.5) +in[(it[0]+1)*n+it[1]] * static_cast(0.5) @@ -21,7 +32,7 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c auto out = d_out.template get_access(h); cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - h.parallel_for(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * static_cast(0.5) +in[xy-dx1] * static_cast(-0.5) @@ -37,7 +48,7 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: q.submit([&](cl::sycl::handler& h) { auto in = d_in.template get_access(h); auto out = d_out.template get_access(h); - h.parallel_for(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.25) +in[it[0]*n+(it[1]-1)] * static_cast(-0.25) +in[(it[0]+1)*n+it[1]] * static_cast(0.25) @@ -60,7 +71,7 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); cl::sycl::id<2> 
dy2(cl::sycl::range<2> {0,2}); - h.parallel_for(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * static_cast(0.25) +in[xy-dx1] * static_cast(-0.25) @@ -80,19 +91,19 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: q.submit([&](cl::sycl::handler& h) { auto in = d_in.template get_access(h); auto out = d_out.template get_access(h); - h.parallel_for(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.166666666667) - +in[it[0]*n+(it[1]-1)] * static_cast(-0.166666666667) - +in[(it[0]+1)*n+it[1]] * static_cast(0.166666666667) - +in[(it[0]-1)*n+it[1]] * static_cast(-0.166666666667) - +in[it[0]*n+(it[1]+2)] * static_cast(0.0833333333333) - +in[it[0]*n+(it[1]-2)] * static_cast(-0.0833333333333) - +in[(it[0]+2)*n+it[1]] * static_cast(0.0833333333333) - +in[(it[0]-2)*n+it[1]] * static_cast(-0.0833333333333) - +in[it[0]*n+(it[1]+3)] * static_cast(0.0555555555556) - +in[it[0]*n+(it[1]-3)] * static_cast(-0.0555555555556) - +in[(it[0]+3)*n+it[1]] * static_cast(0.0555555555556) - +in[(it[0]-3)*n+it[1]] * static_cast(-0.0555555555556); + h.parallel_for>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { + out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.16666666666666666) + +in[it[0]*n+(it[1]-1)] * static_cast(-0.16666666666666666) + +in[(it[0]+1)*n+it[1]] * static_cast(0.16666666666666666) + +in[(it[0]-1)*n+it[1]] * static_cast(-0.16666666666666666) + +in[it[0]*n+(it[1]+2)] * static_cast(0.08333333333333333) + +in[it[0]*n+(it[1]-2)] * static_cast(-0.08333333333333333) + +in[(it[0]+2)*n+it[1]] * static_cast(0.08333333333333333) + +in[(it[0]-2)*n+it[1]] * static_cast(-0.08333333333333333) + +in[it[0]*n+(it[1]+3)] * 
static_cast(0.05555555555555555) + +in[it[0]*n+(it[1]-3)] * static_cast(-0.05555555555555555) + +in[(it[0]+3)*n+it[1]] * static_cast(0.05555555555555555) + +in[(it[0]-3)*n+it[1]] * static_cast(-0.05555555555555555); }); }); } @@ -109,20 +120,20 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); - h.parallel_for(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); - out[xy] += +in[xy+dx1] * static_cast(0.166666666667) - +in[xy-dx1] * static_cast(-0.166666666667) - +in[xy+dy1] * static_cast(0.166666666667) - +in[xy-dy1] * static_cast(-0.166666666667) - +in[xy+dx2] * static_cast(0.0833333333333) - +in[xy-dx2] * static_cast(-0.0833333333333) - +in[xy+dy2] * static_cast(0.0833333333333) - +in[xy-dy2] * static_cast(-0.0833333333333) - +in[xy+dx3] * static_cast(0.0555555555556) - +in[xy-dx3] * static_cast(-0.0555555555556) - +in[xy+dy3] * static_cast(0.0555555555556) - +in[xy-dy3] * static_cast(-0.0555555555556); + out[xy] += +in[xy+dx1] * static_cast(0.16666666666666666) + +in[xy-dx1] * static_cast(-0.16666666666666666) + +in[xy+dy1] * static_cast(0.16666666666666666) + +in[xy-dy1] * static_cast(-0.16666666666666666) + +in[xy+dx2] * static_cast(0.08333333333333333) + +in[xy-dx2] * static_cast(-0.08333333333333333) + +in[xy+dy2] * static_cast(0.08333333333333333) + +in[xy-dy2] * static_cast(-0.08333333333333333) + +in[xy+dx3] * static_cast(0.05555555555555555) + +in[xy-dx3] * static_cast(-0.05555555555555555) + +in[xy+dy3] * static_cast(0.05555555555555555) + +in[xy-dy3] * static_cast(-0.05555555555555555); }); }); } @@ -133,7 +144,7 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: q.submit([&](cl::sycl::handler& h) 
{ auto in = d_in.template get_access(h); auto out = d_out.template get_access(h); - h.parallel_for(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.125) +in[it[0]*n+(it[1]-1)] * static_cast(-0.125) +in[(it[0]+1)*n+it[1]] * static_cast(0.125) @@ -142,10 +153,10 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: +in[it[0]*n+(it[1]-2)] * static_cast(-0.0625) +in[(it[0]+2)*n+it[1]] * static_cast(0.0625) +in[(it[0]-2)*n+it[1]] * static_cast(-0.0625) - +in[it[0]*n+(it[1]+3)] * static_cast(0.0416666666667) - +in[it[0]*n+(it[1]-3)] * static_cast(-0.0416666666667) - +in[(it[0]+3)*n+it[1]] * static_cast(0.0416666666667) - +in[(it[0]-3)*n+it[1]] * static_cast(-0.0416666666667) + +in[it[0]*n+(it[1]+3)] * static_cast(0.041666666666666664) + +in[it[0]*n+(it[1]-3)] * static_cast(-0.041666666666666664) + +in[(it[0]+3)*n+it[1]] * static_cast(0.041666666666666664) + +in[(it[0]-3)*n+it[1]] * static_cast(-0.041666666666666664) +in[it[0]*n+(it[1]+4)] * static_cast(0.03125) +in[it[0]*n+(it[1]-4)] * static_cast(-0.03125) +in[(it[0]+4)*n+it[1]] * static_cast(0.03125) @@ -168,7 +179,7 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); - h.parallel_for(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * static_cast(0.125) +in[xy-dx1] * static_cast(-0.125) @@ -178,10 +189,10 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c +in[xy-dx2] * static_cast(-0.0625) +in[xy+dy2] * static_cast(0.0625) 
+in[xy-dy2] * static_cast(-0.0625) - +in[xy+dx3] * static_cast(0.0416666666667) - +in[xy-dx3] * static_cast(-0.0416666666667) - +in[xy+dy3] * static_cast(0.0416666666667) - +in[xy-dy3] * static_cast(-0.0416666666667) + +in[xy+dx3] * static_cast(0.041666666666666664) + +in[xy-dx3] * static_cast(-0.041666666666666664) + +in[xy+dy3] * static_cast(0.041666666666666664) + +in[xy-dy3] * static_cast(-0.041666666666666664) +in[xy+dx4] * static_cast(0.03125) +in[xy-dx4] * static_cast(-0.03125) +in[xy+dy4] * static_cast(0.03125) @@ -196,7 +207,7 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: q.submit([&](cl::sycl::handler& h) { auto in = d_in.template get_access(h); auto out = d_out.template get_access(h); - h.parallel_for(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.1) +in[it[0]*n+(it[1]-1)] * static_cast(-0.1) +in[(it[0]+1)*n+it[1]] * static_cast(0.1) @@ -205,10 +216,10 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: +in[it[0]*n+(it[1]-2)] * static_cast(-0.05) +in[(it[0]+2)*n+it[1]] * static_cast(0.05) +in[(it[0]-2)*n+it[1]] * static_cast(-0.05) - +in[it[0]*n+(it[1]+3)] * static_cast(0.0333333333333) - +in[it[0]*n+(it[1]-3)] * static_cast(-0.0333333333333) - +in[(it[0]+3)*n+it[1]] * static_cast(0.0333333333333) - +in[(it[0]-3)*n+it[1]] * static_cast(-0.0333333333333) + +in[it[0]*n+(it[1]+3)] * static_cast(0.03333333333333333) + +in[it[0]*n+(it[1]-3)] * static_cast(-0.03333333333333333) + +in[(it[0]+3)*n+it[1]] * static_cast(0.03333333333333333) + +in[(it[0]-3)*n+it[1]] * static_cast(-0.03333333333333333) +in[it[0]*n+(it[1]+4)] * static_cast(0.025) +in[it[0]*n+(it[1]-4)] * static_cast(-0.025) +in[(it[0]+4)*n+it[1]] * static_cast(0.025) @@ -237,7 +248,7 @@ void star5(cl::sycl::queue & q, const size_t n, 
cl::sycl::buffer & d_in, c cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0}); cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5}); - h.parallel_for(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { cl::sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * static_cast(0.1) +in[xy-dx1] * static_cast(-0.1) @@ -247,10 +258,10 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c +in[xy-dx2] * static_cast(-0.05) +in[xy+dy2] * static_cast(0.05) +in[xy-dy2] * static_cast(-0.05) - +in[xy+dx3] * static_cast(0.0333333333333) - +in[xy-dx3] * static_cast(-0.0333333333333) - +in[xy+dy3] * static_cast(0.0333333333333) - +in[xy-dy3] * static_cast(-0.0333333333333) + +in[xy+dx3] * static_cast(0.03333333333333333) + +in[xy-dx3] * static_cast(-0.03333333333333333) + +in[xy+dy3] * static_cast(0.03333333333333333) + +in[xy-dy3] * static_cast(-0.03333333333333333) +in[xy+dx4] * static_cast(0.025) +in[xy-dx4] * static_cast(-0.025) +in[xy+dy4] * static_cast(0.025) From 2d7e4e8b251bc2522af53902c34b6f79c53e42a4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 4 Mar 2019 12:42:26 -0800 Subject: [PATCH 129/245] fix float template for 2D case --- Cxx11/transpose-sycl.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index 97bb8a09c..e7b1d94d2 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -75,8 +75,8 @@ void run(cl::sycl::queue & q, int iterations, size_t order) try { #if USE_2D_INDEXING - cl::sycl::buffer d_A( h_A.data(), cl::sycl::range<2>{order,order} ); - cl::sycl::buffer d_B( h_B.data(), cl::sycl::range<2>{order,order} ); + cl::sycl::buffer d_A( h_A.data(), cl::sycl::range<2>{order,order} ); + cl::sycl::buffer d_B( h_B.data(), cl::sycl::range<2>{order,order} ); #else cl::sycl::buffer d_A { 
h_A.data(), h_A.size() }; cl::sycl::buffer d_B { h_B.data(), h_B.size() }; From b2c63f6e132f6ff114a53fae4d816590ad879c97 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 4 Mar 2019 12:43:02 -0800 Subject: [PATCH 130/245] declare kernel name templates closer to usage --- Cxx11/generate-sycl-stencil.py | 7 ++----- Cxx11/stencil_sycl.hpp | 37 +++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index f0feb9e3f..d88cae37b 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -6,6 +6,8 @@ import os def codegen(src,pattern,stencil_size,radius,model,dim): + src.write('// declare the kernel name used in SYCL parallel_for\n') + src.write('template class '+pattern+str(radius)+'_'+str(dim)+'d;\n\n') src.write('template \n') src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n, ') if (dim==2): @@ -91,11 +93,6 @@ def instance(src,model,pattern,r): def main(): for model in ['sycl']: src = open('stencil_'+model+'.hpp','w') - for pattern in ['star']: - for r in range(1,6): - src.write('template class '+pattern+str(r)+'_1d;\n') - src.write('template class '+pattern+str(r)+'_2d;\n') - src.write('\n') #for pattern in ['star','grid']: for pattern in ['star']: for r in range(1,6): diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 799c86573..41412e5b4 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -1,13 +1,5 @@ +// declare the kernel name used in SYCL parallel_for template class star1_1d; -template class star1_2d; -template class star2_1d; -template class star2_2d; -template class star3_1d; -template class star3_2d; -template class star4_1d; -template class star4_2d; -template class star5_1d; -template class star5_2d; template void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) @@ -24,6 +16,9 @@ void star1(cl::sycl::queue & q, const 
size_t n, cl::sycl::buffer & d_in, cl:: }); } +// declare the kernel name used in SYCL parallel_for +template class star1_2d; + template void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { @@ -42,6 +37,9 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c }); } +// declare the kernel name used in SYCL parallel_for +template class star2_1d; + template void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { @@ -61,6 +59,9 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: }); } +// declare the kernel name used in SYCL parallel_for +template class star2_2d; + template void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { @@ -85,6 +86,9 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c }); } +// declare the kernel name used in SYCL parallel_for +template class star3_1d; + template void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { @@ -108,6 +112,9 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: }); } +// declare the kernel name used in SYCL parallel_for +template class star3_2d; + template void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { @@ -138,6 +145,9 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c }); } +// declare the kernel name used in SYCL parallel_for +template class star4_1d; + template void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { @@ -165,6 +175,9 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: }); } +// declare the kernel name used in SYCL parallel_for +template class star4_2d; + template void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { 
@@ -201,6 +214,9 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c }); } +// declare the kernel name used in SYCL parallel_for +template class star5_1d; + template void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { @@ -232,6 +248,9 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: }); } +// declare the kernel name used in SYCL parallel_for +template class star5_2d; + template void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) { From 7eeec67c9f978608197a292d19361311a9c5bf9d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 12 Feb 2019 17:25:08 -0800 Subject: [PATCH 131/245] do not incorrectly declare non-read-only buffers as read-only --- Cxx11/nstream-opencl.cc | 2 +- Cxx11/p2p-innerloop-opencl.cc | 2 +- Cxx11/stencil-opencl.cc | 2 +- Cxx11/transpose-opencl.cc | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc index 9d81f1b8d..40b76d4cc 100644 --- a/Cxx11/nstream-opencl.cc +++ b/Cxx11/nstream-opencl.cc @@ -93,7 +93,7 @@ void run(cl::Context context, int iterations, size_t length) std::vector h_c(length, T(2)); // copy input from host to device - cl::Buffer d_a = cl::Buffer(context, begin(h_a), end(h_a), true); + cl::Buffer d_a = cl::Buffer(context, begin(h_a), end(h_a), false); cl::Buffer d_b = cl::Buffer(context, begin(h_b), end(h_b), true); cl::Buffer d_c = cl::Buffer(context, begin(h_c), end(h_c), true); diff --git a/Cxx11/p2p-innerloop-opencl.cc b/Cxx11/p2p-innerloop-opencl.cc index 2552fe787..43bbefa28 100644 --- a/Cxx11/p2p-innerloop-opencl.cc +++ b/Cxx11/p2p-innerloop-opencl.cc @@ -93,7 +93,7 @@ void run(cl::Context context, int iterations, int n) } // copy input from host to device - cl::Buffer d_grid = cl::Buffer(context, begin(h_grid), end(h_grid), true); + cl::Buffer d_grid = cl::Buffer(context, begin(h_grid), end(h_grid), 
false); auto pipeline_time = 0.0; diff --git a/Cxx11/stencil-opencl.cc b/Cxx11/stencil-opencl.cc index 8db6adfa4..0b82a5e79 100644 --- a/Cxx11/stencil-opencl.cc +++ b/Cxx11/stencil-opencl.cc @@ -124,7 +124,7 @@ void run(cl::Context context, int iterations, int n, int radius, bool star) // copy input from host to device cl::Buffer d_in = cl::Buffer(context, begin(h_in), end(h_in), true); - cl::Buffer d_out = cl::Buffer(context, begin(h_out), end(h_out), true); + cl::Buffer d_out = cl::Buffer(context, begin(h_out), end(h_out), false); for (auto iter = 0; iter<=iterations; iter++) { diff --git a/Cxx11/transpose-opencl.cc b/Cxx11/transpose-opencl.cc index 4e22114d5..dc1186ff2 100644 --- a/Cxx11/transpose-opencl.cc +++ b/Cxx11/transpose-opencl.cc @@ -85,8 +85,8 @@ void run(cl::Context context, int iterations, int order) std::iota(h_a.begin(), h_a.end(), (T)0); // copy input from host to device - cl::Buffer d_a = cl::Buffer(context, begin(h_a), end(h_a), true); - cl::Buffer d_b = cl::Buffer(context, begin(h_b), end(h_b), true); + cl::Buffer d_a = cl::Buffer(context, begin(h_a), end(h_a), false); + cl::Buffer d_b = cl::Buffer(context, begin(h_b), end(h_b), false); auto trans_time = 0.0; From 725d5eebeef8a2c4fa9da17a5a5a57407c6d652d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 15 Mar 2019 15:14:15 -0700 Subject: [PATCH 132/245] remove Rust from parent makefile to unbreak case when cargo missing --- Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index e3be66f23..99d7ee2ae 100644 --- a/Makefile +++ b/Makefile @@ -72,14 +72,14 @@ help: @echo " \"make allfreaks\" (re-)builds the above four targets" @echo " optionally, specify \"matrix_rank= number_of_functions=\"" @echo " optionally, specify \"default_opt_flags=\"" - @echo " \"make allshared\" (re-)builds the shared-memory targets (C89, C1z, C++11, Fortran, RUST)" + @echo " \"make allshared\" (re-)builds the shared-memory targets (C89, C1z, C++11, Fortran)" @echo " 
\"make clean\" removes all objects and executables" @echo " \"make veryclean\" removes some generated source files as well" all: alldarwin allfreaks allshared alldarwin: allserial allopenmp allmpi1 allfgmpi allmpiopenmp allmpirma allshmem allmpishm allupc allfortran allfenix allfreaks: allcharm++ allampi allgrappa alllegion -allshared: allserial allopenmp allfortran allcxx allc1z allrust +allshared: allserial allopenmp allfortran allcxx allc1z allnew: allfortran allcxx allc1z allmpi1: @@ -332,7 +332,6 @@ clean: make -C FORTRAN clean make -C Cxx11 clean make -C C1z clean - make -C RUST clean rm -f stats.json veryclean: clean From 30a2c6f6b5c27f19f3f604ee9e85985fc83db334 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 15 Mar 2019 15:58:15 -0700 Subject: [PATCH 133/245] avoid overflow --- AMPI/Stencil/stencil.c | 2 +- CHARM++/Stencil/stencil.C | 2 +- FG_MPI/Stencil/stencil.c | 2 +- MPI1/Stencil/stencil.c | 2 +- MPIOPENMP/Stencil/stencil.c | 2 +- MPIRMA/Stencil/stencil.c | 2 +- MPISHM/Stencil/stencil.c | 2 +- SHMEM/Stencil/stencil.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/AMPI/Stencil/stencil.c b/AMPI/Stencil/stencil.c index 228853b24..6d79c5abe 100644 --- a/AMPI/Stencil/stencil.c +++ b/AMPI/Stencil/stencil.c @@ -88,7 +88,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006. 
#endif /* define shorthand for indexing multi-dimensional arrays with offsets */ -#define INDEXIN(i,j) (i+RADIUS+(j+RADIUS)*(width+2*RADIUS)) +#define INDEXIN(i,j) (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS)) /* need to add offset of RADIUS to j to account for ghost points */ #define IN(i,j) in[INDEXIN(i-istart,j-jstart)] #define INDEXOUT(i,j) (i+(j)*(width)) diff --git a/CHARM++/Stencil/stencil.C b/CHARM++/Stencil/stencil.C index 273198b3f..2dee692af 100644 --- a/CHARM++/Stencil/stencil.C +++ b/CHARM++/Stencil/stencil.C @@ -4,7 +4,7 @@ #define EPSILON 1.e-8 #define COEFX 1.0 #define COEFY 1.0 -#define INDEXIN(i,j) (i+RADIUS+(width+2*RADIUS)*(j+RADIUS)) +#define INDEXIN(i,j) (i+RADIUS+(long)(width+2*RADIUS)*(long)(j+RADIUS)) #define IN(i,j) in[INDEXIN(i-istart,j-jstart)] #define INDEXOUT(i,j) (i+width*(j)) #define OUT(i,j) out[INDEXOUT(i-istart,j-jstart)] diff --git a/FG_MPI/Stencil/stencil.c b/FG_MPI/Stencil/stencil.c index bee84ae1e..f346264a3 100644 --- a/FG_MPI/Stencil/stencil.c +++ b/FG_MPI/Stencil/stencil.c @@ -88,7 +88,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006. #endif /* define shorthand for indexing multi-dimensional arrays with offsets */ -#define INDEXIN(i,j) (i+RADIUS+(j+RADIUS)*(width+2*RADIUS)) +#define INDEXIN(i,j) (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS)) /* need to add offset of RADIUS to j to account for ghost points */ #define IN(i,j) in[INDEXIN(i-istart,j-jstart)] #define INDEXOUT(i,j) (i+(j)*(width)) diff --git a/MPI1/Stencil/stencil.c b/MPI1/Stencil/stencil.c index 2f33fe5e9..2417b1e00 100644 --- a/MPI1/Stencil/stencil.c +++ b/MPI1/Stencil/stencil.c @@ -88,7 +88,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006. 
#endif /* define shorthand for indexing multi-dimensional arrays with offsets */ -#define INDEXIN(i,j) (i+RADIUS+(j+RADIUS)*(width+2*RADIUS)) +#define INDEXIN(i,j) (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS)) /* need to add offset of RADIUS to j to account for ghost points */ #define IN(i,j) in[INDEXIN(i-istart,j-jstart)] #define INDEXOUT(i,j) (i+(j)*(width)) diff --git a/MPIOPENMP/Stencil/stencil.c b/MPIOPENMP/Stencil/stencil.c index 54b614d52..a80200030 100644 --- a/MPIOPENMP/Stencil/stencil.c +++ b/MPIOPENMP/Stencil/stencil.c @@ -88,7 +88,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006. #endif /* define shorthand for indexing multi-dimensional arrays with offsets */ -#define INDEXIN(i,j) (i+RADIUS+(j+RADIUS)*(width+2*RADIUS)) +#define INDEXIN(i,j) (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS)) /* need to add offset of RADIUS to j to account for ghost points */ #define IN(i,j) in[INDEXIN(i-istart,j-jstart)] #define INDEXOUT(i,j) (i+(j)*(width)) diff --git a/MPIRMA/Stencil/stencil.c b/MPIRMA/Stencil/stencil.c index a2c320af2..49d63e85e 100644 --- a/MPIRMA/Stencil/stencil.c +++ b/MPIRMA/Stencil/stencil.c @@ -88,7 +88,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006. #endif /* define shorthand for indexing multi-dimensional arrays with offsets */ -#define INDEXIN(i,j) (i+RADIUS+(j+RADIUS)*(width+2*RADIUS)) +#define INDEXIN(i,j) (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS)) /* need to add offset of RADIUS to j to account for ghost points */ #define IN(i,j) in[INDEXIN(i-istart,j-jstart)] #define INDEXOUT(i,j) (i+(j)*(width)) diff --git a/MPISHM/Stencil/stencil.c b/MPISHM/Stencil/stencil.c index 5f2b92242..a7593680c 100644 --- a/MPISHM/Stencil/stencil.c +++ b/MPISHM/Stencil/stencil.c @@ -148,7 +148,7 @@ HISTORY: - Written by Rob Van der Wijngaart, November 2006. 
#endif /* define shorthand for indexing multi-dimensional arrays with offsets */ -#define INDEXIN(i,j) (i+RADIUS+(j+RADIUS)*(width+2*RADIUS)) +#define INDEXIN(i,j) (i+RADIUS+(long)(j+RADIUS)*(long)(width+2*RADIUS)) /* need to add offset of RADIUS to j to account for ghost points */ #define IN(i,j) in[INDEXIN(i-istart,j-jstart)] #define INDEXOUT(i,j) (i+(j)*(width)) diff --git a/SHMEM/Stencil/stencil.c b/SHMEM/Stencil/stencil.c index 400e4b1f3..4e2b35376 100644 --- a/SHMEM/Stencil/stencil.c +++ b/SHMEM/Stencil/stencil.c @@ -77,7 +77,7 @@ HISTORY: - Written by Tom St. John, July 2015. #endif /* define shorthand for indexing multi-dimensional arrays with offsets */ -#define INDEXIN(i,j) (i+RADIUS+(j+RADIUS)*(width[0]+2*RADIUS)) +#define INDEXIN(i,j) (i+RADIUS+(long)(j+RADIUS)*(long)(width[0]+2*RADIUS)) /* need to add offset of RADIUS to j to account for ghost points */ #define IN(i,j) in[INDEXIN(i-istart,j-jstart)] #define INDEXOUT(i,j) (i+(j)*(width[0])) From d4ca82f39d0fcd177c9a8dd14b88e23aace8a1b7 Mon Sep 17 00:00:00 2001 From: Christian Trott Date: Thu, 4 Apr 2019 10:49:23 -0700 Subject: [PATCH 134/245] Cxx11 nstream-kokkos: add missing fences There are fences missing hence you won't measure what you think on asynchronous backends such as CUDA or HPX. This also fixes using the actual name of the exec space instead of typeid.
Example for CUDA on V100: Original: Parallel Research Kernels version 2.16 C++11/Kokkos STREAM triad: A = B + scalar * C Number of iterations = 1 Vector length = 100000000 Offset = 0 Kokkos execution space: N6Kokkos4CudaE Solution validates Rate (MB/s): 422188 Avg time (s): 0.00757957 With fences (and name fix): Parallel Research Kernels version 2.16 C++11/Kokkos STREAM triad: A = B + scalar * C Number of iterations = 1 Vector length = 100000000 Offset = 0 Kokkos execution space: Cuda Solution validates Rate (MB/s): 842600 Avg time (s): 0.00379777 --- Cxx11/nstream-kokkos.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc index d03a47207..9e0af56bd 100644 --- a/Cxx11/nstream-kokkos.cc +++ b/Cxx11/nstream-kokkos.cc @@ -118,7 +118,7 @@ int main(int argc, char * argv[]) std::cout << "Number of iterations = " << iterations << std::endl; std::cout << "Vector length = " << length << std::endl; std::cout << "Offset = " << offset << std::endl; - std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl; + std::cout << "Kokkos execution space: " << Kokkos::DefaultExecutionSpace::name() << std::endl; ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation @@ -138,15 +138,19 @@ int main(int argc, char * argv[]) B[i] = 2.0; C[i] = 2.0; }); - + Kokkos::fence(); for (int iter = 0; iter<=iterations; ++iter) { - if (iter==1) nstream_time = prk::wtime(); + if (iter==1) { + Kokkos::fence(); + nstream_time = prk::wtime(); + } Kokkos::parallel_for(length, KOKKOS_LAMBDA(size_t const i) { A[i] += B[i] + scalar * C[i]; }); } + Kokkos::fence(); nstream_time = prk::wtime() - nstream_time; } From ac5a44bdaa52a066097f47bad22f4648f06aed64 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 7 Apr 2019 20:32:08 -0700 Subject: [PATCH 135/245] fix how BLAS linked in Fortran --- FORTRAN/Makefile | 9 +++++---- 
common/make.defs.gcc | 4 +++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 4d479881c..551c8fc36 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -14,9 +14,7 @@ FCFLAGS = $(DEFAULT_OPT_FLAGS) FCFLAGS += -DRADIUS=$(RADIUS) $(STARFLAG) ifeq ($(findstring ifort,$(FC)),ifort) - BLASFLAGS = $(CBLASFLAG) -heap-arrays -else - BLASFLAGS = $(CBLASFLAG) + BLASFLAGS += -heap-arrays endif .PHONY: all clean serial pretty openmp coarray target ornlacc @@ -68,9 +66,12 @@ stencil: stencil.f90 stencil_serial.f90 #$(FC) $(FCFLAGS) -c stencil_serial.f90 -o stencil_serial.o $(FC) $(FCFLAGS) $< -o $@ -%-pretty: %-pretty.f90 +dgemm-pretty: dgemm-pretty.f90 $(FC) $(FCFLAGS) $(BLASFLAGS) $< -o $@ +%-pretty: %-pretty.f90 + $(FC) $(FCFLAGS) $< -o $@ + %-openmp: %.f90 $(FC) $(FCFLAGS) $(OPENMPFLAG) $< -o $@ diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 5f3f62f03..89a0a1b0e 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -92,15 +92,17 @@ THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
SYCLDIR=./triSYCL SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG} -SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} +SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL # ProGTX # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx #SYCLCXX=${CXX} ${OPENMPFLAG} #SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +SYCLFLAG+=${RANGEFLAG} # # CBLAS for C++ DGEMM # +BLASFLAG=-DACCELERATE -framework Accelerate CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # # CUDA flags From 432ac49bd2ba28c01e90909613dc45d633cf4a2f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 7 Apr 2019 20:34:50 -0700 Subject: [PATCH 136/245] range-based TBB parallel_for --- Cxx11/nstream-vector-tbb.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/Cxx11/nstream-vector-tbb.cc b/Cxx11/nstream-vector-tbb.cc index 0fbc777c2..0859e0654 100644 --- a/Cxx11/nstream-vector-tbb.cc +++ b/Cxx11/nstream-vector-tbb.cc @@ -126,6 +126,7 @@ int main(int argc, char * argv[]) tbb::blocked_range range(0, length); { +#if 0 tbb::parallel_for( range, [&](decltype(range)& r) { for (auto i=r.begin(); i!=r.end(); ++i ) { A[i] = 0.0; @@ -133,16 +134,29 @@ int main(int argc, char * argv[]) C[i] = 2.0; } }, tbb_partitioner); +#else + tbb::parallel_for( std::begin(range), std::end(range), [&](size_t i) { + A[i] = 0.0; + B[i] = 2.0; + C[i] = 2.0; + }, tbb_partitioner); +#endif for (auto iter = 0; iter<=iterations; iter++) { if (iter==1) nstream_time = prk::wtime(); +#if 0 tbb::parallel_for( range, [&](decltype(range)& r) { for (auto i=r.begin(); i!=r.end(); ++i ) { A[i] += B[i] + scalar * C[i]; } }, tbb_partitioner); +#else + tbb::parallel_for( std::begin(range), std::end(range), [&](size_t i) { + A[i] += B[i] + scalar * C[i]; + }, tbb_partitioner); +#endif } nstream_time = prk::wtime() - nstream_time; } From 06c3bffb7cf1fcd783f1db39ee5c0cfc5de31a9f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 7 Apr 2019 
20:35:35 -0700 Subject: [PATCH 137/245] show but do not enable non-range-based for in RAJA --- Cxx11/nstream-raja.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cxx11/nstream-raja.cc b/Cxx11/nstream-raja.cc index c98dae978..ef7b6c08e 100644 --- a/Cxx11/nstream-raja.cc +++ b/Cxx11/nstream-raja.cc @@ -133,6 +133,7 @@ int main(int argc, char * argv[]) double scalar(3); { + //RAJA::forall(0, length, [=](RAJA::Index_type i) { RAJA::forall(range, [=](RAJA::Index_type i) { A(i) = 0.0; B(i) = 2.0; @@ -143,6 +144,7 @@ int main(int argc, char * argv[]) if (iter==1) nstream_time = prk::wtime(); + //RAJA::forall(0, length, [=](RAJA::Index_type i) { RAJA::forall(range, [=](RAJA::Index_type i) { A(i) += B(i) + scalar * C(i); }); From 7f0e0ff264c994ea070230c188bd36f4c1198e5e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Apr 2019 17:45:09 -0500 Subject: [PATCH 138/245] fix issues with Thrust when not using NVCC --- Cxx11/nstream-host-thrust.cc | 2 -- Cxx11/prk_thrust.h | 14 +++++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Cxx11/nstream-host-thrust.cc b/Cxx11/nstream-host-thrust.cc index c06c89108..ac82f33d3 100644 --- a/Cxx11/nstream-host-thrust.cc +++ b/Cxx11/nstream-host-thrust.cc @@ -115,8 +115,6 @@ int main(int argc, char * argv[]) thrust::host_vector B(length); thrust::host_vector C(length); - auto range = prk::range(static_cast(0), length); - double scalar(3); { thrust::fill(thrust::host, A.begin(), A.end(), 0.0); diff --git a/Cxx11/prk_thrust.h b/Cxx11/prk_thrust.h index 4ffd50c34..1d733bf67 100644 --- a/Cxx11/prk_thrust.h +++ b/Cxx11/prk_thrust.h @@ -35,16 +35,24 @@ #ifdef USE_THRUST # ifdef __NVCC__ # include +# elif defined(_OPENMP) +#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_OMP +# include +//# include +#else +#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_CPP +# include +//# include # endif # include # include -# include -# include # include # include +# include +# include +# include # include # include -# 
include #endif #endif /* PRK_THRUST_H */ From 0c920eeb627402d90ee243bca2fb8222a090100e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Apr 2019 17:52:37 -0500 Subject: [PATCH 139/245] update examples for Thrust changes --- common/make.defs.cuda | 66 ++++++++++++++++++++++++++++++++++++++++-- common/make.defs.gcc | 15 ++++++---- common/make.defs.intel | 4 +-- common/make.defs.llvm | 5 ++-- 4 files changed, 77 insertions(+), 13 deletions(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 48b85710d..0f5fafb75 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -6,24 +6,37 @@ # VERSION=-7 # C99 is required in some implementations. -CC=gcc${VERSION} -std=gnu99 +CC=gcc${VERSION} -std=gnu11 +#EXTRA_CLIBS=-lrt # All of the Fortran code is written for the 2008 standard and requires preprocessing. FC=gfortran${VERSION} -std=f2008 -cpp # C++11 may not be required but does no harm here. -CXX=g++${VERSION} -std=gnu++11 +CXX=g++${VERSION} -std=gnu++17 # # Compiler flags # # -mtune=native is appropriate for most cases. # -march=native is appropriate if you want portable binaries. -DEFAULT_OPT_FLAGS=-g -O3 -mtune=native +#DEFAULT_OPT_FLAGS=-O3 -mtune=native -ffast-math +DEFAULT_OPT_FLAGS=-O0 +DEFAULT_OPT_FLAGS+=-g3 +#DEFAULT_OPT_FLAGS+=-fsanitize=undefined +#DEFAULT_OPT_FLAGS+=-fsanitize=undefined,leak +#DEFAULT_OPT_FLAGS+=-fsanitize=address +#DEFAULT_OPT_FLAGS+=-fsanitize=thread # If you are compiling for KNL on a Xeon login node, use the following: # DEFAULT_OPT_FLAGS=-g -O3 -march=knl # See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details. 
# +#DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed +DEFAULT_OPT_FLAGS+=-Wall #-Werror +DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations +#DEFAULT_OPT_FLAGS+=-mavx -mfma +# # OpenMP flags # OPENMPFLAG=-fopenmp +OPENMPSIMDFLAG=-fopenmp-simd OFFLOADFLAG=-foffload="-O3 -v" ORNLACCFLAG=-fopenacc # @@ -35,6 +48,53 @@ ORNLACCFLAG=-fopenacc #OPENCLDIR=/etc/alternatives/opencl-intel-tools #OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL OPENCLFLAG=-I/usr/include -L/usr/lib/x86_64-linux-gnu -lOpenCL +OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations +METALFLAG=-framework MetalPerformanceShaders +# +# OCCA +# +#OCCADIR=${HOME}/prk-repo/Cxx11/occa +# +# Cilk +# +#CILKFLAG=-fcilkplus +# +# TBB +# +TBBDIR=/usr/local/Cellar/tbb/2019_U5_1 +TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +# +# Parallel STL, Boost, etc. +# +BOOSTFLAG=-I/usr/local/Cellar/boost/1.69.0_2/include +RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} +KOKKOSDIR=/opt/kokkos/gcc +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} +RAJADIR=/opt/raja/gcc +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +THRUSTDIR=/opt/nvidia/thrust +THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# +# SYCL flags +# +# triSYCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
+SYCLDIR=./triSYCL +SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG} +SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL +# ProGTX +# https://github.com/ProGTX/sycl-gtx +#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx +#SYCLCXX=${CXX} ${OPENMPFLAG} +#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +SYCLFLAG+=${RANGEFLAG} +# +# CBLAS for C++ DGEMM +# +BLASFLAG=-DACCELERATE -framework Accelerate +CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # # CUDA flags # diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 89a0a1b0e..50b7a572a 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -17,10 +17,13 @@ CXX=g++${VERSION} -std=gnu++17 -pthread # # -mtune=native is appropriate for most cases. # -march=native is appropriate if you want portable binaries. -DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math -#DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math -fsanitize=undefined,leak -#DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math -fsanitize=address -#DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math -fsanitize=thread +DEFAULT_OPT_FLAGS=-O3 -mtune=native -ffast-math +#DEFAULT_OPT_FLAGS=-O0 +DEFAULT_OPT_FLAGS+=-g3 +#DEFAULT_OPT_FLAGS+=-fsanitize=undefined +#DEFAULT_OPT_FLAGS+=-fsanitize=undefined,leak +#DEFAULT_OPT_FLAGS+=-fsanitize=address +#DEFAULT_OPT_FLAGS+=-fsanitize=thread # If you are compiling for KNL on a Xeon login node, use the following: # DEFAULT_OPT_FLAGS=-g -O3 -march=knl # See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details. 
@@ -84,7 +87,7 @@ KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} RAJADIR=/opt/raja/gcc RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust -THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP +THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # # SYCL flags # @@ -97,7 +100,7 @@ SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx #SYCLCXX=${CXX} ${OPENMPFLAG} -#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} SYCLFLAG+=${RANGEFLAG} # # CBLAS for C++ DGEMM diff --git a/common/make.defs.intel b/common/make.defs.intel index 17a4c2833..664d79e0c 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -76,8 +76,8 @@ KOKKOSDIR=/opt/kokkos/intel KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl RAJADIR=/opt/raja/intel RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} -#THRUSTDIR=/opt/nvidia/thrust -#THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP +THRUSTDIR=/opt/nvidia/thrust +THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # # CBLAS for C++ DGEMM # diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 4929aa0bb..fda35f476 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -105,11 +105,12 @@ KOKKOSDIR=/opt/kokkos/clang KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl RAJADIR=/opt/raja/clang RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} -#THRUSTDIR=/opt/nvidia/thrust -#THRUSTFLAG=-I${THRUSTDIR} -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP +THRUSTDIR=/opt/nvidia/thrust +THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # # CBLAS for C++ DGEMM # +BLASFLAG=-DACCELERATE -framework Accelerate 
CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions # # CUDA flags From 27fff516c43653facef1ff3a12643ee19220c3d6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Apr 2019 17:53:10 -0500 Subject: [PATCH 140/245] switch Thrust to use PRK range wrapper --- Cxx11/transpose-device-thrust.cu | 2 +- Cxx11/transpose-host-thrust.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/transpose-device-thrust.cu b/Cxx11/transpose-device-thrust.cu index b4c9a1874..907f45e94 100644 --- a/Cxx11/transpose-device-thrust.cu +++ b/Cxx11/transpose-device-thrust.cu @@ -128,7 +128,7 @@ int main(int argc, char * argv[]) thrust::sequence(thrust::device, A.begin(), A.end() ); thrust::fill(thrust::device, B.begin(), B.end(), 0.0); - auto range = boost::irange(0,order); + auto range = prk::range(0,order); auto trans_time = 0.0; diff --git a/Cxx11/transpose-host-thrust.cc b/Cxx11/transpose-host-thrust.cc index 11482700a..07065b7e8 100644 --- a/Cxx11/transpose-host-thrust.cc +++ b/Cxx11/transpose-host-thrust.cc @@ -100,7 +100,7 @@ int main(int argc, char * argv[]) thrust::sequence(thrust::host, A.begin(), A.end() ); thrust::fill(thrust::host, B.begin(), B.end(), 0.0); - auto range = boost::irange(0,order); + auto range = prk::range(0,order); auto trans_time = 0.0; From 4447d97cff7f75a543e78ea517209f6ea4b84eef Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 19 Apr 2019 16:50:46 -0700 Subject: [PATCH 141/245] not yet working prk::vector --- Cxx11/Makefile | 2 +- Cxx11/prk_util.h | 107 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 65875fba4..f71b2a568 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -81,7 +81,7 @@ transpose: transpose-valarray transpose-vector transpose-vector-async transpose- transpose-vector-taskloop transpose-vector-stl transpose-vector-pstl transpose-vector-raja \ transpose-vector-rangefor transpose-vector-tbb 
transpose-vector-thread transpose-kokkos transpose-opencl -nstream: nstream-valarray nstream-vector nstream-openmp nstream-openmp-target \ +#nstream: nstream-valarray nstream-vector nstream-openmp nstream-openmp-target \ nstream-vector-taskloop nstream-vector-stl nstream-vector-pstl nstream-vector-raja \ nstream-vector-rangefor nstream-vector-tbb nstream-kokkos nstream-opencl diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index b25dccdf6..0062ba66d 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -84,6 +84,85 @@ namespace prk { + int get_alignment(void) + { + /* a := alignment */ +#ifdef PRK_ALIGNMENT + int a = PRK_ALIGNMENT; +#else + char* temp = getenv("PRK_ALIGNMENT"); + int a = (temp!=NULL) ? atoi(temp) : 64; + if (a < 8) a = 8; + assert( (a & (~a+1)) == a ); /* is power of 2? */ +#endif + return a; + } + +#if defined(__INTEL_COMPILER) + + template + T * malloc(size_t bytes) + { + const int alignment = prk::get_alignment(); + return (T*)_mm_malloc( bytes, alignment); + } + + template + void free(T * p) + { + _mm_free(p); + } + +#else // !__INTEL_COMPILER + + template + void * malloc(size_t bytes) + { + const int alignment = prk_get_alignment(); + + // We cannot use C11 aligned_alloc on Mac. + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69680 */ + // GCC claims to be C11 without knowing if glibc is compliant... +#if !defined(__GNUC__) && \ + !defined(__APPLE__) && \ + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) + + // From ISO C11: + // + // "The aligned_alloc function allocates space for an object + // whose alignment is specified by alignment, whose size is + // specified by size, and whose value is indeterminate. + // The value of alignment shall be a valid alignment supported + // by the implementation and the value of size shall be an + // integral multiple of alignment." + // + // Thus, if we do not round up the bytes to be a multiple + // of the alignment, we violate ISO C. 
+ + size_t padded = bytes; + size_t excess = bytes % alignment; + if (excess>0) padded += (alignment - excess); + return aligned_alloc(alignment,padded); + +#else + + T * ptr = NULL; + int ret = posix_memalign(&ptr,alignment,bytes); + if (ret!=0) ptr = NULL; + return ptr; + +#endif + + } + + template + void free(void * p) + { + free(p); + } + +#endif // __INTEL_COMPILER + template const T reduce(I first, I last, T init) { #if (defined(__cplusplus) && (__cplusplus >= 201703L)) && !defined(__GNUC__) @@ -100,6 +179,34 @@ namespace prk { #endif } + template + class vector { + + private: + T * data_; + size_t size_; + + public: + + vector(size_t n) { + this->data_ = prk::malloc(n); + } + + vector(size_t n, T v) { + this->data_ = prk::malloc(n); + for (size_t i=0; idata_[i] = v; + } + + ~vector() { + prk::free(this->data_); + } + + T & operator[] (size_t n) { + return this->data_[n]; + } + + }; + static inline double wtime(void) { #if defined(USE_OPENMP) && defined(_OPENMP) From 7bde7d95257de6950e0254e78009155a2b6cf6b5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 14:44:52 -0700 Subject: [PATCH 142/245] fix prk::vector --- Cxx11/prk_util.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 0062ba66d..889e7e87d 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -101,9 +101,10 @@ namespace prk { #if defined(__INTEL_COMPILER) template - T * malloc(size_t bytes) + T * malloc(size_t n) { const int alignment = prk::get_alignment(); + const size_t bytes = n * sizeof(T); return (T*)_mm_malloc( bytes, alignment); } @@ -189,15 +190,18 @@ namespace prk { public: vector(size_t n) { + //this->data_ = new T[n]; this->data_ = prk::malloc(n); } vector(size_t n, T v) { + //this->data_ = new T[n]; this->data_ = prk::malloc(n); for (size_t i=0; idata_[i] = v; } ~vector() { + //delete[] this->data_; prk::free(this->data_); } From 8f344f76b878f59abe3d571606ba22144694ac40 Mon Sep 17 00:00:00 2001 From: 
Jeff Hammond Date: Sat, 20 Apr 2019 14:45:00 -0700 Subject: [PATCH 143/245] eliminate rule conflict --- Cxx11/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index f71b2a568..3953c4d80 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -68,16 +68,16 @@ endif all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA) -p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \ +#p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ p2p-hyperplane-sycl p2p-hyperplane-vector-ornlacc p2p-tasks-tbb -stencil: stencil-valarray stencil-vector stencil-vector-async stencil-openmp stencil-openmp-target \ +#stencil: stencil-valarray stencil-vector stencil-vector-async stencil-openmp stencil-openmp-target \ stencil-vector-taskloop stencil-vector-stl stencil-vector-pstl stencil-vector-raja \ stencil-vector-rangefor stencil-vector-tbb stencil-vector-thread stencil-kokkos stencil-opencl \ stencil-cuda -transpose: transpose-valarray transpose-vector transpose-vector-async transpose-openmp transpose-openmp-target \ +#transpose: transpose-valarray transpose-vector transpose-vector-async transpose-openmp transpose-openmp-target \ transpose-vector-taskloop transpose-vector-stl transpose-vector-pstl transpose-vector-raja \ transpose-vector-rangefor transpose-vector-tbb transpose-vector-thread transpose-kokkos transpose-opencl @@ -85,7 +85,7 @@ transpose: transpose-valarray transpose-vector transpose-vector-async transpose- nstream-vector-taskloop nstream-vector-stl nstream-vector-pstl nstream-vector-raja \ nstream-vector-rangefor nstream-vector-tbb nstream-kokkos nstream-opencl -dgemm: dgemm-vector dgemm-cblas dgemm-cublas +#dgemm: dgemm-vector dgemm-cblas dgemm-cublas vector: p2p-vector 
p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector sparse-vector dgemm-vector \ transpose-vector-async transpose-vector-thread From bc3e6b159499bb4c6bca5baead38aaefe9b83df4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 21:34:00 -0700 Subject: [PATCH 144/245] switch from std::vector to prk::vector --- Cxx11/nstream-vector-tbb.cc | 6 +++--- Cxx11/p2p-hyperplane-vector-tbb.cc | 2 +- Cxx11/p2p-innerloop-vector-tbb.cc | 2 +- Cxx11/p2p-vector-tbb.cc | 4 ++-- Cxx11/stencil-vector-tbb.cc | 6 +++--- Cxx11/stencil_tbb.hpp | 20 ++++++++++---------- Cxx11/transpose-vector-tbb.cc | 4 ++-- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Cxx11/nstream-vector-tbb.cc b/Cxx11/nstream-vector-tbb.cc index 0859e0654..2c507af18 100644 --- a/Cxx11/nstream-vector-tbb.cc +++ b/Cxx11/nstream-vector-tbb.cc @@ -117,9 +117,9 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector A(length); - std::vector B(length); - std::vector C(length); + prk::vector A(length); + prk::vector B(length); + prk::vector C(length); double scalar(3); diff --git a/Cxx11/p2p-hyperplane-vector-tbb.cc b/Cxx11/p2p-hyperplane-vector-tbb.cc index 9c523a369..426006182 100644 --- a/Cxx11/p2p-hyperplane-vector-tbb.cc +++ b/Cxx11/p2p-hyperplane-vector-tbb.cc @@ -123,7 +123,7 @@ int main(int argc, char* argv[]) auto pipeline_time = 0.0; // silence compiler warning - std::vector grid(n*n,0.0); + prk::vector grid(n*n,0.0); // set boundary values (bottom and left side of grid) for (auto j=0; j grid(n*n,0.0); + prk::vector grid(n*n,0.0); // set boundary values (bottom and left side of grid) for (auto j=0; j & grid) +void SequentialSweep(int m, int n, prk::vector & grid) { for (auto i=1; i grid(m*n,0.0); + prk::vector grid(m*n,0.0); // set boundary values (bottom and left side of grid) for (auto j=0; j & in, std::vector & out) +void nothing(const int n, const int t, prk::vector & in, prk::vector & out) { std::cout << "You are trying to use a stencil that 
does not exist." << std::endl; std::cout << "Please generate the new stencil using the code generator." << std::endl; @@ -170,8 +170,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in(n*n); - std::vector out(n*n); + prk::vector in(n*n); + prk::vector out(n*n); tbb::blocked_range2d range(0, n, tile_size, 0, n, tile_size); tbb::parallel_for( range, [&](decltype(range)& r) { diff --git a/Cxx11/stencil_tbb.hpp b/Cxx11/stencil_tbb.hpp index 7b68173a9..edc168be2 100644 --- a/Cxx11/stencil_tbb.hpp +++ b/Cxx11/stencil_tbb.hpp @@ -1,4 +1,4 @@ -void star1(const int n, const int t, std::vector & in, std::vector & out) { +void star1(const int n, const int t, prk::vector & in, prk::vector & out) { tbb::blocked_range2d range(1, n-1, t, 1, n-1, t); tbb::parallel_for( range, [&](decltype(range)& r ) { for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { @@ -13,7 +13,7 @@ void star1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void star2(const int n, const int t, prk::vector & in, prk::vector & out) { tbb::blocked_range2d range(2, n-2, t, 2, n-2, t); tbb::parallel_for( range, [&](decltype(range)& r ) { for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { @@ -32,7 +32,7 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void star3(const int n, const int t, prk::vector & in, prk::vector & out) { tbb::blocked_range2d range(3, n-3, t, 3, n-3, t); tbb::parallel_for( range, [&](decltype(range)& r ) { for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { @@ -55,7 +55,7 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void star4(const int n, const int t, prk::vector & in, prk::vector & out) { tbb::blocked_range2d range(4, n-4, t, 4, n-4, t); tbb::parallel_for( range, [&](decltype(range)& r ) { for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { @@ -82,7 +82,7 @@ void star4(const int n, const int t, std::vector & 
in, std::vector & in, std::vector & out) { +void star5(const int n, const int t, prk::vector & in, prk::vector & out) { tbb::blocked_range2d range(5, n-5, t, 5, n-5, t); tbb::parallel_for( range, [&](decltype(range)& r ) { for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { @@ -113,7 +113,7 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void grid1(const int n, const int t, prk::vector & in, prk::vector & out) { tbb::blocked_range2d range(1, n-1, t, 1, n-1, t); tbb::parallel_for( range, [&](decltype(range)& r ) { for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { @@ -131,7 +131,7 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void grid2(const int n, const int t, prk::vector & in, prk::vector & out) { tbb::blocked_range2d range(2, n-2, t, 2, n-2, t); tbb::parallel_for( range, [&](decltype(range)& r ) { for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { @@ -163,7 +163,7 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void grid3(const int n, const int t, prk::vector & in, prk::vector & out) { tbb::blocked_range2d range(3, n-3, t, 3, n-3, t); tbb::parallel_for( range, [&](decltype(range)& r ) { for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { @@ -217,7 +217,7 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void grid4(const int n, const int t, prk::vector & in, prk::vector & out) { tbb::blocked_range2d range(4, n-4, t, 4, n-4, t); tbb::parallel_for( range, [&](decltype(range)& r ) { for (auto i=r.rows().begin(); i!=r.rows().end(); ++i ) { @@ -301,7 +301,7 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void grid5(const int n, const int t, prk::vector & in, prk::vector & out) { tbb::blocked_range2d range(5, n-5, t, 5, n-5, t); tbb::parallel_for( range, [&](decltype(range)& r ) { for (auto 
i=r.rows().begin(); i!=r.rows().end(); ++i ) { diff --git a/Cxx11/transpose-vector-tbb.cc b/Cxx11/transpose-vector-tbb.cc index d154677fd..8e31954b6 100644 --- a/Cxx11/transpose-vector-tbb.cc +++ b/Cxx11/transpose-vector-tbb.cc @@ -112,8 +112,8 @@ int main(int argc, char * argv[]) auto trans_time = 0.0; - std::vector A(order*order); - std::vector B(order*order); + prk::vector A(order*order); + prk::vector B(order*order); tbb::blocked_range2d range(0, order, tile_size, 0, order, tile_size); tbb::parallel_for( range, [&](decltype(range)& r) { From 5cf87f75247f3962123fb5acb7571b2e786a12a2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 21:48:53 -0700 Subject: [PATCH 145/245] use prk::vector instead of std::vector --- Cxx11/transpose-cblas.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cxx11/transpose-cblas.cc b/Cxx11/transpose-cblas.cc index 9f7f17b07..6add9f18d 100644 --- a/Cxx11/transpose-cblas.cc +++ b/Cxx11/transpose-cblas.cc @@ -105,9 +105,9 @@ int main(int argc, char * argv[]) auto trans_time = 0.0; - std::vector A(order*order); - std::vector B(order*order,0.0); - std::vector T(order*order); + prk::vector A(order*order); + prk::vector B(order*order,0.0); + prk::vector T(order*order); double one[1] = {1.0}; // fill A with the sequence 0 to order^2-1 as doubles From fd46cfdc07c50b7b4095bc561517f7fd6a3981ce Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 21:49:22 -0700 Subject: [PATCH 146/245] use prk::vector instead of std::vector --- Cxx11/nstream-vector-rangefor.cc | 6 +++--- Cxx11/stencil-vector-rangefor.cc | 6 +++--- Cxx11/stencil_rangefor.hpp | 20 ++++++++++---------- Cxx11/transpose-vector-rangefor.cc | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Cxx11/nstream-vector-rangefor.cc b/Cxx11/nstream-vector-rangefor.cc index 2bdadea3d..56948aaa0 100644 --- a/Cxx11/nstream-vector-rangefor.cc +++ b/Cxx11/nstream-vector-rangefor.cc @@ -112,9 +112,9 @@ int main(int argc, char 
* argv[]) auto nstream_time = 0.0; - std::vector A(length,0.0); - std::vector B(length,2.0); - std::vector C(length,2.0); + prk::vector A(length,0.0); + prk::vector B(length,2.0); + prk::vector C(length,2.0); auto range = prk::range(0,length); diff --git a/Cxx11/stencil-vector-rangefor.cc b/Cxx11/stencil-vector-rangefor.cc index 040bde745..4ec0eb06e 100644 --- a/Cxx11/stencil-vector-rangefor.cc +++ b/Cxx11/stencil-vector-rangefor.cc @@ -63,7 +63,7 @@ #include "prk_util.h" #include "stencil_seq.hpp" -void nothing(const int n, const int t, std::vector & in, std::vector & out) +void nothing(const int n, const int t, prk::vector & in, prk::vector & out) { std::cout << "You are trying to use a stencil that does not exist.\n"; std::cout << "Please generate the new stencil using the code generator\n"; @@ -164,8 +164,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in(n*n); - std::vector out(n*n); + prk::vector in(n*n); + prk::vector out(n*n); // initialize the input and output arrays auto range = prk::range(0,n); diff --git a/Cxx11/stencil_rangefor.hpp b/Cxx11/stencil_rangefor.hpp index f1ecb729e..85b23d342 100644 --- a/Cxx11/stencil_rangefor.hpp +++ b/Cxx11/stencil_rangefor.hpp @@ -1,4 +1,4 @@ -void star1(const int n, const int t, std::vector & in, std::vector & out) { +void star1(const int n, const int t, prk::vector & in, prk::vector & out) { auto inside = prk::range(1,n-1); for (auto i : inside) { PRAGMA_SIMD @@ -13,7 +13,7 @@ void star1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void star2(const int n, const int t, prk::vector & in, prk::vector & out) { auto inside = prk::range(2,n-2); for (auto i : inside) { PRAGMA_SIMD @@ -32,7 +32,7 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void star3(const int n, const int t, prk::vector & in, prk::vector & out) { auto inside = prk::range(3,n-3); for (auto i : inside) { PRAGMA_SIMD @@ -55,7 +55,7 @@ void 
star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void star4(const int n, const int t, prk::vector & in, prk::vector & out) { auto inside = prk::range(4,n-4); for (auto i : inside) { PRAGMA_SIMD @@ -82,7 +82,7 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void star5(const int n, const int t, prk::vector & in, prk::vector & out) { auto inside = prk::range(5,n-5); for (auto i : inside) { PRAGMA_SIMD @@ -113,7 +113,7 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void grid1(const int n, const int t, prk::vector & in, prk::vector & out) { auto inside = prk::range(1,n-1); for (auto i : inside) { PRAGMA_SIMD @@ -131,7 +131,7 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void grid2(const int n, const int t, prk::vector & in, prk::vector & out) { auto inside = prk::range(2,n-2); for (auto i : inside) { PRAGMA_SIMD @@ -163,7 +163,7 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void grid3(const int n, const int t, prk::vector & in, prk::vector & out) { auto inside = prk::range(3,n-3); for (auto i : inside) { PRAGMA_SIMD @@ -217,7 +217,7 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void grid4(const int n, const int t, prk::vector & in, prk::vector & out) { auto inside = prk::range(4,n-4); for (auto i : inside) { PRAGMA_SIMD @@ -301,7 +301,7 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { +void grid5(const int n, const int t, prk::vector & in, prk::vector & out) { auto inside = prk::range(5,n-5); for (auto i : inside) { PRAGMA_SIMD diff --git a/Cxx11/transpose-vector-rangefor.cc b/Cxx11/transpose-vector-rangefor.cc index 3d2e4f9f1..2d4ba5449 100644 --- a/Cxx11/transpose-vector-rangefor.cc +++ b/Cxx11/transpose-vector-rangefor.cc @@ 
-103,8 +103,8 @@ int main(int argc, char * argv[]) auto trans_time = 0.0; - std::vector A(order*order); - std::vector B(order*order,0.0); + prk::vector A(order*order); + prk::vector B(order*order,0.0); // fill A with the sequence 0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); From 6bf07538edc128ec9be36cfe497df23c0fe3d784 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 21:50:14 -0700 Subject: [PATCH 147/245] use prk::vector instead of std::vector --- Cxx11/transpose-vector-thread.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/transpose-vector-thread.cc b/Cxx11/transpose-vector-thread.cc index 44071ca95..1ac5c693a 100644 --- a/Cxx11/transpose-vector-thread.cc +++ b/Cxx11/transpose-vector-thread.cc @@ -130,8 +130,8 @@ int main(int argc, char * argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - std::vector A(order*order); - std::vector B(order*order,0.0); + prk::vector A(order*order); + prk::vector B(order*order,0.0); // fill A with the sequence 0 to order^2-1 as doubles std::iota(A.begin(), A.end(), 0.0); From 9dddc2ce65280b9d5fc677290b94ef449ec7921b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 21:54:53 -0700 Subject: [PATCH 148/245] use prk::vector instead of std::vector --- Cxx11/transpose-vector-async.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/transpose-vector-async.cc b/Cxx11/transpose-vector-async.cc index c68b8c463..c23011d75 100644 --- a/Cxx11/transpose-vector-async.cc +++ b/Cxx11/transpose-vector-async.cc @@ -126,8 +126,8 @@ int main(int argc, char * argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - std::vector A(order*order); - std::vector B(order*order,0.0); + prk::vector A(order*order); + prk::vector B(order*order,0.0); // fill A with the sequence 0 to order^2-1 as doubles 
std::iota(A.begin(), A.end(), 0.0); From 7ccf646ecae92bc9fefa0f6c637b3f8c61ac8620 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 21:55:38 -0700 Subject: [PATCH 149/245] use prk::vector instead of std::vector --- Cxx11/nstream-vector-taskloop.cc | 6 +++--- Cxx11/stencil-vector-taskloop.cc | 6 +++--- Cxx11/stencil_taskloop.hpp | 20 ++++++++++---------- Cxx11/transpose-vector-taskloop.cc | 4 ++-- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Cxx11/nstream-vector-taskloop.cc b/Cxx11/nstream-vector-taskloop.cc index 95bd5c925..3f4c8f1d6 100644 --- a/Cxx11/nstream-vector-taskloop.cc +++ b/Cxx11/nstream-vector-taskloop.cc @@ -120,9 +120,9 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector A(length); - std::vector B(length); - std::vector C(length); + prk::vector A(length); + prk::vector B(length); + prk::vector C(length); double scalar = 3.0; diff --git a/Cxx11/stencil-vector-taskloop.cc b/Cxx11/stencil-vector-taskloop.cc index 971d71db1..6cc5fb0cd 100644 --- a/Cxx11/stencil-vector-taskloop.cc +++ b/Cxx11/stencil-vector-taskloop.cc @@ -63,7 +63,7 @@ #include "prk_util.h" #include "stencil_taskloop.hpp" -void nothing(const int n, const int t, std::vector & in, std::vector & out, const int gs) +void nothing(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) { std::cout << "You are trying to use a stencil that does not exist.\n"; std::cout << "Please generate the new stencil using the code generator\n"; @@ -174,8 +174,8 @@ int main(int argc, char* argv[]) auto stencil_time = 0.0; - std::vector in(n*n);; - std::vector out(n*n);; + prk::vector in(n*n);; + prk::vector out(n*n);; OMP_PARALLEL() OMP_MASTER diff --git a/Cxx11/stencil_taskloop.hpp b/Cxx11/stencil_taskloop.hpp index 856f41995..874f122cc 100644 --- a/Cxx11/stencil_taskloop.hpp +++ b/Cxx11/stencil_taskloop.hpp @@ -1,4 +1,4 @@ -void star1(const int n, const int t, std::vector & in, std::vector & out, const int gs) { +void 
star1(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) { OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) ) for (auto it=1; it & in, std::vector & in, std::vector & out, const int gs) { +void star2(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) { OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) ) for (auto it=2; it & in, std::vector & in, std::vector & out, const int gs) { +void star3(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) { OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) ) for (auto it=3; it & in, std::vector & in, std::vector & out, const int gs) { +void star4(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) { OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) ) for (auto it=4; it & in, std::vector & in, std::vector & out, const int gs) { +void star5(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) { OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) ) for (auto it=5; it & in, std::vector & in, std::vector & out, const int gs) { +void grid1(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) { OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) ) for (auto it=1; it & in, std::vector & in, std::vector & out, const int gs) { +void grid2(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) { OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) ) for (auto it=2; it & in, std::vector & in, std::vector & out, const int gs) { +void grid3(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) { OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) ) for (auto it=3; it & in, std::vector & in, std::vector & out, const int gs) { +void grid4(const int n, const int t, prk::vector & in, prk::vector & out, 
const int gs) { OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) ) for (auto it=4; it & in, std::vector & in, std::vector & out, const int gs) { +void grid5(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) { OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) ) for (auto it=5; it A(order*order); - std::vector B(order*order); + prk::vector A(order*order); + prk::vector B(order*order); auto trans_time = 0.0; From d530b2dc5327304a0a9093002574d41c886fef14 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 21:57:47 -0700 Subject: [PATCH 150/245] better=simpler use of STL --- Cxx11/nstream-vector-boost-compute.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Cxx11/nstream-vector-boost-compute.cc b/Cxx11/nstream-vector-boost-compute.cc index 619c02374..785d496a9 100644 --- a/Cxx11/nstream-vector-boost-compute.cc +++ b/Cxx11/nstream-vector-boost-compute.cc @@ -119,8 +119,7 @@ int main(int argc, char * argv[]) auto nstream_time = 0.0; - std::vector h_A; - h_A.resize(length); + std::vector h_A(length); const float scalar(3); From d8b28a965f276058e6a3ba9f2aee78e0c6a7fef6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 21:59:09 -0700 Subject: [PATCH 151/245] add variant for prk::vector --- Cxx11/p2p-kernel.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Cxx11/p2p-kernel.h b/Cxx11/p2p-kernel.h index f402eba37..54b88c6c6 100644 --- a/Cxx11/p2p-kernel.h +++ b/Cxx11/p2p-kernel.h @@ -24,6 +24,17 @@ inline void sweep_tile(int startm, int endm, } } +inline void sweep_tile(int startm, int endm, + int startn, int endn, + int n, prk::vector & grid) +{ + for (auto i=startm; i & grid) +{ + for (int i=startm; i Date: Sat, 20 Apr 2019 21:59:42 -0700 Subject: [PATCH 152/245] try to implement prk::vector - works for some impls --- Cxx11/prk_util.h | 73 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 17 
deletions(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 889e7e87d..3ad580f55 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -180,7 +180,7 @@ namespace prk { #endif } - template + template class vector { private: @@ -189,26 +189,65 @@ namespace prk { public: - vector(size_t n) { - //this->data_ = new T[n]; - this->data_ = prk::malloc(n); - } + vector(size_t n) { + //this->data_ = new T[n]; + this->data_ = prk::malloc(n); + } + + vector(size_t n, T v) { + //this->data_ = new T[n]; + this->data_ = prk::malloc(n); + for (size_t i=0; idata_[i] = v; + } + + ~vector() { + //delete[] this->data_; + prk::free(this->data_); + } + + T * data() { + return this->data_; + } + + size_t size() { + return this->size_; + } + +#if 0 + T const & operator[] (int n) const { + return this->data_[n]; + } + + T & operator[] (int n) { + return this->data_[n]; + } +#endif - vector(size_t n, T v) { - //this->data_ = new T[n]; - this->data_ = prk::malloc(n); - for (size_t i=0; idata_[i] = v; - } + T const & operator[] (size_t n) const { + return this->data_[n]; + } - ~vector() { - //delete[] this->data_; - prk::free(this->data_); - } + T & operator[] (size_t n) { + return this->data_[n]; + } - T & operator[] (size_t n) { - return this->data_[n]; - } + T * begin() { + return &(this->data_[0]); + } + + T * end() { + return &(this->data_[this->size_]); + } +#if 0 + T & begin() { + return this->data_[0]; + } + + T & end() { + return this->data_[this->size_]; + } +#endif }; static inline double wtime(void) From 2970a8417effd7c6cb7dce2f107c482b0173de9d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 22:01:22 -0700 Subject: [PATCH 153/245] add versions that use prk::vector rather than STL --- Cxx11/dgemm.cc | 224 ++++++++++++++++++++++++++++++++++++++++++ Cxx11/nstream.cc | 166 ++++++++++++++++++++++++++++++++ Cxx11/p2p.cc | 185 +++++++++++++++++++++++++++++++++++ Cxx11/sparse.cc | 235 +++++++++++++++++++++++++++++++++++++++++++++ Cxx11/stencil.cc | 231 
++++++++++++++++++++++++++++++++++++++++++++ Cxx11/transpose.cc | 177 ++++++++++++++++++++++++++++++++++ 6 files changed, 1218 insertions(+) create mode 100644 Cxx11/dgemm.cc create mode 100644 Cxx11/nstream.cc create mode 100644 Cxx11/p2p.cc create mode 100644 Cxx11/sparse.cc create mode 100644 Cxx11/stencil.cc create mode 100644 Cxx11/transpose.cc diff --git a/Cxx11/dgemm.cc b/Cxx11/dgemm.cc new file mode 100644 index 000000000..5d7fa7897 --- /dev/null +++ b/Cxx11/dgemm.cc @@ -0,0 +1,224 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: dgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> [] +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +void prk_dgemm(const int order, + const prk::vector & A, + const prk::vector & B, + prk::vector & C) +{ + PRAGMA_SIMD + for (auto i=0; i & A, + const prk::vector & B, + prk::vector & C) +{ + for (auto it=0; it [tile size]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + tile_size = (argc>3) ? std::atoi(argv[3]) : 32; + if (tile_size <= 0) tile_size = order; + + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + if (tile_size < order) { + std::cout << "Tile size = " << tile_size << std::endl; + } else { + std::cout << "Untiled (IKJ loop order)" << std::endl; + } + + ////////////////////////////////////////////////////////////////////// + /// Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + double dgemm_time(0); + + prk::vector A(order*order); + prk::vector B(order*order); + prk::vector C(order*order,0.0); + for (auto i=0; i(order); + const auto reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1); + const auto checksum = prk::reduce(C.begin(), C.end(), 0.0); + + const auto epsilon = 1.0e-8; + const auto residuum = std::abs(checksum-reference)/reference; + if (residuum < epsilon) { +#if VERBOSE + std::cout << "Reference checksum = " << reference << "\n" + << "Actual checksum = " << checksum << std::endl; +#endif + std::cout << "Solution validates" << std::endl; + auto avgtime = dgemm_time/iterations; + auto nflops = 2.0 * std::pow(forder,3); + 
std::cout << "Rate (MF/s): " << 1.0e-6 * nflops/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "Reference checksum = " << reference << "\n" + << "Actual checksum = " << checksum << std::endl; +#if VERBOSE + for (auto i=0; i <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11 STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> []"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? 
std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto nstream_time = 0.0; + + prk::vector A(length,0.0); + prk::vector B(length,2.0); + prk::vector C(length,2.0); + + double scalar = 3.0; + + { + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) nstream_time = prk::wtime(); + + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + std::cout << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/p2p.cc b/Cxx11/p2p.cc new file mode 100644 index 000000000..119fecfe2 --- /dev/null +++ b/Cxx11/p2p.cc @@ -0,0 +1,185 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
+/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an m*n grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. 
+/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// C99-ification by Jeff Hammond, February 2016. +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "p2p-kernel.h" + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11 pipeline execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int m, n; + int mc, nc; + try { + if (argc < 4){ + throw " <# iterations> [ ]"; + } + + // number of times to run the pipeline algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // grid dimensions + m = std::atoi(argv[2]); + n = std::atoi(argv[3]); + if (m < 1 || n < 1) { + throw "ERROR: grid dimensions must be positive"; + } else if ( static_cast(m)*static_cast(n) > INT_MAX) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // grid chunk dimensions + mc = (argc > 4) ? std::atoi(argv[4]) : m; + nc = (argc > 5) ? 
std::atoi(argv[5]) : n; + if (mc < 1 || mc > m || nc < 1 || nc > n) { + std::cout << "WARNING: grid chunk dimensions invalid: " << mc << nc << " (ignoring)" << std::endl; + mc = m; + nc = n; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid sizes = " << m << ", " << n << std::endl; + std::cout << "Grid chunk sizes = " << mc << ", " << nc << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto pipeline_time = 0.0; // silence compiler warning + + prk::vector grid(m*n,0.0);; + + { + // set boundary values (bottom and left side of grid) + for (int j=0; j(j); + } + for (int i=0; i(i); + } + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) pipeline_time = prk::wtime(); + + double * RESTRICT pgrid = grid.data(); + + if (mc==m && nc==n) { + for (int i=1; i epsilon) { + std::cout << "ERROR: checksum " << grid[(m-1)*n+(n-1)] + << " does not match verification value " << corner_val << std::endl; + return 1; + } + +#ifdef VERBOSE + std::cout << "Solution validates; verification value = " << corner_val << std::endl; +#else + std::cout << "Solution validates" << std::endl; +#endif + auto avgtime = pipeline_time/iterations; + std::cout << "Rate (MFlops/s): " + << 2.0e-6 * ( (m-1.)*(n-1.) 
)/avgtime + << " Avg time (s): " << avgtime << std::endl; + + return 0; +} diff --git a/Cxx11/sparse.cc b/Cxx11/sparse.cc new file mode 100644 index 000000000..38fb68deb --- /dev/null +++ b/Cxx11/sparse.cc @@ -0,0 +1,235 @@ + +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +static inline size_t offset(size_t i, size_t j, size_t lsize) +{ + return (i+(j<> 1) & 0x5555555555555555) | ((x << 1) & 0xaaaaaaaaaaaaaaaa); + x = ((x >> 2) & 0x3333333333333333) | ((x << 2) & 0xcccccccccccccccc); + x = ((x >> 4) & 0x0f0f0f0f0f0f0f0f) | ((x << 4) & 0xf0f0f0f0f0f0f0f0); + x = ((x >> 8) & 0x00ff00ff00ff00ff) | ((x << 8) & 0xff00ff00ff00ff00); + x = ((x >> 16) & 0x0000ffff0000ffff) | ((x << 16) & 0xffff0000ffff0000); + x = ((x >> 32) & 0x00000000ffffffff) | ((x << 32) & 0xffffffff00000000); + return ( x >> (8*sizeof(uint64_t)-shift_in_bits) ); +} + +#if SCRAMBLE + #define REVERSE(a,b) reverse((a),(b)) +#else + #define REVERSE(a,b) (a) +#endif + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11 Sparse matrix-vector multiplication" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // 
Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, lsize, radius, stencil_size; + size_t size, size2, nent; + double sparsity; + try { + if (argc < 4) { + throw "Usage: <# iterations> <2log grid size> ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + lsize = std::atoi(argv[2]); + if (lsize < 1) { + throw "ERROR: grid dimension must be positive"; + } + size_t lsize2 = 2*lsize; + size = 1L< matrix(nent,0.0); + prk::vector colIndex(nent,0); + prk::vector vector(size2,0.0); + prk::vector result(size2,0.0); + + double sparse_time(0); + + { + for (size_t row=0; row epsilon) { + std::cout << "ERROR: Vector norm = " << vector_sum + << " Reference vector norm = " << reference_sum << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "Reference sum = " << reference_sum + << ", vector sum = " << vector_sum << std::endl; +#endif + double avgtime = sparse_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * (2.*nent)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} diff --git a/Cxx11/stencil.cc b/Cxx11/stencil.cc new file mode 100644 index 000000000..f0aab6461 --- /dev/null +++ b/Cxx11/stencil.cc @@ -0,0 +1,231 @@ + +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. 
+/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. 
+/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "stencil_seq.hpp" + +void nothing(const int n, const int t, prk::vector & in, prk::vector & out) +{ + std::cout << "You are trying to use a stencil that does not exist.\n"; + std::cout << "Please generate the new stencil using the code generator\n"; + std::cout << "and add it to the case-switch in the driver." << std::endl; + // n will never be zero - this is to silence compiler warnings. + if (n==0 || t==0) std::cout << in.size() << out.size() << std::endl; + std::abort(); +} + +int main(int argc, char* argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11 Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, n, radius, tile_size; + bool star = true; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + 
auto grid = std::string("grid"); + star = (stencil == grid) ? false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto stencil_time = 0.0; + + prk::vector in(n*n); + prk::vector out(n*n); + + { + for (auto it=0; it(i+j); + out[i*n+j] = 0.0; + } + } + } + } + + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + // Apply the stencil operator + stencil(n, tile_size, in, out); + // Add constant to solution to force refresh of neighbor data, if any + std::transform(in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; }); + } + stencil_time = prk::wtime() - stencil_time; + } + + ////////////////////////////////////////////////////////////////////// + // Analyze and output results. 
+ ////////////////////////////////////////////////////////////////////// + + // interior of grid with respect to stencil + size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); + + // compute L1 norm in parallel + double norm = 0.0; + for (auto i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + return 1; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const int stencil_size = star ? 4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*(size_t)stencil_size+1L) * active_points; + auto avgtime = stencil_time/iterations; + std::cout << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } + + return 0; +} diff --git a/Cxx11/transpose.cc b/Cxx11/transpose.cc new file mode 100644 index 000000000..7907bae56 --- /dev/null +++ b/Cxx11/transpose.cc @@ -0,0 +1,177 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> [tile size] +/// +/// An optional parameter specifies the tile size used to divide the +/// individual matrix blocks for improved cache and TLB performance. +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11 Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + int order; + int tile_size; + try { + if (argc < 3) { + throw "Usage: <# iterations> [tile size]"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = (argc>3) ? 
std::atoi(argv[3]) : 32; + // a negative tile size means no tiling of the local transpose + if (tile_size <= 0) tile_size = order; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + std::cout << "Tile size = " << tile_size << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + auto trans_time = 0.0; + + prk::vector A(order*order); + prk::vector B(order*order,0.0); + + // fill A with the sequence 0 to order^2-1 as doubles + std::iota(A.begin(), A.end(), 0.0); + + { + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) trans_time = prk::wtime(); + + // transpose the matrix + if (tile_size < order) { + for (auto it=0; it(ij)*(1.+iterations)+addit; + abserr += std::fabs(B[ji] - reference); + } + } + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const auto epsilon = 1.0e-8; + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + auto avgtime = trans_time/iterations; + auto bytes = (size_t)order * (size_t)order * sizeof(double); + std::cout << "Rate (MB/s): " << 1.0e-6 * (2L*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + return 1; + } + + return 0; +} + + From c4f79aaeb0a4139a5b70d741729064864026c085 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 22:01:31 -0700 Subject: [PATCH 154/245] add versions that use prk::vector rather than STL --- Cxx11/stencil_seq.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Cxx11/stencil_seq.hpp b/Cxx11/stencil_seq.hpp index 4ed03972e..c6af093af 
100644 --- a/Cxx11/stencil_seq.hpp +++ b/Cxx11/stencil_seq.hpp @@ -1,4 +1,4 @@ -void star1(const int n, const int t, std::vector & in, std::vector & out) { +void star1(const int n, const int t, prk::vector & in, prk::vector & out) { for (auto it=1; it & in, std::vector & in, std::vector & out) { +void star2(const int n, const int t, prk::vector & in, prk::vector & out) { for (auto it=2; it & in, std::vector & in, std::vector & out) { +void star3(const int n, const int t, prk::vector & in, prk::vector & out) { for (auto it=3; it & in, std::vector & in, std::vector & out) { +void star4(const int n, const int t, prk::vector & in, prk::vector & out) { for (auto it=4; it & in, std::vector & in, std::vector & out) { +void star5(const int n, const int t, prk::vector & in, prk::vector & out) { for (auto it=5; it & in, std::vector & in, std::vector & out) { +void grid1(const int n, const int t, prk::vector & in, prk::vector & out) { for (auto it=1; it & in, std::vector & in, std::vector & out) { +void grid2(const int n, const int t, prk::vector & in, prk::vector & out) { for (auto it=2; it & in, std::vector & in, std::vector & out) { +void grid3(const int n, const int t, prk::vector & in, prk::vector & out) { for (auto it=3; it & in, std::vector & in, std::vector & out) { +void grid4(const int n, const int t, prk::vector & in, prk::vector & out) { for (auto it=4; it & in, std::vector & in, std::vector & out) { +void grid5(const int n, const int t, prk::vector & in, prk::vector & out) { for (auto it=5; it Date: Sat, 20 Apr 2019 22:01:52 -0700 Subject: [PATCH 155/245] ignore more stuff --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 66948e148..5a90844a4 100644 --- a/.gitignore +++ b/.gitignore @@ -277,3 +277,7 @@ FORTRAN/transpose-ornlacc RUST/p2p/Cargo.lock RUST/stencil/Cargo.lock RUST/transpose/Cargo.lock +nstream +../C1z/p2p-avx +../C1z/p2p-sse +../C1z/p2p-hyperplane-openmp From 75a60ab98f4ae7b2cdbd2b9a0e3548c781732a87 Mon 
Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 20 Apr 2019 22:06:19 -0700 Subject: [PATCH 156/245] cleanup stencil codegen for vector classes --- Cxx11/generate-cxx-stencil.py | 22 +- Cxx11/stencil-vector.cc | 2 +- Cxx11/stencil_vector.hpp | 435 ++++++++++++++++++++++++++++++++++ 3 files changed, 453 insertions(+), 6 deletions(-) create mode 100644 Cxx11/stencil_vector.hpp diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index 18d826acd..2f557fe3f 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -53,7 +53,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' }\n') src.write(' }\n') elif (model=='taskloop'): - src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out, const int gs) {\n') + src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector & in, prk::vector & out, const int gs) {\n') src.write(' OMP_TASKLOOP_COLLAPSE(2, firstprivate(n) shared(in,out) grainsize(gs) )\n') src.write(' for (auto it='+str(radius)+'; it & in, std::vector & out) {\n') + src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector & in, prk::vector & out) {\n') src.write(' auto inside = prk::range('+str(radius)+',n-'+str(radius)+');\n') src.write(' for (auto i : inside) {\n') src.write(' PRAGMA_SIMD\n') @@ -128,7 +128,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): bodygen(src,pattern,stencil_size,radius,W,model) src.write(' });\n') elif (model=='tbb'): - src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') + src.write('void '+pattern+str(radius)+'(const int n, const int t, prk::vector & in, prk::vector & out) {\n') src.write(' tbb::blocked_range2d range('+str(radius)+', n-'+str(radius)+', t, '+str(radius)+', n-'+str(radius)+', t);\n') src.write(' tbb::parallel_for( range, [&](decltype(range)& r ) {\n') src.write(' for (auto i=r.rows().begin(); 
i!=r.rows().end(); ++i ) {\n') @@ -151,7 +151,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' if ( ('+str(radius)+' <= i) && (i < n-'+str(radius)+') && ('+str(radius)+' <= j) && (j < n-'+str(radius)+') ) {\n') bodygen(src,pattern,stencil_size,radius,W,model) src.write(' }\n') - else: + elif (model=='vector'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') src.write(' for (auto it='+str(radius)+'; it & in, prk::vector & out) {\n') + src.write(' for (auto it='+str(radius)+'; it & in, std::vector & out) { diff --git a/Cxx11/stencil_vector.hpp b/Cxx11/stencil_vector.hpp new file mode 100644 index 000000000..4ed03972e --- /dev/null +++ b/Cxx11/stencil_vector.hpp @@ -0,0 +1,435 @@ +void star1(const int n, const int t, std::vector & in, std::vector & out) { + for (auto it=1; it & in, std::vector & out) { + for (auto it=2; it & in, std::vector & out) { + for (auto it=3; it & in, std::vector & out) { + for (auto it=4; it & in, std::vector & out) { + for (auto it=5; it & in, std::vector & out) { + for (auto it=1; it & in, std::vector & out) { + for (auto it=2; it & in, std::vector & out) { + for (auto it=3; it & in, std::vector & out) { + for (auto it=4; it & in, std::vector & out) { + for (auto it=5; it Date: Mon, 22 Apr 2019 11:40:20 -0700 Subject: [PATCH 157/245] clean new targets (prk::vector sequential) --- Cxx11/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 3953c4d80..508ee8e47 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -222,6 +222,7 @@ clean: -rm -f *.optrpt -rm -f *.dwarf -rm -rf *.dSYM # Mac + -rm -f nstream transpose stencil p2p sparse dgemm -rm -f *-vector -rm -f *-valarray -rm -f *-openmp From 1636d95a7181d74855041513a026e0f84afb08e1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Apr 2019 11:40:35 -0700 Subject: [PATCH 158/245] silence GCC warning --- Cxx11/nstream-vector-taskloop.cc | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/Cxx11/nstream-vector-taskloop.cc b/Cxx11/nstream-vector-taskloop.cc index 3f4c8f1d6..3f5385a37 100644 --- a/Cxx11/nstream-vector-taskloop.cc +++ b/Cxx11/nstream-vector-taskloop.cc @@ -73,8 +73,8 @@ int main(int argc, char * argv[]) /// Read and test input parameters ////////////////////////////////////////////////////////////////////// - int iterations, gs, offset; - size_t length; + int iterations; + size_t length, gs, offset; try { if (argc < 3) { throw "Usage: <# iterations> "; From 801a315359000c8b404a680d5ed189099708fcb5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Apr 2019 11:40:53 -0700 Subject: [PATCH 159/245] silence GCC warning --- Cxx11/sparse-vector.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/sparse-vector.cc b/Cxx11/sparse-vector.cc index c521528e8..93f80b863 100644 --- a/Cxx11/sparse-vector.cc +++ b/Cxx11/sparse-vector.cc @@ -124,7 +124,7 @@ int main(int argc, char* argv[]) if (lsize < 1) { throw "ERROR: grid dimension must be positive"; } - size_t lsize2 = 2*lsize; + //size_t lsize2 = 2*lsize; size = 1L< Date: Mon, 22 Apr 2019 12:11:14 -0700 Subject: [PATCH 160/245] add new impls --- Cxx11/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 508ee8e47..040fc6c7f 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -66,7 +66,7 @@ else EXTRA += target endif -all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA) +all: sequential vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA) #p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ @@ -87,6 +87,8 @@ all: vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sy #dgemm: dgemm-vector dgemm-cblas 
dgemm-cublas +sequential: p2p stencil transpose nstream dgemm sparse + vector: p2p-vector p2p-hyperplane-vector stencil-vector transpose-vector nstream-vector sparse-vector dgemm-vector \ transpose-vector-async transpose-vector-thread From 7ffd12c65d012c216ff922894c3ab3fa3024293b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Apr 2019 12:11:25 -0700 Subject: [PATCH 161/245] reorder loops --- Cxx11/dgemm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/dgemm.cc b/Cxx11/dgemm.cc index 5d7fa7897..8b1560e55 100644 --- a/Cxx11/dgemm.cc +++ b/Cxx11/dgemm.cc @@ -83,8 +83,8 @@ void prk_dgemm(const int order, const int tile_size, prk::vector & C) { for (auto it=0; it Date: Mon, 22 Apr 2019 12:11:49 -0700 Subject: [PATCH 162/245] silent compiler warning --- Cxx11/sparse-vector.cc | 3 ++- Cxx11/sparse.cc | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Cxx11/sparse-vector.cc b/Cxx11/sparse-vector.cc index 93f80b863..9b96ae8b8 100644 --- a/Cxx11/sparse-vector.cc +++ b/Cxx11/sparse-vector.cc @@ -105,7 +105,8 @@ int main(int argc, char* argv[]) // Process and test input parameters ////////////////////////////////////////////////////////////////////// - int iterations, lsize, radius, stencil_size; + int iterations, lsize; + unsigned radius, stencil_size; size_t size, size2, nent; double sparsity; try { diff --git a/Cxx11/sparse.cc b/Cxx11/sparse.cc index 38fb68deb..79c76de72 100644 --- a/Cxx11/sparse.cc +++ b/Cxx11/sparse.cc @@ -105,7 +105,8 @@ int main(int argc, char* argv[]) // Process and test input parameters ////////////////////////////////////////////////////////////////////// - int iterations, lsize, radius, stencil_size; + int iterations, lsize; + unsigned radius, stencil_size; size_t size, size2, nent; double sparsity; try { From 892d24711f03ff4aa940ec48b5dcfdf56dc08f19 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Apr 2019 12:16:28 -0700 Subject: [PATCH 163/245] prk::vector impl seems to be 
working --- Cxx11/prk_util.h | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 3ad580f55..6638abb87 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -84,14 +84,14 @@ namespace prk { - int get_alignment(void) + const int get_alignment(void) { /* a := alignment */ #ifdef PRK_ALIGNMENT int a = PRK_ALIGNMENT; #else - char* temp = getenv("PRK_ALIGNMENT"); - int a = (temp!=NULL) ? atoi(temp) : 64; + const char* temp = std::getenv("PRK_ALIGNMENT"); + int a = (temp!=nullptr) ? std::atoi(temp) : 64; if (a < 8) a = 8; assert( (a & (~a+1)) == a ); /* is power of 2? */ #endif @@ -101,7 +101,7 @@ namespace prk { #if defined(__INTEL_COMPILER) template - T * malloc(size_t n) + T * malloc(size_t n) { const int alignment = prk::get_alignment(); const size_t bytes = n * sizeof(T); @@ -109,24 +109,26 @@ namespace prk { } template - void free(T * p) + void free(T * p) { _mm_free(p); + p = nullptr; } #else // !__INTEL_COMPILER template - void * malloc(size_t bytes) + T * malloc(size_t n) { - const int alignment = prk_get_alignment(); + const int alignment = prk::get_alignment(); + const size_t bytes = n * sizeof(T); // We cannot use C11 aligned_alloc on Mac. // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69680 */ // GCC claims to be C11 without knowing if glibc is compliant... #if !defined(__GNUC__) && \ !defined(__APPLE__) && \ - defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) + defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && 0 \ // From ISO C11: // @@ -140,16 +142,16 @@ namespace prk { // Thus, if we do not round up the bytes to be a multiple // of the alignment, we violate ISO C. 
- size_t padded = bytes; - size_t excess = bytes % alignment; + const size_t padded = bytes; + const size_t excess = bytes % alignment; if (excess>0) padded += (alignment - excess); return aligned_alloc(alignment,padded); #else - T * ptr = NULL; - int ret = posix_memalign(&ptr,alignment,bytes); - if (ret!=0) ptr = NULL; + T * ptr = nullptr; + const int ret = posix_memalign((void**)&ptr,alignment,bytes); + if (ret!=0) ptr = nullptr; return ptr; #endif @@ -157,9 +159,10 @@ namespace prk { } template - void free(void * p) + void free(T * p) { - free(p); + std::free(p); + p = nullptr; } #endif // __INTEL_COMPILER @@ -192,17 +195,23 @@ namespace prk { vector(size_t n) { //this->data_ = new T[n]; this->data_ = prk::malloc(n); + this->size_ = n; } vector(size_t n, T v) { //this->data_ = new T[n]; this->data_ = prk::malloc(n); for (size_t i=0; idata_[i] = v; + this->size_ = n; } ~vector() { //delete[] this->data_; - prk::free(this->data_); + prk::free(this->data_); + } + + void operator~() { + this->~vector(); } T * data() { From 61c1598a0ea23771638bd9cd4eface34a1f287e9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 22 Apr 2019 12:16:46 -0700 Subject: [PATCH 164/245] silence compiler warning --- Cxx11/sparse.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/sparse.cc b/Cxx11/sparse.cc index 79c76de72..f9baa63a5 100644 --- a/Cxx11/sparse.cc +++ b/Cxx11/sparse.cc @@ -125,7 +125,7 @@ int main(int argc, char* argv[]) if (lsize < 1) { throw "ERROR: grid dimension must be positive"; } - size_t lsize2 = 2*lsize; + //size_t lsize2 = 2*lsize; size = 1L< Date: Mon, 22 Apr 2019 12:21:29 -0700 Subject: [PATCH 165/245] clean example for Intel toolchain --- common/make.defs.intel | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/common/make.defs.intel b/common/make.defs.intel index 664d79e0c..cab461c08 100644 --- a/common/make.defs.intel +++ b/common/make.defs.intel @@ -14,7 +14,7 @@ CC=icc -std=c11 -pthread # All of the 
Fortran code is written for the 2008 standard and requires preprocessing. FC=ifort -std08 -fpp # C++11 may not be required but does no harm here. -CXX=icpc -std=c++14 -pthread +CXX=icpc -std=c++17 -pthread # # Compiler flags # @@ -24,7 +24,7 @@ DEFAULT_OPT_FLAGS=-g -O3 -xHOST # If you are compiling for KNL on a Xeon login node, use the following: # DEFAULT_OPT_FLAGS=-g -O3 -xMIC-AVX512 # -DEFAULT_OPT_FLAGS+=-qopt-report=5 +#DEFAULT_OPT_FLAGS+=-qopt-report=5 # # OpenMP flags # @@ -36,6 +36,9 @@ OFFLOADFLAG=-qopenmp-offload=host # # MacOS #OPENCLFLAG=-framework OpenCL +# POCL +# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... +#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL # Linux OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL @@ -51,7 +54,7 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx #SYCLCXX=${CXX} ${OPENMPFLAG} -#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} # # OCCA # @@ -89,8 +92,9 @@ CBLASFLAG=-DMKL -mkl # Mac w/ CUDA emulation via https://github.com/hughperkins/coriander #NVCC=/opt/llvm/cocl/bin/cocl # Linux w/ NVIDIA CUDA -NVCC=nvcc -arch=sm_50 +NVCC=nvcc CUDAFLAGS=-g -O3 -std=c++11 +CUDAFLAGS+=-arch=sm_50 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # @@ -115,6 +119,8 @@ MPICC=mpiicc -std=c99 COARRAYFLAG=-coarray # multi-node # COARRAYFLAG=-coarray=distributed - +# +# MEMKIND (used in C1z) +# MEMKINDDIR=/home/parallels/PRK/deps MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib From 7b170b11c0b23d70f6fad8f952030afded86b5dc Mon Sep 17 00:00:00 
2001 From: Jeff Hammond Date: Mon, 22 Apr 2019 12:29:46 -0700 Subject: [PATCH 166/245] use .data() instead of &([0]) and dynamic schedule loop in DGEMM CBLAS --- Cxx11/dgemm-cblas.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Cxx11/dgemm-cblas.cc b/Cxx11/dgemm-cblas.cc index 24ae52bae..b17b84785 100644 --- a/Cxx11/dgemm-cblas.cc +++ b/Cxx11/dgemm-cblas.cc @@ -104,7 +104,7 @@ void prk_dgemm(const int order, const double beta = 1.0; cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - n, n, n, alpha, &(A[0]), n, &(B[0]), n, beta, &(C[0]), n); + n, n, n, alpha, A.data(), n, B.data(), n, beta, C.data(), n); } void prk_dgemm(const int order, const int batches, @@ -132,11 +132,11 @@ void prk_dgemm(const int order, const int batches, const int nt, const double beta = 1.0; #ifdef _OPENMP -#pragma omp parallel for schedule(static) num_threads(nt) +#pragma omp parallel for schedule(dynamic) num_threads(nt) #endif for (int b=0; b 1) { - std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS with " - << batch_threads << " threads)" << std::endl; + std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS with " << batch_threads << " threads)" << std::endl; } else { std::cout << "Batch size = " << std::abs(batches) << " (loop over legacy BLAS sequentially)" << std::endl; } From 036fcd57cc2c4df5cb65364f3e474f18fd22aedb Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 28 Apr 2019 10:24:37 -0700 Subject: [PATCH 167/245] Update make.defs.llvm default to CodePlay disable OCCA --- common/make.defs.llvm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/make.defs.llvm b/common/make.defs.llvm index fda35f476..224edb8d9 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -75,9 +75,9 @@ SYCLFLAG+=-std=c++14 # # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-SYCLDIR=./triSYCL -SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) -SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include +#SYCLDIR=./triSYCL +#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) +#SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # ProGTX # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx @@ -86,7 +86,7 @@ SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include # # OCCA # -OCCADIR=${HOME}/prk-repo/Cxx11/occa +#OCCADIR=${HOME}/prk-repo/Cxx11/occa # # TBB # From e1efa89ef35675a3ed8d68bab2b44563ba701dd2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 26 Apr 2019 16:02:49 -0700 Subject: [PATCH 168/245] add kokkos::fence where appropriate --- Cxx11/nstream-kokkos.cc | 4 +++- Cxx11/stencil-kokkos.cc | 31 ++++++++++++++++---------- Cxx11/transpose-kokkos.cc | 46 ++++++++++++++++++++++----------------- 3 files changed, 48 insertions(+), 33 deletions(-) diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc index 9e0af56bd..be425e75b 100644 --- a/Cxx11/nstream-kokkos.cc +++ b/Cxx11/nstream-kokkos.cc @@ -139,6 +139,7 @@ int main(int argc, char * argv[]) C[i] = 2.0; }); Kokkos::fence(); + for (int iter = 0; iter<=iterations; ++iter) { if (iter==1) { @@ -169,8 +170,9 @@ int main(int argc, char * argv[]) double asum(0); Kokkos::parallel_reduce(length, KOKKOS_LAMBDA(size_t const i, double & inner) { - inner += std::fabs(A(i)); + inner += std::fabs(A(i)); }, asum); + Kokkos::fence(); double epsilon(1.e-8); if (std::fabs(ar-asum)/asum > epsilon) { diff --git a/Cxx11/stencil-kokkos.cc b/Cxx11/stencil-kokkos.cc index b92bd4a57..f5c3365ba 100644 --- a/Cxx11/stencil-kokkos.cc +++ b/Cxx11/stencil-kokkos.cc @@ -180,24 +180,30 @@ int main(int argc, char* argv[]) auto tile2 = {tile_size,tile_size}; auto full = Kokkos::MDRangePolicy>(z2,n2,tile2); - Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) { - in(i,j) = static_cast(i+j); - out(i,j) = 0.0; - }); + { + Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) { + in(i,j) = static_cast(i+j); + out(i,j) = 
0.0; + }); + Kokkos::fence(); - for (int iter = 0; iter<=iterations; ++iter) { + for (int iter = 0; iter<=iterations; ++iter) { - if (iter==1) stencil_time = prk::wtime(); + if (iter==1) { + Kokkos::fence(); + stencil_time = prk::wtime(); + } - stencil(n, tile_size, in, out); + stencil(n, tile_size, in, out); - Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) { - in(i,j) += 1.0; - }); + Kokkos::parallel_for(full, KOKKOS_LAMBDA(int i, int j) { + in(i,j) += 1.0; + }); + } + Kokkos::fence(); + stencil_time = prk::wtime() - stencil_time; } - stencil_time = prk::wtime() - stencil_time; - ////////////////////////////////////////////////////////////////////// // Analyze and output results. ////////////////////////////////////////////////////////////////////// @@ -211,6 +217,7 @@ int main(int argc, char* argv[]) Kokkos::parallel_reduce(inside, KOKKOS_LAMBDA(int i, int j, double & norm) { norm += std::fabs(out(i,j)); }, norm); + Kokkos::fence(); norm /= active_points; // verify correctness diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc index fa35ebb6e..9b5a4f6c0 100644 --- a/Cxx11/transpose-kokkos.cc +++ b/Cxx11/transpose-kokkos.cc @@ -117,6 +117,8 @@ int main(int argc, char * argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// + double trans_time(0); + matrix A("A", order, order); matrix B("B", order, order); @@ -129,32 +131,36 @@ int main(int argc, char * argv[]) auto policy_lr = Kokkos::MDRangePolicy({0,0},order2,tile2); auto policy_rl = Kokkos::MDRangePolicy({0,0},order2,tile2); - Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i, int j) { - A(i,j) = static_cast(i*order+j); - B(i,j) = 0.0; - }); - - double trans_time(0); + { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int i, int j) { + A(i,j) = static_cast(i*order+j); + B(i,j) = 0.0; + }); + Kokkos::fence(); - for (int iter = 0; iter<=iterations; ++iter) { + for (int iter = 0; iter<=iterations; ++iter) { - if (iter==1) 
trans_time = prk::wtime(); + if (iter==1) { + Kokkos::fence(); + trans_time = prk::wtime(); + } - if (permute) { - Kokkos::parallel_for(policy_rl, KOKKOS_LAMBDA(int i, int j) { - B(i,j) += A(j,i); - A(j,i) += 1.0; - }); - } else { - Kokkos::parallel_for(policy_lr, KOKKOS_LAMBDA(int i, int j) { - B(i,j) += A(j,i); - A(j,i) += 1.0; - }); + if (permute) { + Kokkos::parallel_for(policy_rl, KOKKOS_LAMBDA(int i, int j) { + B(i,j) += A(j,i); + A(j,i) += 1.0; + }); + } else { + Kokkos::parallel_for(policy_lr, KOKKOS_LAMBDA(int i, int j) { + B(i,j) += A(j,i); + A(j,i) += 1.0; + }); + } } + Kokkos::fence(); + trans_time = prk::wtime() - trans_time; } - trans_time = prk::wtime() - trans_time; - ////////////////////////////////////////////////////////////////////// /// Analyze and output results ////////////////////////////////////////////////////////////////////// From 576b332e8226a9f3f7219f520ffdb0e88fc212e4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 24 Apr 2019 10:09:16 -0500 Subject: [PATCH 169/245] Flang is mostly Fortran 2008 complete now --- .gitignore | 3 +++ FORTRAN/Makefile | 1 + FORTRAN/dgemm-openmp-target.f90 | 1 - FORTRAN/dgemm-pretty.f90 | 7 +------ FORTRAN/dgemm-taskloop-openmp.f90 | 1 - FORTRAN/dgemm.f90 | 11 ----------- FORTRAN/nstream.f90 | 6 +++--- FORTRAN/stencil-pretty.f90 | 8 -------- FORTRAN/transpose-pretty.f90 | 20 +++++--------------- FORTRAN/transpose.f90 | 17 ----------------- 10 files changed, 13 insertions(+), 62 deletions(-) diff --git a/.gitignore b/.gitignore index 5a90844a4..a7e76eb32 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,9 @@ octave-workspace # Octave crashes */*__genmod.mod *.patch */*.patch +*.dbg # Flang +*/*.dbg +*/*/*.dbg common/make.defs scripts/small/runfgmpi diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 551c8fc36..26226b648 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -90,6 +90,7 @@ dgemm-pretty: dgemm-pretty.f90 clean: -rm -f *.o -rm -f *.i90 + -rm -f *.dbg -rm -f *__genmod.f90 # Intel 
Fortran -rm -f *__genmod.mod # Intel Fortran -rm -f *.optrpt diff --git a/FORTRAN/dgemm-openmp-target.f90 b/FORTRAN/dgemm-openmp-target.f90 index ed2193bba..3c8ffbeec 100644 --- a/FORTRAN/dgemm-openmp-target.f90 +++ b/FORTRAN/dgemm-openmp-target.f90 @@ -181,7 +181,6 @@ program main forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) - ! TODO: use intrinsic here (except PGI) checksum = 0.0d0 !$omp parallel do simd reduction(+:checksum) do j=1,order diff --git a/FORTRAN/dgemm-pretty.f90 b/FORTRAN/dgemm-pretty.f90 index e1e6ac7c2..59983f924 100644 --- a/FORTRAN/dgemm-pretty.f90 +++ b/FORTRAN/dgemm-pretty.f90 @@ -77,11 +77,7 @@ program main real(kind=REAL64), allocatable :: C(:,:) ! buffer to hold output matrix integer(kind=INT64) :: nflops ! runtime variables -#if 1 || defined(PGI) - integer(kind=INT32) :: i -#endif - integer(kind=INT64) :: j - integer(kind=INT32) :: k + integer(kind=INT32) :: i,j,k real(kind=REAL64) :: checksum, reference, residuum real(kind=REAL64) :: t0, t1, dgemm_time, avgtime ! timing parameters real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance @@ -166,7 +162,6 @@ program main forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) - ! TODO: use intrinsic here (except PGI) checksum = 0.0d0 do j=1,order do i=1,order diff --git a/FORTRAN/dgemm-taskloop-openmp.f90 b/FORTRAN/dgemm-taskloop-openmp.f90 index b127dd356..2b64413d3 100644 --- a/FORTRAN/dgemm-taskloop-openmp.f90 +++ b/FORTRAN/dgemm-taskloop-openmp.f90 @@ -236,7 +236,6 @@ program main forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) - ! 
TODO: use intrinsic here (except PGI) checksum = 0.0d0 !$omp parallel do simd reduction(+:checksum) do j=1,order diff --git a/FORTRAN/dgemm.f90 b/FORTRAN/dgemm.f90 index a68eff104..7123882a6 100644 --- a/FORTRAN/dgemm.f90 +++ b/FORTRAN/dgemm.f90 @@ -75,11 +75,6 @@ subroutine prk_dgemm(order, tile_size, A, B, C) do jt=1,order,tile_size do kt=1,order,tile_size do it=1,order,tile_size -#elif defined(PGI) - ! PGI does not support DO CONCURRENT. - do jt=1,order,tile_size - do kt=1,order,tile_size - do it=1,order,tile_size #else do concurrent (jt=1:order:tile_size) do concurrent (kt=1:order:tile_size) @@ -111,11 +106,6 @@ subroutine prk_dgemm(order, tile_size, A, B, C) do k=1,order !$omp simd do i=1,order -#elif defined(PGI) - ! PGI does not support DO CONCURRENT. - do j=1,order - do k=1,order - do i=1,order #else do concurrent (j=1:order) do concurrent (k=1:order) @@ -288,7 +278,6 @@ program main forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) - ! TODO: use intrinsic here (except PGI) checksum = 0.0d0 !$omp parallel do simd reduction(+:checksum) do j=1,order diff --git a/FORTRAN/nstream.f90 b/FORTRAN/nstream.f90 index 6aa9c1529..dc4ee8744 100644 --- a/FORTRAN/nstream.f90 +++ b/FORTRAN/nstream.f90 @@ -192,7 +192,7 @@ program main C(i) = 2 enddo !$omp end do -#elif defined(PGI) +#elif 0 forall (i=1:length) A(i) = 0 B(i) = 2 @@ -229,7 +229,7 @@ program main A(i) = A(i) + B(i) + scalar * C(i) enddo !$omp end do -#elif defined(PGI) +#elif 0 forall (i=1:length) A(i) = A(i) + B(i) + scalar * C(i) end forall @@ -267,7 +267,7 @@ program main ar = ar * length asum = 0 -#if defined(_OPENMP) || defined(PGI) +#if defined(_OPENMP) !$omp parallel do reduction(+:asum) do i=1,length asum = asum + abs(A(i)) diff --git a/FORTRAN/stencil-pretty.f90 b/FORTRAN/stencil-pretty.f90 index e1ec242bf..1119ff731 100644 --- a/FORTRAN/stencil-pretty.f90 +++ b/FORTRAN/stencil-pretty.f90 @@ -285,17 +285,9 @@ program main call initialize_w(is_star,r,W) ! 
initialize the input and output arrays -#if defined(PGI) - forall (i=1:n, j=1:n) -#else do concurrent (i=1:n, j=1:n) -#endif A(i,j) = cx*(i-1)+cy*(j-1) -#if defined(PGI) - endforall -#else enddo -#endif !B(r+1:n-r,r+1:n-r) = 0 ! minimal B = 0 ! sufficient diff --git a/FORTRAN/transpose-pretty.f90 b/FORTRAN/transpose-pretty.f90 index ee6676401..31c88b378 100644 --- a/FORTRAN/transpose-pretty.f90 +++ b/FORTRAN/transpose-pretty.f90 @@ -73,11 +73,8 @@ program main real(kind=REAL64), allocatable :: B(:,:) ! buffer to hold transposed matrix integer(kind=INT64) :: bytes ! combined size of matrices ! runtime variables -#if defined(PGI) - integer(kind=INT32) :: i -#endif - integer(kind=INT32) :: k - integer(kind=INT64) :: j, o2 ! for loop over order**2 + integer(kind=INT32) :: i,j,k + integer(kind=INT64) :: j2, o2 ! for loop over order**2 real(kind=REAL64) :: abserr ! squared error real(kind=REAL64) :: t0, t1, trans_time, avgtime ! timing parameters real(kind=REAL64), parameter :: epsilon=1.D-8 ! error tolerance @@ -132,7 +129,7 @@ program main ! Fill the original matrix o2 = int(order,INT64)**2 - A = reshape((/ (j, j = 0,o2) /),(/order, order/)) + A = reshape((/ (j2, j2 = 0,o2) /),(/order, order/)) B = 0 t0 = 0 @@ -152,17 +149,10 @@ program main ! ******************************************************************** ! we reuse A here as the reference matrix, to compute the error - A = ( transpose(reshape((/ (j, j = 0,o2) /),(/order, order/))) & + A = ( transpose(reshape((/ (j2, j2 = 0,o2) /),(/order, order/))) & * real(iterations+1,REAL64) ) & + real((iterations*(iterations+1))/2,REAL64) -#if 0 && defined(PGI) - ! PGI generates a segfault here... 
- abserr = 0.0d0 - forall (j=1:order,i=1:order) - abserr = abserr + (B(i,j) - A(i,j))**2 - endforall - abserr = sqrt(abserr) -#elif defined(PGI) +#if defined(PGI) abserr = 0.0d0 do j=1,order do i=1,order diff --git a/FORTRAN/transpose.f90 b/FORTRAN/transpose.f90 index fdcbde105..d66d56715 100644 --- a/FORTRAN/transpose.f90 +++ b/FORTRAN/transpose.f90 @@ -172,10 +172,6 @@ program main #endif do jt=1,order,tile_size do it=1,order,tile_size -#elif defined(PGI) - ! PGI does not support DO CONCURRENT. - do jt=1,order,tile_size - do it=1,order,tile_size #else do concurrent (jt=1:order:tile_size) do concurrent (it=1:order:tile_size) @@ -196,11 +192,7 @@ program main !$omp do collapse(2) do j=1,order do i=1,order -#elif defined(PGI) - do j=1,order - do i=1,order #else - ! PGI does not support DO CONCURRENT. do concurrent (j=1:order) do concurrent (i=1:order) #endif @@ -242,9 +234,6 @@ program main #endif do jt=1,order,tile_size do it=1,order,tile_size -#elif defined(PGI) - do jt=1,order,tile_size - do it=1,order,tile_size #else do concurrent (jt=1:order:tile_size) do concurrent (it=1:order:tile_size) @@ -265,9 +254,6 @@ program main !$omp do collapse(2) do j=1,order do i=1,order -#elif defined(PGI) - do j=1,order - do i=1,order #else do concurrent (j=1:order) do concurrent (i=1:order) @@ -314,9 +300,6 @@ program main !$omp& reduction(+:abserr) do j=1,order do i=1,order -#elif defined(PGI) - do j=1,order - do i=1,order #else do concurrent (j=1:order) do concurrent (i=1:order) From 390d536fcb0dcc472b3230d67a21911e4c03abc6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 30 Apr 2019 19:02:24 -0700 Subject: [PATCH 170/245] pointless reordering of string --- FORTRAN/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 26226b648..d96f87cce 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -24,7 +24,7 @@ ifeq ($(findstring ifort,$(FC)),ifort) ifeq ($(shell uname -s),Darwin) EXTRA = taskloop else - EXTRA = 
target coarray taskloop + EXTRA = taskloop target coarray endif endif # GCC (also matches pgfortran so PGI must come after) From c37976c1959ce6c4d8baf5f43389b32ded1c2559 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 30 Apr 2019 19:02:49 -0700 Subject: [PATCH 171/245] add PGI support for IVDEP --- Cxx11/prk_simd.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cxx11/prk_simd.h b/Cxx11/prk_simd.h index 742bc4fcb..7daed0911 100644 --- a/Cxx11/prk_simd.h +++ b/Cxx11/prk_simd.h @@ -38,6 +38,8 @@ # define PRAGMA_SIMD PRAGMA(vector) PRAGMA(ivdep) // According to https://github.com/LLNL/RAJA/pull/310, this improves lambda performance # define PRAGMA_INLINE PRAGMA(forceinline recursive) +#elif defined(__PGI) +# define PRAGMA_SIMD PRAGMA(vector) PRAGMA(ivdep) #elif defined(__GNUC__) && defined(__GNUC_MINOR__) && ( ( (__GNUC__ == 4) && (__GNUC_MINOR__ == 9) ) || (__GNUC__ >= 5) ) # define PRAGMA_SIMD PRAGMA(GCC ivdep) # define PRAGMA_INLINE PRAGMA(inline) From f851bcc7ee43cd327beea4f8126c6b9ec562a821 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 30 Apr 2019 19:03:05 -0700 Subject: [PATCH 172/245] return value qualified is ignored --- Cxx11/prk_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 6638abb87..81ca5006f 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -84,7 +84,7 @@ namespace prk { - const int get_alignment(void) + int get_alignment(void) { /* a := alignment */ #ifdef PRK_ALIGNMENT From 9ecc6594e2380df62a4db6ef3debb8b2d420a2a1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 30 Apr 2019 19:03:23 -0700 Subject: [PATCH 173/245] TBB does not support PGI --- Cxx11/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 040fc6c7f..e050c7abd 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -65,8 +65,12 @@ ifeq ($(shell uname -s),Darwin) else EXTRA += target endif +ifneq ($(findstring pgc++,$(CXX)),pgc++) + 
@echo CXX=$(CXX) + EXTRA += tbb pstl +endif -all: sequential vector valarray openmp taskloop tbb stl pstl rangefor raja kokkos opencl sycl boost-compute $(EXTRA) +all: sequential vector valarray openmp taskloop stl rangefor kokkos opencl sycl boost-compute $(EXTRA) # raja #p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ From 0d811ddfe23562e0d0bcfd26598288e59d74a921 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 30 Apr 2019 19:05:02 -0700 Subject: [PATCH 174/245] update PGI example flags --- common/make.defs.pgi | 50 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/common/make.defs.pgi b/common/make.defs.pgi index 1205afff2..78447462d 100644 --- a/common/make.defs.pgi +++ b/common/make.defs.pgi @@ -9,7 +9,7 @@ CC=pgcc -c11 FC=pgfortran -Mpreprocess -Mfreeform #FC=/opt/llvm/pgi-flang/bin/flang -Mpreprocess -Wl,-rpath=/opt/llvm/pgi-flang/lib # C++11 may not be required but does no harm here. -CXX=pgc++ --c++11 +CXX=pgc++ --c++17 # # Compiler flags # @@ -18,9 +18,10 @@ DEFAULT_OPT_FLAGS=-O2 -tp=haswell # OpenMP flags # OPENMPFLAG=-mp #-Minfo=mp,vect -OFFLOADFLAG=-mp #-Minfo=mp,vect -#ORNLACCFLAG=-acc -ta=multicore -Minfo=accel -ORNLACCFLAG=-acc -ta=tesla:cc70 -Minfo=accel +OPENMPSIMDFLAG=-mp #-Minfo=mp,vect +OFFLOADFLAG=-mp -ta=multicore #-Minfo=mp,vect +ORNLACCFLAG=-acc -ta=multicore #-Minfo=accel +#ORNLACCFLAG=-acc -ta=tesla:cc70 -Minfo=accel ORNLACCFLAG+=-Mlarge_arrays # # OpenCL flags @@ -31,18 +32,45 @@ ORNLACCFLAG+=-Mlarge_arrays OPENCLDIR=/etc/alternatives/opencl-intel-tools OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL # +# SYCL flags +# +# triSYCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
+SYCLDIR=./triSYCL +SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) +SYCLFLAG=-I$(SYCLDIR)/include +# ProGTX +# https://github.com/ProGTX/sycl-gtx +#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx +#SYCLCXX=${CXX} ${OPENMPFLAG} +#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +# +# OCCA +# +#OCCADIR=${HOME}/prk-repo/Cxx11/occa +# +# TBB +# +TBBDIR=./tbb +TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +# # Parallel STL, Boost, etc. # BOOSTFLAG=-DUSE_BOOST -I. -PSTLFLAG=-DUSE_PSTL ${OPENMPFLAG} ${TBBFLAG} -KOKKOSDIR=./kokkos -KOKKOSFLAG=-DUSE_KOKKOS -I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -RAJADIR=./raja -RAJAFLAG=-DUSE_RAJA -I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} +KOKKOSDIR=/opt/kokkos/pgi +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl +RAJADIR=/opt/raja/pgi +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +THRUSTDIR=/opt/nvidia/thrust +THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # # CBLAS for C++ DGEMM # -CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +BLASFLAG= +CBLASFLAG= # # CUDA flags # @@ -78,4 +106,4 @@ CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED # MPI # # Needs PATH and LD_LIBRARY_PATH set appropriately... 
-MPICC=/opt/pgi/linux86-64/2017/mpi/openmpi/bin/mpicc +MPICC=/opt/pgi/linux86-64/2019/mpi/openmpi/bin/mpicc From d6f307191b0f2c99a92eefe9f3764c485ba0ae65 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 21 Mar 2019 20:29:54 -0700 Subject: [PATCH 175/245] work around Clang FE issue --- Cxx11/stencil-openmp-target.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Cxx11/stencil-openmp-target.cc b/Cxx11/stencil-openmp-target.cc index 76a4fab3c..62c5b73b0 100644 --- a/Cxx11/stencil-openmp-target.cc +++ b/Cxx11/stencil-openmp-target.cc @@ -65,11 +65,8 @@ void nothing(const int n, const int t, const double * RESTRICT in, double * RESTRICT out) { - std::cout << "You are trying to use a stencil that does not exist." << std::endl; - std::cout << "Please generate the new stencil using the code generator." << std::endl; - // n will never be zero - this is to silence compiler warnings. - if (n==0) std::cout << in << out << std::endl; - std::abort(); + // use arguments to silence compiler warnings + out[0] = in[0] + n + t; } int main(int argc, char* argv[]) From a3eadc2f5bd7e15a3a4955e33e5787b343b23eea Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 1 May 2019 14:57:27 -0700 Subject: [PATCH 176/245] fix errors --- Cxx11/Makefile | 1 - Cxx11/prk_util.h | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index e050c7abd..b166d65d4 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -66,7 +66,6 @@ else EXTRA += target endif ifneq ($(findstring pgc++,$(CXX)),pgc++) - @echo CXX=$(CXX) EXTRA += tbb pstl endif diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 81ca5006f..d2caae1b7 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -101,7 +101,7 @@ namespace prk { #if defined(__INTEL_COMPILER) template - T * malloc(size_t n) + T * malloc(size_t n) { const int alignment = prk::get_alignment(); const size_t bytes = n * sizeof(T); @@ -109,7 +109,7 @@ namespace prk { } template - void free(T * 
p) + void free(T * p) { _mm_free(p); p = nullptr; From bd2dc5f1f21022eca5b3af670e5f0d06bdb3a68d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 16 Apr 2019 15:27:28 -0700 Subject: [PATCH 177/245] add hyperplane OpenMP to C1z --- C1z/Makefile | 2 +- C1z/p2p-hyperplane-openmp.c | 201 ++++++++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 C1z/p2p-hyperplane-openmp.c diff --git a/C1z/Makefile b/C1z/Makefile index aac123acc..20619ccaf 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -49,7 +49,7 @@ serial: nstream p2p p2p-innerloop stencil transpose thread: transpose-thread -openmp: nstream-openmp p2p-simd-openmp p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp +openmp: nstream-openmp p2p-simd-openmp p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp p2p-hyperplane-openmp mpi: nstream-mpi diff --git a/C1z/p2p-hyperplane-openmp.c b/C1z/p2p-hyperplane-openmp.c new file mode 100644 index 000000000..a24d73f89 --- /dev/null +++ b/C1z/p2p-hyperplane-openmp.c @@ -0,0 +1,201 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Pipeline +/// +/// PURPOSE: This program tests the efficiency with which point-to-point +/// synchronization can be carried out. It does so by executing +/// a pipelined algorithm on an n^2 grid. The first array dimension +/// is distributed among the threads (stripwise decomposition). +/// +/// USAGE: The program takes as input the +/// dimensions of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following +/// functions are used in this program: +/// +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - C99-ification by Jeff Hammond, February 2016. +/// - C11-ification by Jeff Hammond, June 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "p2p-kernel.h" + +int main(int argc, char* argv[]) +{ + printf("Parallel Research Kernels version %.2f\n", PRKVERSION); +#ifdef _OPENMP + printf("C11/OpenMP HYPERPLANE pipeline execution on 2D grid\n"); +#else + printf("C11/Serial HYPERPLANE pipeline execution on 2D grid\n"); +#endif + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> \n"); + return 1; + } + + // number of times to run the pipeline algorithm + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // grid dimensions + int n = atoi(argv[2]); + if (n < 1) { + printf("ERROR: grid dimension must be positive: %d\n", n); + return 1; + } + + // grid chunk dimensions + int nc = (argc > 3) ? 
atoi(argv[3]) : 1; + nc = MAX(1,nc); + nc = MIN(n,nc); + + // number of grid blocks + int nb = (n-1)/nc; + if ((n-1)%nc) nb++; + +#ifdef _OPENMP + printf("Number of threads (max) = %d\n", omp_get_max_threads()); +#endif + printf("Number of iterations = %d\n", iterations); + printf("Grid sizes = %d,%d\n", n, n); + printf("Grid chunk sizes, blocks = %d,%d\n", nc, nb); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double pipeline_time = 0.0; // silence compiler warning + + size_t bytes = n*n*sizeof(double); + double * restrict grid = prk_malloc(bytes); + + OMP_PARALLEL() + { + OMP_FOR() + for (int i=0; i epsilon) { + printf("ERROR: checksum %lf does not match verification value %lf\n", grid[(n-1)*n+(n-1)], corner_val); + return 1; + } + + prk_free(grid); + +#ifdef VERBOSE + printf("Solution validates; verification value = %lf\n", corner_val ); +#else + printf("Solution validates\n" ); +#endif + double avgtime = pipeline_time/iterations; + printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 2.0e-6 * ( (n-1)*(n-1) )/avgtime, avgtime ); + + return 0; +} From 0f390e9cd21cfdd31d7be7208ff8bb424ebb66b0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 1 May 2019 15:10:51 -0700 Subject: [PATCH 178/245] add hyperplane to make and travis --- C1z/Makefile | 3 +++ travis/build-run-prk.sh | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/C1z/Makefile b/C1z/Makefile index 20619ccaf..0bcd9a95f 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -68,6 +68,9 @@ ispc: transpose-ispc p2p-innerloop: p2p-innerloop-openmp.c prk_util.h $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ +p2p-hyperplane: p2p-hyperplane-openmp.c prk_util.h + $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ + %-mpi: %-mpi.c prk_util.h $(MPICC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 194e7ca51..6f9ced772 100755 
--- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -152,6 +152,8 @@ case "$PRK_TARGET" in $PRK_TARGET_PATH/p2p 10 1024 1024 $PRK_TARGET_PATH/p2p 10 1024 1024 100 100 $PRK_TARGET_PATH/p2p-innerloop 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane 10 1024 32 $PRK_TARGET_PATH/stencil 10 1000 $PRK_TARGET_PATH/transpose 10 1024 32 #echo "Test stencil code generator" @@ -174,6 +176,8 @@ case "$PRK_TARGET" in ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 $PRK_TARGET_PATH/p2p-innerloop-openmp 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 32 $PRK_TARGET_PATH/stencil-openmp 10 1000 $PRK_TARGET_PATH/transpose-openmp 10 1024 32 #echo "Test stencil code generator" From 78247f4935d6de290ba846da478fa9332d2d084a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 2 May 2019 09:36:53 -0700 Subject: [PATCH 179/245] fix Fortran OpenMP p2p tasks --- FORTRAN/p2p-tasks-openmp.f90 | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/FORTRAN/p2p-tasks-openmp.f90 b/FORTRAN/p2p-tasks-openmp.f90 index 74c7dcd90..fcebe0c82 100644 --- a/FORTRAN/p2p-tasks-openmp.f90 +++ b/FORTRAN/p2p-tasks-openmp.f90 @@ -173,12 +173,17 @@ program main enddo enddo - do j=1,n + !$omp task private(j) firstprivate(n) shared(grid) + do j=2,n grid(1,j) = real(j-1,REAL64) enddo - do i=1,m + !$omp end task + !$omp task private(i) firstprivate(m) shared(grid) + do i=2,m grid(i,1) = real(i-1,REAL64) enddo + !$omp end task + !$omp taskwait do k=0,iterations From 24972d2b4ee7a4842fb626c962f9126874be47b3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 6 May 2019 08:48:04 -0700 Subject: [PATCH 180/245] fix Travis and regular builds --- C1z/Makefile | 2 +- travis/build-run-prk.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/C1z/Makefile b/C1z/Makefile index 
0bcd9a95f..535ed9eaa 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -45,7 +45,7 @@ endif all: serial thread openmp taskloop $(EXTRA) -serial: nstream p2p p2p-innerloop stencil transpose +serial: nstream p2p p2p-innerloop p2p-hyperplane stencil transpose thread: transpose-thread diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 6f9ced772..962ecc1f4 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -148,7 +148,7 @@ case "$PRK_TARGET" in echo "EXTRA_CLIBS=-lm -lpthread" >> common/make.defs # C11 without external parallelism - ${MAKE} -C $PRK_TARGET_PATH p2p stencil transpose p2p-innerloop + ${MAKE} -C $PRK_TARGET_PATH p2p stencil transpose p2p-innerloop p2p-hyperplane $PRK_TARGET_PATH/p2p 10 1024 1024 $PRK_TARGET_PATH/p2p 10 1024 1024 100 100 $PRK_TARGET_PATH/p2p-innerloop 10 1024 From bcf4e4d29e366845f2a6dfc245fb21624bbe7fe7 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 30 Nov 2017 11:41:21 -0800 Subject: [PATCH 181/245] coefficients need to be double precision --- FORTRAN/generate-fortran-stencil.py | 2 +- FORTRAN/stencil_openmp.f90 | 2196 +++++++++++++------------- FORTRAN/stencil_pretty.f90 | 2160 +++++++++++++------------- FORTRAN/stencil_serial.f90 | 2160 +++++++++++++------------- FORTRAN/stencil_target.f90 | 2214 +++++++++++++-------------- FORTRAN/stencil_taskloop.f90 | 2160 +++++++++++++------------- 6 files changed, 5455 insertions(+), 5437 deletions(-) diff --git a/FORTRAN/generate-fortran-stencil.py b/FORTRAN/generate-fortran-stencil.py index 0f915163e..aabf96dcb 100755 --- a/FORTRAN/generate-fortran-stencil.py +++ b/FORTRAN/generate-fortran-stencil.py @@ -40,7 +40,7 @@ def codegen(src,pattern,stencil_size,radius,W,model): if i-radius>=0: opi='+' else: opi='' if ( W[j][i] != 0.0): - src.write(' + in(i'+opi+str(i-radius)+',j'+opj+str(j-radius)+') * ('+str(W[j][i])+') &\n') + src.write(' + in(i'+opi+str(i-radius)+',j'+opj+str(j-radius)+') * ('+str(W[j][i])+'d0) &\n') src.write('+0.0\n') src.write(' end do\n') 
if (model=='openmp' or model=='target' or model=='taskloop'): diff --git a/FORTRAN/stencil_openmp.f90 b/FORTRAN/stencil_openmp.f90 index e5e0a1ce9..0b5ea36bd 100644 --- a/FORTRAN/stencil_openmp.f90 +++ b/FORTRAN/stencil_openmp.f90 @@ -8,12 +8,14 @@ subroutine star1(n, in, out) !$omp do do i=1,n-1-1 !$omp simd + do j=1,n-1-1 + do i=1,n-1-1 do j=1,n-1-1 out(i,j) = out(i,j) & - + in(i+0,j-1) * (-0.5) & - + in(i-1,j+0) * (-0.5) & - + in(i+1,j+0) * (0.5) & - + in(i+0,j+1) * (0.5) & + + in(i+0,j-1) * (-0.5d0) & + + in(i-1,j+0) * (-0.5d0) & + + in(i+1,j+0) * (0.5d0) & + + in(i+0,j+1) * (0.5d0) & +0.0 end do !$omp end simd @@ -31,16 +33,18 @@ subroutine star2(n, in, out) !$omp do do i=2,n-2-1 !$omp simd + do j=2,n-2-1 + do i=2,n-2-1 do j=2,n-2-1 out(i,j) = out(i,j) & - + in(i+0,j-2) * (-0.125) & - + in(i+0,j-1) * (-0.25) & - + in(i-2,j+0) * (-0.125) & - + in(i-1,j+0) * (-0.25) & - + in(i+1,j+0) * (0.25) & - + in(i+2,j+0) * (0.125) & - + in(i+0,j+1) * (0.25) & - + in(i+0,j+2) * (0.125) & + + in(i+0,j-2) * (-0.125d0) & + + in(i+0,j-1) * (-0.25d0) & + + in(i-2,j+0) * (-0.125d0) & + + in(i-1,j+0) * (-0.25d0) & + + in(i+1,j+0) * (0.25d0) & + + in(i+2,j+0) * (0.125d0) & + + in(i+0,j+1) * (0.25d0) & + + in(i+0,j+2) * (0.125d0) & +0.0 end do !$omp end simd @@ -58,20 +62,22 @@ subroutine star3(n, in, out) !$omp do do i=3,n-3-1 !$omp simd + do j=3,n-3-1 + do i=3,n-3-1 do j=3,n-3-1 out(i,j) = out(i,j) & - + in(i+0,j-3) * (-0.05555555555555555) & - + in(i+0,j-2) * (-0.08333333333333333) & - + in(i+0,j-1) * (-0.16666666666666666) & - + in(i-3,j+0) * (-0.05555555555555555) & - + in(i-2,j+0) * (-0.08333333333333333) & - + in(i-1,j+0) * (-0.16666666666666666) & - + in(i+1,j+0) * (0.16666666666666666) & - + in(i+2,j+0) * (0.08333333333333333) & - + in(i+3,j+0) * (0.05555555555555555) & - + in(i+0,j+1) * (0.16666666666666666) & - + in(i+0,j+2) * (0.08333333333333333) & - + in(i+0,j+3) * (0.05555555555555555) & + + in(i+0,j-3) * (-0.05555555555555555d0) & + + in(i+0,j-2) * 
(-0.08333333333333333d0) & + + in(i+0,j-1) * (-0.16666666666666666d0) & + + in(i-3,j+0) * (-0.05555555555555555d0) & + + in(i-2,j+0) * (-0.08333333333333333d0) & + + in(i-1,j+0) * (-0.16666666666666666d0) & + + in(i+1,j+0) * (0.16666666666666666d0) & + + in(i+2,j+0) * (0.08333333333333333d0) & + + in(i+3,j+0) * (0.05555555555555555d0) & + + in(i+0,j+1) * (0.16666666666666666d0) & + + in(i+0,j+2) * (0.08333333333333333d0) & + + in(i+0,j+3) * (0.05555555555555555d0) & +0.0 end do !$omp end simd @@ -89,24 +95,26 @@ subroutine star4(n, in, out) !$omp do do i=4,n-4-1 !$omp simd + do j=4,n-4-1 + do i=4,n-4-1 do j=4,n-4-1 out(i,j) = out(i,j) & - + in(i+0,j-4) * (-0.03125) & - + in(i+0,j-3) * (-0.041666666666666664) & - + in(i+0,j-2) * (-0.0625) & - + in(i+0,j-1) * (-0.125) & - + in(i-4,j+0) * (-0.03125) & - + in(i-3,j+0) * (-0.041666666666666664) & - + in(i-2,j+0) * (-0.0625) & - + in(i-1,j+0) * (-0.125) & - + in(i+1,j+0) * (0.125) & - + in(i+2,j+0) * (0.0625) & - + in(i+3,j+0) * (0.041666666666666664) & - + in(i+4,j+0) * (0.03125) & - + in(i+0,j+1) * (0.125) & - + in(i+0,j+2) * (0.0625) & - + in(i+0,j+3) * (0.041666666666666664) & - + in(i+0,j+4) * (0.03125) & + + in(i+0,j-4) * (-0.03125d0) & + + in(i+0,j-3) * (-0.041666666666666664d0) & + + in(i+0,j-2) * (-0.0625d0) & + + in(i+0,j-1) * (-0.125d0) & + + in(i-4,j+0) * (-0.03125d0) & + + in(i-3,j+0) * (-0.041666666666666664d0) & + + in(i-2,j+0) * (-0.0625d0) & + + in(i-1,j+0) * (-0.125d0) & + + in(i+1,j+0) * (0.125d0) & + + in(i+2,j+0) * (0.0625d0) & + + in(i+3,j+0) * (0.041666666666666664d0) & + + in(i+4,j+0) * (0.03125d0) & + + in(i+0,j+1) * (0.125d0) & + + in(i+0,j+2) * (0.0625d0) & + + in(i+0,j+3) * (0.041666666666666664d0) & + + in(i+0,j+4) * (0.03125d0) & +0.0 end do !$omp end simd @@ -124,28 +132,30 @@ subroutine star5(n, in, out) !$omp do do i=5,n-5-1 !$omp simd + do j=5,n-5-1 + do i=5,n-5-1 do j=5,n-5-1 out(i,j) = out(i,j) & - + in(i+0,j-5) * (-0.02) & - + in(i+0,j-4) * (-0.025) & - + in(i+0,j-3) * 
(-0.03333333333333333) & - + in(i+0,j-2) * (-0.05) & - + in(i+0,j-1) * (-0.1) & - + in(i-5,j+0) * (-0.02) & - + in(i-4,j+0) * (-0.025) & - + in(i-3,j+0) * (-0.03333333333333333) & - + in(i-2,j+0) * (-0.05) & - + in(i-1,j+0) * (-0.1) & - + in(i+1,j+0) * (0.1) & - + in(i+2,j+0) * (0.05) & - + in(i+3,j+0) * (0.03333333333333333) & - + in(i+4,j+0) * (0.025) & - + in(i+5,j+0) * (0.02) & - + in(i+0,j+1) * (0.1) & - + in(i+0,j+2) * (0.05) & - + in(i+0,j+3) * (0.03333333333333333) & - + in(i+0,j+4) * (0.025) & - + in(i+0,j+5) * (0.02) & + + in(i+0,j-5) * (-0.02d0) & + + in(i+0,j-4) * (-0.025d0) & + + in(i+0,j-3) * (-0.03333333333333333d0) & + + in(i+0,j-2) * (-0.05d0) & + + in(i+0,j-1) * (-0.1d0) & + + in(i-5,j+0) * (-0.02d0) & + + in(i-4,j+0) * (-0.025d0) & + + in(i-3,j+0) * (-0.03333333333333333d0) & + + in(i-2,j+0) * (-0.05d0) & + + in(i-1,j+0) * (-0.1d0) & + + in(i+1,j+0) * (0.1d0) & + + in(i+2,j+0) * (0.05d0) & + + in(i+3,j+0) * (0.03333333333333333d0) & + + in(i+4,j+0) * (0.025d0) & + + in(i+5,j+0) * (0.02d0) & + + in(i+0,j+1) * (0.1d0) & + + in(i+0,j+2) * (0.05d0) & + + in(i+0,j+3) * (0.03333333333333333d0) & + + in(i+0,j+4) * (0.025d0) & + + in(i+0,j+5) * (0.02d0) & +0.0 end do !$omp end simd @@ -163,32 +173,34 @@ subroutine star6(n, in, out) !$omp do do i=6,n-6-1 !$omp simd + do j=6,n-6-1 + do i=6,n-6-1 do j=6,n-6-1 out(i,j) = out(i,j) & - + in(i+0,j-6) * (-0.013888888888888888) & - + in(i+0,j-5) * (-0.016666666666666666) & - + in(i+0,j-4) * (-0.020833333333333332) & - + in(i+0,j-3) * (-0.027777777777777776) & - + in(i+0,j-2) * (-0.041666666666666664) & - + in(i+0,j-1) * (-0.08333333333333333) & - + in(i-6,j+0) * (-0.013888888888888888) & - + in(i-5,j+0) * (-0.016666666666666666) & - + in(i-4,j+0) * (-0.020833333333333332) & - + in(i-3,j+0) * (-0.027777777777777776) & - + in(i-2,j+0) * (-0.041666666666666664) & - + in(i-1,j+0) * (-0.08333333333333333) & - + in(i+1,j+0) * (0.08333333333333333) & - + in(i+2,j+0) * (0.041666666666666664) & - + in(i+3,j+0) * 
(0.027777777777777776) & - + in(i+4,j+0) * (0.020833333333333332) & - + in(i+5,j+0) * (0.016666666666666666) & - + in(i+6,j+0) * (0.013888888888888888) & - + in(i+0,j+1) * (0.08333333333333333) & - + in(i+0,j+2) * (0.041666666666666664) & - + in(i+0,j+3) * (0.027777777777777776) & - + in(i+0,j+4) * (0.020833333333333332) & - + in(i+0,j+5) * (0.016666666666666666) & - + in(i+0,j+6) * (0.013888888888888888) & + + in(i+0,j-6) * (-0.013888888888888888d0) & + + in(i+0,j-5) * (-0.016666666666666666d0) & + + in(i+0,j-4) * (-0.020833333333333332d0) & + + in(i+0,j-3) * (-0.027777777777777776d0) & + + in(i+0,j-2) * (-0.041666666666666664d0) & + + in(i+0,j-1) * (-0.08333333333333333d0) & + + in(i-6,j+0) * (-0.013888888888888888d0) & + + in(i-5,j+0) * (-0.016666666666666666d0) & + + in(i-4,j+0) * (-0.020833333333333332d0) & + + in(i-3,j+0) * (-0.027777777777777776d0) & + + in(i-2,j+0) * (-0.041666666666666664d0) & + + in(i-1,j+0) * (-0.08333333333333333d0) & + + in(i+1,j+0) * (0.08333333333333333d0) & + + in(i+2,j+0) * (0.041666666666666664d0) & + + in(i+3,j+0) * (0.027777777777777776d0) & + + in(i+4,j+0) * (0.020833333333333332d0) & + + in(i+5,j+0) * (0.016666666666666666d0) & + + in(i+6,j+0) * (0.013888888888888888d0) & + + in(i+0,j+1) * (0.08333333333333333d0) & + + in(i+0,j+2) * (0.041666666666666664d0) & + + in(i+0,j+3) * (0.027777777777777776d0) & + + in(i+0,j+4) * (0.020833333333333332d0) & + + in(i+0,j+5) * (0.016666666666666666d0) & + + in(i+0,j+6) * (0.013888888888888888d0) & +0.0 end do !$omp end simd @@ -206,36 +218,38 @@ subroutine star7(n, in, out) !$omp do do i=7,n-7-1 !$omp simd + do j=7,n-7-1 + do i=7,n-7-1 do j=7,n-7-1 out(i,j) = out(i,j) & - + in(i+0,j-7) * (-0.01020408163265306) & - + in(i+0,j-6) * (-0.011904761904761904) & - + in(i+0,j-5) * (-0.014285714285714285) & - + in(i+0,j-4) * (-0.017857142857142856) & - + in(i+0,j-3) * (-0.023809523809523808) & - + in(i+0,j-2) * (-0.03571428571428571) & - + in(i+0,j-1) * (-0.07142857142857142) & - + in(i-7,j+0) * 
(-0.01020408163265306) & - + in(i-6,j+0) * (-0.011904761904761904) & - + in(i-5,j+0) * (-0.014285714285714285) & - + in(i-4,j+0) * (-0.017857142857142856) & - + in(i-3,j+0) * (-0.023809523809523808) & - + in(i-2,j+0) * (-0.03571428571428571) & - + in(i-1,j+0) * (-0.07142857142857142) & - + in(i+1,j+0) * (0.07142857142857142) & - + in(i+2,j+0) * (0.03571428571428571) & - + in(i+3,j+0) * (0.023809523809523808) & - + in(i+4,j+0) * (0.017857142857142856) & - + in(i+5,j+0) * (0.014285714285714285) & - + in(i+6,j+0) * (0.011904761904761904) & - + in(i+7,j+0) * (0.01020408163265306) & - + in(i+0,j+1) * (0.07142857142857142) & - + in(i+0,j+2) * (0.03571428571428571) & - + in(i+0,j+3) * (0.023809523809523808) & - + in(i+0,j+4) * (0.017857142857142856) & - + in(i+0,j+5) * (0.014285714285714285) & - + in(i+0,j+6) * (0.011904761904761904) & - + in(i+0,j+7) * (0.01020408163265306) & + + in(i+0,j-7) * (-0.01020408163265306d0) & + + in(i+0,j-6) * (-0.011904761904761904d0) & + + in(i+0,j-5) * (-0.014285714285714285d0) & + + in(i+0,j-4) * (-0.017857142857142856d0) & + + in(i+0,j-3) * (-0.023809523809523808d0) & + + in(i+0,j-2) * (-0.03571428571428571d0) & + + in(i+0,j-1) * (-0.07142857142857142d0) & + + in(i-7,j+0) * (-0.01020408163265306d0) & + + in(i-6,j+0) * (-0.011904761904761904d0) & + + in(i-5,j+0) * (-0.014285714285714285d0) & + + in(i-4,j+0) * (-0.017857142857142856d0) & + + in(i-3,j+0) * (-0.023809523809523808d0) & + + in(i-2,j+0) * (-0.03571428571428571d0) & + + in(i-1,j+0) * (-0.07142857142857142d0) & + + in(i+1,j+0) * (0.07142857142857142d0) & + + in(i+2,j+0) * (0.03571428571428571d0) & + + in(i+3,j+0) * (0.023809523809523808d0) & + + in(i+4,j+0) * (0.017857142857142856d0) & + + in(i+5,j+0) * (0.014285714285714285d0) & + + in(i+6,j+0) * (0.011904761904761904d0) & + + in(i+7,j+0) * (0.01020408163265306d0) & + + in(i+0,j+1) * (0.07142857142857142d0) & + + in(i+0,j+2) * (0.03571428571428571d0) & + + in(i+0,j+3) * (0.023809523809523808d0) & + + in(i+0,j+4) * 
(0.017857142857142856d0) & + + in(i+0,j+5) * (0.014285714285714285d0) & + + in(i+0,j+6) * (0.011904761904761904d0) & + + in(i+0,j+7) * (0.01020408163265306d0) & +0.0 end do !$omp end simd @@ -253,40 +267,42 @@ subroutine star8(n, in, out) !$omp do do i=8,n-8-1 !$omp simd + do j=8,n-8-1 + do i=8,n-8-1 do j=8,n-8-1 out(i,j) = out(i,j) & - + in(i+0,j-8) * (-0.0078125) & - + in(i+0,j-7) * (-0.008928571428571428) & - + in(i+0,j-6) * (-0.010416666666666666) & - + in(i+0,j-5) * (-0.0125) & - + in(i+0,j-4) * (-0.015625) & - + in(i+0,j-3) * (-0.020833333333333332) & - + in(i+0,j-2) * (-0.03125) & - + in(i+0,j-1) * (-0.0625) & - + in(i-8,j+0) * (-0.0078125) & - + in(i-7,j+0) * (-0.008928571428571428) & - + in(i-6,j+0) * (-0.010416666666666666) & - + in(i-5,j+0) * (-0.0125) & - + in(i-4,j+0) * (-0.015625) & - + in(i-3,j+0) * (-0.020833333333333332) & - + in(i-2,j+0) * (-0.03125) & - + in(i-1,j+0) * (-0.0625) & - + in(i+1,j+0) * (0.0625) & - + in(i+2,j+0) * (0.03125) & - + in(i+3,j+0) * (0.020833333333333332) & - + in(i+4,j+0) * (0.015625) & - + in(i+5,j+0) * (0.0125) & - + in(i+6,j+0) * (0.010416666666666666) & - + in(i+7,j+0) * (0.008928571428571428) & - + in(i+8,j+0) * (0.0078125) & - + in(i+0,j+1) * (0.0625) & - + in(i+0,j+2) * (0.03125) & - + in(i+0,j+3) * (0.020833333333333332) & - + in(i+0,j+4) * (0.015625) & - + in(i+0,j+5) * (0.0125) & - + in(i+0,j+6) * (0.010416666666666666) & - + in(i+0,j+7) * (0.008928571428571428) & - + in(i+0,j+8) * (0.0078125) & + + in(i+0,j-8) * (-0.0078125d0) & + + in(i+0,j-7) * (-0.008928571428571428d0) & + + in(i+0,j-6) * (-0.010416666666666666d0) & + + in(i+0,j-5) * (-0.0125d0) & + + in(i+0,j-4) * (-0.015625d0) & + + in(i+0,j-3) * (-0.020833333333333332d0) & + + in(i+0,j-2) * (-0.03125d0) & + + in(i+0,j-1) * (-0.0625d0) & + + in(i-8,j+0) * (-0.0078125d0) & + + in(i-7,j+0) * (-0.008928571428571428d0) & + + in(i-6,j+0) * (-0.010416666666666666d0) & + + in(i-5,j+0) * (-0.0125d0) & + + in(i-4,j+0) * (-0.015625d0) & + + in(i-3,j+0) * 
(-0.020833333333333332d0) & + + in(i-2,j+0) * (-0.03125d0) & + + in(i-1,j+0) * (-0.0625d0) & + + in(i+1,j+0) * (0.0625d0) & + + in(i+2,j+0) * (0.03125d0) & + + in(i+3,j+0) * (0.020833333333333332d0) & + + in(i+4,j+0) * (0.015625d0) & + + in(i+5,j+0) * (0.0125d0) & + + in(i+6,j+0) * (0.010416666666666666d0) & + + in(i+7,j+0) * (0.008928571428571428d0) & + + in(i+8,j+0) * (0.0078125d0) & + + in(i+0,j+1) * (0.0625d0) & + + in(i+0,j+2) * (0.03125d0) & + + in(i+0,j+3) * (0.020833333333333332d0) & + + in(i+0,j+4) * (0.015625d0) & + + in(i+0,j+5) * (0.0125d0) & + + in(i+0,j+6) * (0.010416666666666666d0) & + + in(i+0,j+7) * (0.008928571428571428d0) & + + in(i+0,j+8) * (0.0078125d0) & +0.0 end do !$omp end simd @@ -304,44 +320,46 @@ subroutine star9(n, in, out) !$omp do do i=9,n-9-1 !$omp simd + do j=9,n-9-1 + do i=9,n-9-1 do j=9,n-9-1 out(i,j) = out(i,j) & - + in(i+0,j-9) * (-0.006172839506172839) & - + in(i+0,j-8) * (-0.006944444444444444) & - + in(i+0,j-7) * (-0.007936507936507936) & - + in(i+0,j-6) * (-0.009259259259259259) & - + in(i+0,j-5) * (-0.011111111111111112) & - + in(i+0,j-4) * (-0.013888888888888888) & - + in(i+0,j-3) * (-0.018518518518518517) & - + in(i+0,j-2) * (-0.027777777777777776) & - + in(i+0,j-1) * (-0.05555555555555555) & - + in(i-9,j+0) * (-0.006172839506172839) & - + in(i-8,j+0) * (-0.006944444444444444) & - + in(i-7,j+0) * (-0.007936507936507936) & - + in(i-6,j+0) * (-0.009259259259259259) & - + in(i-5,j+0) * (-0.011111111111111112) & - + in(i-4,j+0) * (-0.013888888888888888) & - + in(i-3,j+0) * (-0.018518518518518517) & - + in(i-2,j+0) * (-0.027777777777777776) & - + in(i-1,j+0) * (-0.05555555555555555) & - + in(i+1,j+0) * (0.05555555555555555) & - + in(i+2,j+0) * (0.027777777777777776) & - + in(i+3,j+0) * (0.018518518518518517) & - + in(i+4,j+0) * (0.013888888888888888) & - + in(i+5,j+0) * (0.011111111111111112) & - + in(i+6,j+0) * (0.009259259259259259) & - + in(i+7,j+0) * (0.007936507936507936) & - + in(i+8,j+0) * (0.006944444444444444) & - + 
in(i+9,j+0) * (0.006172839506172839) & - + in(i+0,j+1) * (0.05555555555555555) & - + in(i+0,j+2) * (0.027777777777777776) & - + in(i+0,j+3) * (0.018518518518518517) & - + in(i+0,j+4) * (0.013888888888888888) & - + in(i+0,j+5) * (0.011111111111111112) & - + in(i+0,j+6) * (0.009259259259259259) & - + in(i+0,j+7) * (0.007936507936507936) & - + in(i+0,j+8) * (0.006944444444444444) & - + in(i+0,j+9) * (0.006172839506172839) & + + in(i+0,j-9) * (-0.006172839506172839d0) & + + in(i+0,j-8) * (-0.006944444444444444d0) & + + in(i+0,j-7) * (-0.007936507936507936d0) & + + in(i+0,j-6) * (-0.009259259259259259d0) & + + in(i+0,j-5) * (-0.011111111111111112d0) & + + in(i+0,j-4) * (-0.013888888888888888d0) & + + in(i+0,j-3) * (-0.018518518518518517d0) & + + in(i+0,j-2) * (-0.027777777777777776d0) & + + in(i+0,j-1) * (-0.05555555555555555d0) & + + in(i-9,j+0) * (-0.006172839506172839d0) & + + in(i-8,j+0) * (-0.006944444444444444d0) & + + in(i-7,j+0) * (-0.007936507936507936d0) & + + in(i-6,j+0) * (-0.009259259259259259d0) & + + in(i-5,j+0) * (-0.011111111111111112d0) & + + in(i-4,j+0) * (-0.013888888888888888d0) & + + in(i-3,j+0) * (-0.018518518518518517d0) & + + in(i-2,j+0) * (-0.027777777777777776d0) & + + in(i-1,j+0) * (-0.05555555555555555d0) & + + in(i+1,j+0) * (0.05555555555555555d0) & + + in(i+2,j+0) * (0.027777777777777776d0) & + + in(i+3,j+0) * (0.018518518518518517d0) & + + in(i+4,j+0) * (0.013888888888888888d0) & + + in(i+5,j+0) * (0.011111111111111112d0) & + + in(i+6,j+0) * (0.009259259259259259d0) & + + in(i+7,j+0) * (0.007936507936507936d0) & + + in(i+8,j+0) * (0.006944444444444444d0) & + + in(i+9,j+0) * (0.006172839506172839d0) & + + in(i+0,j+1) * (0.05555555555555555d0) & + + in(i+0,j+2) * (0.027777777777777776d0) & + + in(i+0,j+3) * (0.018518518518518517d0) & + + in(i+0,j+4) * (0.013888888888888888d0) & + + in(i+0,j+5) * (0.011111111111111112d0) & + + in(i+0,j+6) * (0.009259259259259259d0) & + + in(i+0,j+7) * (0.007936507936507936d0) & + + in(i+0,j+8) * 
(0.006944444444444444d0) & + + in(i+0,j+9) * (0.006172839506172839d0) & +0.0 end do !$omp end simd @@ -359,12 +377,14 @@ subroutine grid1(n, in, out) !$omp do do i=1,n-1-1 !$omp simd + do j=1,n-1-1 + do i=1,n-1-1 do j=1,n-1-1 out(i,j) = out(i,j) & - + in(i-1,j-1) * (-0.25) & - + in(i+1,j-1) * (-0.25) & - + in(i-1,j+1) * (-0.25) & - + in(i+1,j+1) * (0.25) & + + in(i-1,j-1) * (-0.25d0) & + + in(i+1,j-1) * (-0.25d0) & + + in(i-1,j+1) * (-0.25d0) & + + in(i+1,j+1) * (0.25d0) & +0.0 end do !$omp end simd @@ -382,22 +402,24 @@ subroutine grid2(n, in, out) !$omp do do i=2,n-2-1 !$omp simd + do j=2,n-2-1 + do i=2,n-2-1 do j=2,n-2-1 out(i,j) = out(i,j) & - + in(i-2,j-2) * (-0.0625) & - + in(i+1,j-2) * (-0.020833333333333332) & - + in(i+2,j-2) * (-0.020833333333333332) & - + in(i-1,j-1) * (-0.125) & - + in(i+1,j-1) * (-0.125) & - + in(i+2,j-1) * (-0.125) & - + in(i-2,j+1) * (-0.020833333333333332) & - + in(i-1,j+1) * (-0.125) & - + in(i+1,j+1) * (0.125) & - + in(i+2,j+1) * (0.020833333333333332) & - + in(i-2,j+2) * (-0.020833333333333332) & - + in(i-1,j+2) * (-0.125) & - + in(i+1,j+2) * (0.020833333333333332) & - + in(i+2,j+2) * (0.0625) & + + in(i-2,j-2) * (-0.0625d0) & + + in(i+1,j-2) * (-0.020833333333333332d0) & + + in(i+2,j-2) * (-0.020833333333333332d0) & + + in(i-1,j-1) * (-0.125d0) & + + in(i+1,j-1) * (-0.125d0) & + + in(i+2,j-1) * (-0.125d0) & + + in(i-2,j+1) * (-0.020833333333333332d0) & + + in(i-1,j+1) * (-0.125d0) & + + in(i+1,j+1) * (0.125d0) & + + in(i+2,j+1) * (0.020833333333333332d0) & + + in(i-2,j+2) * (-0.020833333333333332d0) & + + in(i-1,j+2) * (-0.125d0) & + + in(i+1,j+2) * (0.020833333333333332d0) & + + in(i+2,j+2) * (0.0625d0) & +0.0 end do !$omp end simd @@ -415,38 +437,40 @@ subroutine grid3(n, in, out) !$omp do do i=3,n-3-1 !$omp simd + do j=3,n-3-1 + do i=3,n-3-1 do j=3,n-3-1 out(i,j) = out(i,j) & - + in(i-3,j-3) * (-0.027777777777777776) & - + in(i+1,j-3) * (-0.005555555555555556) & - + in(i+2,j-3) * (-0.005555555555555556) & - + in(i+3,j-3) * 
(-0.005555555555555556) & - + in(i-2,j-2) * (-0.041666666666666664) & - + in(i+1,j-2) * (-0.013888888888888888) & - + in(i+2,j-2) * (-0.013888888888888888) & - + in(i+3,j-2) * (-0.013888888888888888) & - + in(i-1,j-1) * (-0.08333333333333333) & - + in(i+1,j-1) * (-0.08333333333333333) & - + in(i+2,j-1) * (-0.08333333333333333) & - + in(i+3,j-1) * (-0.08333333333333333) & - + in(i-3,j+1) * (-0.005555555555555556) & - + in(i-2,j+1) * (-0.013888888888888888) & - + in(i-1,j+1) * (-0.08333333333333333) & - + in(i+1,j+1) * (0.08333333333333333) & - + in(i+2,j+1) * (0.013888888888888888) & - + in(i+3,j+1) * (0.005555555555555556) & - + in(i-3,j+2) * (-0.005555555555555556) & - + in(i-2,j+2) * (-0.013888888888888888) & - + in(i-1,j+2) * (-0.08333333333333333) & - + in(i+1,j+2) * (0.013888888888888888) & - + in(i+2,j+2) * (0.041666666666666664) & - + in(i+3,j+2) * (0.005555555555555556) & - + in(i-3,j+3) * (-0.005555555555555556) & - + in(i-2,j+3) * (-0.013888888888888888) & - + in(i-1,j+3) * (-0.08333333333333333) & - + in(i+1,j+3) * (0.005555555555555556) & - + in(i+2,j+3) * (0.005555555555555556) & - + in(i+3,j+3) * (0.027777777777777776) & + + in(i-3,j-3) * (-0.027777777777777776d0) & + + in(i+1,j-3) * (-0.005555555555555556d0) & + + in(i+2,j-3) * (-0.005555555555555556d0) & + + in(i+3,j-3) * (-0.005555555555555556d0) & + + in(i-2,j-2) * (-0.041666666666666664d0) & + + in(i+1,j-2) * (-0.013888888888888888d0) & + + in(i+2,j-2) * (-0.013888888888888888d0) & + + in(i+3,j-2) * (-0.013888888888888888d0) & + + in(i-1,j-1) * (-0.08333333333333333d0) & + + in(i+1,j-1) * (-0.08333333333333333d0) & + + in(i+2,j-1) * (-0.08333333333333333d0) & + + in(i+3,j-1) * (-0.08333333333333333d0) & + + in(i-3,j+1) * (-0.005555555555555556d0) & + + in(i-2,j+1) * (-0.013888888888888888d0) & + + in(i-1,j+1) * (-0.08333333333333333d0) & + + in(i+1,j+1) * (0.08333333333333333d0) & + + in(i+2,j+1) * (0.013888888888888888d0) & + + in(i+3,j+1) * (0.005555555555555556d0) & + + in(i-3,j+2) * 
(-0.005555555555555556d0) & + + in(i-2,j+2) * (-0.013888888888888888d0) & + + in(i-1,j+2) * (-0.08333333333333333d0) & + + in(i+1,j+2) * (0.013888888888888888d0) & + + in(i+2,j+2) * (0.041666666666666664d0) & + + in(i+3,j+2) * (0.005555555555555556d0) & + + in(i-3,j+3) * (-0.005555555555555556d0) & + + in(i-2,j+3) * (-0.013888888888888888d0) & + + in(i-1,j+3) * (-0.08333333333333333d0) & + + in(i+1,j+3) * (0.005555555555555556d0) & + + in(i+2,j+3) * (0.005555555555555556d0) & + + in(i+3,j+3) * (0.027777777777777776d0) & +0.0 end do !$omp end simd @@ -464,60 +488,62 @@ subroutine grid4(n, in, out) !$omp do do i=4,n-4-1 !$omp simd + do j=4,n-4-1 + do i=4,n-4-1 do j=4,n-4-1 out(i,j) = out(i,j) & - + in(i-4,j-4) * (-0.015625) & - + in(i+1,j-4) * (-0.002232142857142857) & - + in(i+2,j-4) * (-0.002232142857142857) & - + in(i+3,j-4) * (-0.002232142857142857) & - + in(i+4,j-4) * (-0.002232142857142857) & - + in(i-3,j-3) * (-0.020833333333333332) & - + in(i+1,j-3) * (-0.004166666666666667) & - + in(i+2,j-3) * (-0.004166666666666667) & - + in(i+3,j-3) * (-0.004166666666666667) & - + in(i+4,j-3) * (-0.004166666666666667) & - + in(i-2,j-2) * (-0.03125) & - + in(i+1,j-2) * (-0.010416666666666666) & - + in(i+2,j-2) * (-0.010416666666666666) & - + in(i+3,j-2) * (-0.010416666666666666) & - + in(i+4,j-2) * (-0.010416666666666666) & - + in(i-1,j-1) * (-0.0625) & - + in(i+1,j-1) * (-0.0625) & - + in(i+2,j-1) * (-0.0625) & - + in(i+3,j-1) * (-0.0625) & - + in(i+4,j-1) * (-0.0625) & - + in(i-4,j+1) * (-0.002232142857142857) & - + in(i-3,j+1) * (-0.004166666666666667) & - + in(i-2,j+1) * (-0.010416666666666666) & - + in(i-1,j+1) * (-0.0625) & - + in(i+1,j+1) * (0.0625) & - + in(i+2,j+1) * (0.010416666666666666) & - + in(i+3,j+1) * (0.004166666666666667) & - + in(i+4,j+1) * (0.002232142857142857) & - + in(i-4,j+2) * (-0.002232142857142857) & - + in(i-3,j+2) * (-0.004166666666666667) & - + in(i-2,j+2) * (-0.010416666666666666) & - + in(i-1,j+2) * (-0.0625) & - + in(i+1,j+2) * 
(0.010416666666666666) & - + in(i+2,j+2) * (0.03125) & - + in(i+3,j+2) * (0.004166666666666667) & - + in(i+4,j+2) * (0.002232142857142857) & - + in(i-4,j+3) * (-0.002232142857142857) & - + in(i-3,j+3) * (-0.004166666666666667) & - + in(i-2,j+3) * (-0.010416666666666666) & - + in(i-1,j+3) * (-0.0625) & - + in(i+1,j+3) * (0.004166666666666667) & - + in(i+2,j+3) * (0.004166666666666667) & - + in(i+3,j+3) * (0.020833333333333332) & - + in(i+4,j+3) * (0.002232142857142857) & - + in(i-4,j+4) * (-0.002232142857142857) & - + in(i-3,j+4) * (-0.004166666666666667) & - + in(i-2,j+4) * (-0.010416666666666666) & - + in(i-1,j+4) * (-0.0625) & - + in(i+1,j+4) * (0.002232142857142857) & - + in(i+2,j+4) * (0.002232142857142857) & - + in(i+3,j+4) * (0.002232142857142857) & - + in(i+4,j+4) * (0.015625) & + + in(i-4,j-4) * (-0.015625d0) & + + in(i+1,j-4) * (-0.002232142857142857d0) & + + in(i+2,j-4) * (-0.002232142857142857d0) & + + in(i+3,j-4) * (-0.002232142857142857d0) & + + in(i+4,j-4) * (-0.002232142857142857d0) & + + in(i-3,j-3) * (-0.020833333333333332d0) & + + in(i+1,j-3) * (-0.004166666666666667d0) & + + in(i+2,j-3) * (-0.004166666666666667d0) & + + in(i+3,j-3) * (-0.004166666666666667d0) & + + in(i+4,j-3) * (-0.004166666666666667d0) & + + in(i-2,j-2) * (-0.03125d0) & + + in(i+1,j-2) * (-0.010416666666666666d0) & + + in(i+2,j-2) * (-0.010416666666666666d0) & + + in(i+3,j-2) * (-0.010416666666666666d0) & + + in(i+4,j-2) * (-0.010416666666666666d0) & + + in(i-1,j-1) * (-0.0625d0) & + + in(i+1,j-1) * (-0.0625d0) & + + in(i+2,j-1) * (-0.0625d0) & + + in(i+3,j-1) * (-0.0625d0) & + + in(i+4,j-1) * (-0.0625d0) & + + in(i-4,j+1) * (-0.002232142857142857d0) & + + in(i-3,j+1) * (-0.004166666666666667d0) & + + in(i-2,j+1) * (-0.010416666666666666d0) & + + in(i-1,j+1) * (-0.0625d0) & + + in(i+1,j+1) * (0.0625d0) & + + in(i+2,j+1) * (0.010416666666666666d0) & + + in(i+3,j+1) * (0.004166666666666667d0) & + + in(i+4,j+1) * (0.002232142857142857d0) & + + in(i-4,j+2) * 
(-0.002232142857142857d0) & + + in(i-3,j+2) * (-0.004166666666666667d0) & + + in(i-2,j+2) * (-0.010416666666666666d0) & + + in(i-1,j+2) * (-0.0625d0) & + + in(i+1,j+2) * (0.010416666666666666d0) & + + in(i+2,j+2) * (0.03125d0) & + + in(i+3,j+2) * (0.004166666666666667d0) & + + in(i+4,j+2) * (0.002232142857142857d0) & + + in(i-4,j+3) * (-0.002232142857142857d0) & + + in(i-3,j+3) * (-0.004166666666666667d0) & + + in(i-2,j+3) * (-0.010416666666666666d0) & + + in(i-1,j+3) * (-0.0625d0) & + + in(i+1,j+3) * (0.004166666666666667d0) & + + in(i+2,j+3) * (0.004166666666666667d0) & + + in(i+3,j+3) * (0.020833333333333332d0) & + + in(i+4,j+3) * (0.002232142857142857d0) & + + in(i-4,j+4) * (-0.002232142857142857d0) & + + in(i-3,j+4) * (-0.004166666666666667d0) & + + in(i-2,j+4) * (-0.010416666666666666d0) & + + in(i-1,j+4) * (-0.0625d0) & + + in(i+1,j+4) * (0.002232142857142857d0) & + + in(i+2,j+4) * (0.002232142857142857d0) & + + in(i+3,j+4) * (0.002232142857142857d0) & + + in(i+4,j+4) * (0.015625d0) & +0.0 end do !$omp end simd @@ -535,88 +561,90 @@ subroutine grid5(n, in, out) !$omp do do i=5,n-5-1 !$omp simd + do j=5,n-5-1 + do i=5,n-5-1 do j=5,n-5-1 out(i,j) = out(i,j) & - + in(i-5,j-5) * (-0.01) & - + in(i+1,j-5) * (-0.0011111111111111111) & - + in(i+2,j-5) * (-0.0011111111111111111) & - + in(i+3,j-5) * (-0.0011111111111111111) & - + in(i+4,j-5) * (-0.0011111111111111111) & - + in(i+5,j-5) * (-0.0011111111111111111) & - + in(i-4,j-4) * (-0.0125) & - + in(i+1,j-4) * (-0.0017857142857142857) & - + in(i+2,j-4) * (-0.0017857142857142857) & - + in(i+3,j-4) * (-0.0017857142857142857) & - + in(i+4,j-4) * (-0.0017857142857142857) & - + in(i+5,j-4) * (-0.0017857142857142857) & - + in(i-3,j-3) * (-0.016666666666666666) & - + in(i+1,j-3) * (-0.0033333333333333335) & - + in(i+2,j-3) * (-0.0033333333333333335) & - + in(i+3,j-3) * (-0.0033333333333333335) & - + in(i+4,j-3) * (-0.0033333333333333335) & - + in(i+5,j-3) * (-0.0033333333333333335) & - + in(i-2,j-2) * (-0.025) & - + 
in(i+1,j-2) * (-0.008333333333333333) & - + in(i+2,j-2) * (-0.008333333333333333) & - + in(i+3,j-2) * (-0.008333333333333333) & - + in(i+4,j-2) * (-0.008333333333333333) & - + in(i+5,j-2) * (-0.008333333333333333) & - + in(i-1,j-1) * (-0.05) & - + in(i+1,j-1) * (-0.05) & - + in(i+2,j-1) * (-0.05) & - + in(i+3,j-1) * (-0.05) & - + in(i+4,j-1) * (-0.05) & - + in(i+5,j-1) * (-0.05) & - + in(i-5,j+1) * (-0.0011111111111111111) & - + in(i-4,j+1) * (-0.0017857142857142857) & - + in(i-3,j+1) * (-0.0033333333333333335) & - + in(i-2,j+1) * (-0.008333333333333333) & - + in(i-1,j+1) * (-0.05) & - + in(i+1,j+1) * (0.05) & - + in(i+2,j+1) * (0.008333333333333333) & - + in(i+3,j+1) * (0.0033333333333333335) & - + in(i+4,j+1) * (0.0017857142857142857) & - + in(i+5,j+1) * (0.0011111111111111111) & - + in(i-5,j+2) * (-0.0011111111111111111) & - + in(i-4,j+2) * (-0.0017857142857142857) & - + in(i-3,j+2) * (-0.0033333333333333335) & - + in(i-2,j+2) * (-0.008333333333333333) & - + in(i-1,j+2) * (-0.05) & - + in(i+1,j+2) * (0.008333333333333333) & - + in(i+2,j+2) * (0.025) & - + in(i+3,j+2) * (0.0033333333333333335) & - + in(i+4,j+2) * (0.0017857142857142857) & - + in(i+5,j+2) * (0.0011111111111111111) & - + in(i-5,j+3) * (-0.0011111111111111111) & - + in(i-4,j+3) * (-0.0017857142857142857) & - + in(i-3,j+3) * (-0.0033333333333333335) & - + in(i-2,j+3) * (-0.008333333333333333) & - + in(i-1,j+3) * (-0.05) & - + in(i+1,j+3) * (0.0033333333333333335) & - + in(i+2,j+3) * (0.0033333333333333335) & - + in(i+3,j+3) * (0.016666666666666666) & - + in(i+4,j+3) * (0.0017857142857142857) & - + in(i+5,j+3) * (0.0011111111111111111) & - + in(i-5,j+4) * (-0.0011111111111111111) & - + in(i-4,j+4) * (-0.0017857142857142857) & - + in(i-3,j+4) * (-0.0033333333333333335) & - + in(i-2,j+4) * (-0.008333333333333333) & - + in(i-1,j+4) * (-0.05) & - + in(i+1,j+4) * (0.0017857142857142857) & - + in(i+2,j+4) * (0.0017857142857142857) & - + in(i+3,j+4) * (0.0017857142857142857) & - + in(i+4,j+4) * (0.0125) & - 
+ in(i+5,j+4) * (0.0011111111111111111) & - + in(i-5,j+5) * (-0.0011111111111111111) & - + in(i-4,j+5) * (-0.0017857142857142857) & - + in(i-3,j+5) * (-0.0033333333333333335) & - + in(i-2,j+5) * (-0.008333333333333333) & - + in(i-1,j+5) * (-0.05) & - + in(i+1,j+5) * (0.0011111111111111111) & - + in(i+2,j+5) * (0.0011111111111111111) & - + in(i+3,j+5) * (0.0011111111111111111) & - + in(i+4,j+5) * (0.0011111111111111111) & - + in(i+5,j+5) * (0.01) & + + in(i-5,j-5) * (-0.01d0) & + + in(i+1,j-5) * (-0.0011111111111111111d0) & + + in(i+2,j-5) * (-0.0011111111111111111d0) & + + in(i+3,j-5) * (-0.0011111111111111111d0) & + + in(i+4,j-5) * (-0.0011111111111111111d0) & + + in(i+5,j-5) * (-0.0011111111111111111d0) & + + in(i-4,j-4) * (-0.0125d0) & + + in(i+1,j-4) * (-0.0017857142857142857d0) & + + in(i+2,j-4) * (-0.0017857142857142857d0) & + + in(i+3,j-4) * (-0.0017857142857142857d0) & + + in(i+4,j-4) * (-0.0017857142857142857d0) & + + in(i+5,j-4) * (-0.0017857142857142857d0) & + + in(i-3,j-3) * (-0.016666666666666666d0) & + + in(i+1,j-3) * (-0.0033333333333333335d0) & + + in(i+2,j-3) * (-0.0033333333333333335d0) & + + in(i+3,j-3) * (-0.0033333333333333335d0) & + + in(i+4,j-3) * (-0.0033333333333333335d0) & + + in(i+5,j-3) * (-0.0033333333333333335d0) & + + in(i-2,j-2) * (-0.025d0) & + + in(i+1,j-2) * (-0.008333333333333333d0) & + + in(i+2,j-2) * (-0.008333333333333333d0) & + + in(i+3,j-2) * (-0.008333333333333333d0) & + + in(i+4,j-2) * (-0.008333333333333333d0) & + + in(i+5,j-2) * (-0.008333333333333333d0) & + + in(i-1,j-1) * (-0.05d0) & + + in(i+1,j-1) * (-0.05d0) & + + in(i+2,j-1) * (-0.05d0) & + + in(i+3,j-1) * (-0.05d0) & + + in(i+4,j-1) * (-0.05d0) & + + in(i+5,j-1) * (-0.05d0) & + + in(i-5,j+1) * (-0.0011111111111111111d0) & + + in(i-4,j+1) * (-0.0017857142857142857d0) & + + in(i-3,j+1) * (-0.0033333333333333335d0) & + + in(i-2,j+1) * (-0.008333333333333333d0) & + + in(i-1,j+1) * (-0.05d0) & + + in(i+1,j+1) * (0.05d0) & + + in(i+2,j+1) * (0.008333333333333333d0) & + 
+ in(i+3,j+1) * (0.0033333333333333335d0) & + + in(i+4,j+1) * (0.0017857142857142857d0) & + + in(i+5,j+1) * (0.0011111111111111111d0) & + + in(i-5,j+2) * (-0.0011111111111111111d0) & + + in(i-4,j+2) * (-0.0017857142857142857d0) & + + in(i-3,j+2) * (-0.0033333333333333335d0) & + + in(i-2,j+2) * (-0.008333333333333333d0) & + + in(i-1,j+2) * (-0.05d0) & + + in(i+1,j+2) * (0.008333333333333333d0) & + + in(i+2,j+2) * (0.025d0) & + + in(i+3,j+2) * (0.0033333333333333335d0) & + + in(i+4,j+2) * (0.0017857142857142857d0) & + + in(i+5,j+2) * (0.0011111111111111111d0) & + + in(i-5,j+3) * (-0.0011111111111111111d0) & + + in(i-4,j+3) * (-0.0017857142857142857d0) & + + in(i-3,j+3) * (-0.0033333333333333335d0) & + + in(i-2,j+3) * (-0.008333333333333333d0) & + + in(i-1,j+3) * (-0.05d0) & + + in(i+1,j+3) * (0.0033333333333333335d0) & + + in(i+2,j+3) * (0.0033333333333333335d0) & + + in(i+3,j+3) * (0.016666666666666666d0) & + + in(i+4,j+3) * (0.0017857142857142857d0) & + + in(i+5,j+3) * (0.0011111111111111111d0) & + + in(i-5,j+4) * (-0.0011111111111111111d0) & + + in(i-4,j+4) * (-0.0017857142857142857d0) & + + in(i-3,j+4) * (-0.0033333333333333335d0) & + + in(i-2,j+4) * (-0.008333333333333333d0) & + + in(i-1,j+4) * (-0.05d0) & + + in(i+1,j+4) * (0.0017857142857142857d0) & + + in(i+2,j+4) * (0.0017857142857142857d0) & + + in(i+3,j+4) * (0.0017857142857142857d0) & + + in(i+4,j+4) * (0.0125d0) & + + in(i+5,j+4) * (0.0011111111111111111d0) & + + in(i-5,j+5) * (-0.0011111111111111111d0) & + + in(i-4,j+5) * (-0.0017857142857142857d0) & + + in(i-3,j+5) * (-0.0033333333333333335d0) & + + in(i-2,j+5) * (-0.008333333333333333d0) & + + in(i-1,j+5) * (-0.05d0) & + + in(i+1,j+5) * (0.0011111111111111111d0) & + + in(i+2,j+5) * (0.0011111111111111111d0) & + + in(i+3,j+5) * (0.0011111111111111111d0) & + + in(i+4,j+5) * (0.0011111111111111111d0) & + + in(i+5,j+5) * (0.01d0) & +0.0 end do !$omp end simd @@ -634,122 +662,124 @@ subroutine grid6(n, in, out) !$omp do do i=6,n-6-1 !$omp simd + do 
j=6,n-6-1 + do i=6,n-6-1 do j=6,n-6-1 out(i,j) = out(i,j) & - + in(i-6,j-6) * (-0.006944444444444444) & - + in(i+1,j-6) * (-0.0006313131313131314) & - + in(i+2,j-6) * (-0.0006313131313131314) & - + in(i+3,j-6) * (-0.0006313131313131314) & - + in(i+4,j-6) * (-0.0006313131313131314) & - + in(i+5,j-6) * (-0.0006313131313131314) & - + in(i+6,j-6) * (-0.0006313131313131314) & - + in(i-5,j-5) * (-0.008333333333333333) & - + in(i+1,j-5) * (-0.000925925925925926) & - + in(i+2,j-5) * (-0.000925925925925926) & - + in(i+3,j-5) * (-0.000925925925925926) & - + in(i+4,j-5) * (-0.000925925925925926) & - + in(i+5,j-5) * (-0.000925925925925926) & - + in(i+6,j-5) * (-0.000925925925925926) & - + in(i-4,j-4) * (-0.010416666666666666) & - + in(i+1,j-4) * (-0.001488095238095238) & - + in(i+2,j-4) * (-0.001488095238095238) & - + in(i+3,j-4) * (-0.001488095238095238) & - + in(i+4,j-4) * (-0.001488095238095238) & - + in(i+5,j-4) * (-0.001488095238095238) & - + in(i+6,j-4) * (-0.001488095238095238) & - + in(i-3,j-3) * (-0.013888888888888888) & - + in(i+1,j-3) * (-0.002777777777777778) & - + in(i+2,j-3) * (-0.002777777777777778) & - + in(i+3,j-3) * (-0.002777777777777778) & - + in(i+4,j-3) * (-0.002777777777777778) & - + in(i+5,j-3) * (-0.002777777777777778) & - + in(i+6,j-3) * (-0.002777777777777778) & - + in(i-2,j-2) * (-0.020833333333333332) & - + in(i+1,j-2) * (-0.006944444444444444) & - + in(i+2,j-2) * (-0.006944444444444444) & - + in(i+3,j-2) * (-0.006944444444444444) & - + in(i+4,j-2) * (-0.006944444444444444) & - + in(i+5,j-2) * (-0.006944444444444444) & - + in(i+6,j-2) * (-0.006944444444444444) & - + in(i-1,j-1) * (-0.041666666666666664) & - + in(i+1,j-1) * (-0.041666666666666664) & - + in(i+2,j-1) * (-0.041666666666666664) & - + in(i+3,j-1) * (-0.041666666666666664) & - + in(i+4,j-1) * (-0.041666666666666664) & - + in(i+5,j-1) * (-0.041666666666666664) & - + in(i+6,j-1) * (-0.041666666666666664) & - + in(i-6,j+1) * (-0.0006313131313131314) & - + in(i-5,j+1) * 
(-0.000925925925925926) & - + in(i-4,j+1) * (-0.001488095238095238) & - + in(i-3,j+1) * (-0.002777777777777778) & - + in(i-2,j+1) * (-0.006944444444444444) & - + in(i-1,j+1) * (-0.041666666666666664) & - + in(i+1,j+1) * (0.041666666666666664) & - + in(i+2,j+1) * (0.006944444444444444) & - + in(i+3,j+1) * (0.002777777777777778) & - + in(i+4,j+1) * (0.001488095238095238) & - + in(i+5,j+1) * (0.000925925925925926) & - + in(i+6,j+1) * (0.0006313131313131314) & - + in(i-6,j+2) * (-0.0006313131313131314) & - + in(i-5,j+2) * (-0.000925925925925926) & - + in(i-4,j+2) * (-0.001488095238095238) & - + in(i-3,j+2) * (-0.002777777777777778) & - + in(i-2,j+2) * (-0.006944444444444444) & - + in(i-1,j+2) * (-0.041666666666666664) & - + in(i+1,j+2) * (0.006944444444444444) & - + in(i+2,j+2) * (0.020833333333333332) & - + in(i+3,j+2) * (0.002777777777777778) & - + in(i+4,j+2) * (0.001488095238095238) & - + in(i+5,j+2) * (0.000925925925925926) & - + in(i+6,j+2) * (0.0006313131313131314) & - + in(i-6,j+3) * (-0.0006313131313131314) & - + in(i-5,j+3) * (-0.000925925925925926) & - + in(i-4,j+3) * (-0.001488095238095238) & - + in(i-3,j+3) * (-0.002777777777777778) & - + in(i-2,j+3) * (-0.006944444444444444) & - + in(i-1,j+3) * (-0.041666666666666664) & - + in(i+1,j+3) * (0.002777777777777778) & - + in(i+2,j+3) * (0.002777777777777778) & - + in(i+3,j+3) * (0.013888888888888888) & - + in(i+4,j+3) * (0.001488095238095238) & - + in(i+5,j+3) * (0.000925925925925926) & - + in(i+6,j+3) * (0.0006313131313131314) & - + in(i-6,j+4) * (-0.0006313131313131314) & - + in(i-5,j+4) * (-0.000925925925925926) & - + in(i-4,j+4) * (-0.001488095238095238) & - + in(i-3,j+4) * (-0.002777777777777778) & - + in(i-2,j+4) * (-0.006944444444444444) & - + in(i-1,j+4) * (-0.041666666666666664) & - + in(i+1,j+4) * (0.001488095238095238) & - + in(i+2,j+4) * (0.001488095238095238) & - + in(i+3,j+4) * (0.001488095238095238) & - + in(i+4,j+4) * (0.010416666666666666) & - + in(i+5,j+4) * (0.000925925925925926) & - + 
in(i+6,j+4) * (0.0006313131313131314) & - + in(i-6,j+5) * (-0.0006313131313131314) & - + in(i-5,j+5) * (-0.000925925925925926) & - + in(i-4,j+5) * (-0.001488095238095238) & - + in(i-3,j+5) * (-0.002777777777777778) & - + in(i-2,j+5) * (-0.006944444444444444) & - + in(i-1,j+5) * (-0.041666666666666664) & - + in(i+1,j+5) * (0.000925925925925926) & - + in(i+2,j+5) * (0.000925925925925926) & - + in(i+3,j+5) * (0.000925925925925926) & - + in(i+4,j+5) * (0.000925925925925926) & - + in(i+5,j+5) * (0.008333333333333333) & - + in(i+6,j+5) * (0.0006313131313131314) & - + in(i-6,j+6) * (-0.0006313131313131314) & - + in(i-5,j+6) * (-0.000925925925925926) & - + in(i-4,j+6) * (-0.001488095238095238) & - + in(i-3,j+6) * (-0.002777777777777778) & - + in(i-2,j+6) * (-0.006944444444444444) & - + in(i-1,j+6) * (-0.041666666666666664) & - + in(i+1,j+6) * (0.0006313131313131314) & - + in(i+2,j+6) * (0.0006313131313131314) & - + in(i+3,j+6) * (0.0006313131313131314) & - + in(i+4,j+6) * (0.0006313131313131314) & - + in(i+5,j+6) * (0.0006313131313131314) & - + in(i+6,j+6) * (0.006944444444444444) & + + in(i-6,j-6) * (-0.006944444444444444d0) & + + in(i+1,j-6) * (-0.0006313131313131314d0) & + + in(i+2,j-6) * (-0.0006313131313131314d0) & + + in(i+3,j-6) * (-0.0006313131313131314d0) & + + in(i+4,j-6) * (-0.0006313131313131314d0) & + + in(i+5,j-6) * (-0.0006313131313131314d0) & + + in(i+6,j-6) * (-0.0006313131313131314d0) & + + in(i-5,j-5) * (-0.008333333333333333d0) & + + in(i+1,j-5) * (-0.000925925925925926d0) & + + in(i+2,j-5) * (-0.000925925925925926d0) & + + in(i+3,j-5) * (-0.000925925925925926d0) & + + in(i+4,j-5) * (-0.000925925925925926d0) & + + in(i+5,j-5) * (-0.000925925925925926d0) & + + in(i+6,j-5) * (-0.000925925925925926d0) & + + in(i-4,j-4) * (-0.010416666666666666d0) & + + in(i+1,j-4) * (-0.001488095238095238d0) & + + in(i+2,j-4) * (-0.001488095238095238d0) & + + in(i+3,j-4) * (-0.001488095238095238d0) & + + in(i+4,j-4) * (-0.001488095238095238d0) & + + in(i+5,j-4) * 
(-0.001488095238095238d0) & + + in(i+6,j-4) * (-0.001488095238095238d0) & + + in(i-3,j-3) * (-0.013888888888888888d0) & + + in(i+1,j-3) * (-0.002777777777777778d0) & + + in(i+2,j-3) * (-0.002777777777777778d0) & + + in(i+3,j-3) * (-0.002777777777777778d0) & + + in(i+4,j-3) * (-0.002777777777777778d0) & + + in(i+5,j-3) * (-0.002777777777777778d0) & + + in(i+6,j-3) * (-0.002777777777777778d0) & + + in(i-2,j-2) * (-0.020833333333333332d0) & + + in(i+1,j-2) * (-0.006944444444444444d0) & + + in(i+2,j-2) * (-0.006944444444444444d0) & + + in(i+3,j-2) * (-0.006944444444444444d0) & + + in(i+4,j-2) * (-0.006944444444444444d0) & + + in(i+5,j-2) * (-0.006944444444444444d0) & + + in(i+6,j-2) * (-0.006944444444444444d0) & + + in(i-1,j-1) * (-0.041666666666666664d0) & + + in(i+1,j-1) * (-0.041666666666666664d0) & + + in(i+2,j-1) * (-0.041666666666666664d0) & + + in(i+3,j-1) * (-0.041666666666666664d0) & + + in(i+4,j-1) * (-0.041666666666666664d0) & + + in(i+5,j-1) * (-0.041666666666666664d0) & + + in(i+6,j-1) * (-0.041666666666666664d0) & + + in(i-6,j+1) * (-0.0006313131313131314d0) & + + in(i-5,j+1) * (-0.000925925925925926d0) & + + in(i-4,j+1) * (-0.001488095238095238d0) & + + in(i-3,j+1) * (-0.002777777777777778d0) & + + in(i-2,j+1) * (-0.006944444444444444d0) & + + in(i-1,j+1) * (-0.041666666666666664d0) & + + in(i+1,j+1) * (0.041666666666666664d0) & + + in(i+2,j+1) * (0.006944444444444444d0) & + + in(i+3,j+1) * (0.002777777777777778d0) & + + in(i+4,j+1) * (0.001488095238095238d0) & + + in(i+5,j+1) * (0.000925925925925926d0) & + + in(i+6,j+1) * (0.0006313131313131314d0) & + + in(i-6,j+2) * (-0.0006313131313131314d0) & + + in(i-5,j+2) * (-0.000925925925925926d0) & + + in(i-4,j+2) * (-0.001488095238095238d0) & + + in(i-3,j+2) * (-0.002777777777777778d0) & + + in(i-2,j+2) * (-0.006944444444444444d0) & + + in(i-1,j+2) * (-0.041666666666666664d0) & + + in(i+1,j+2) * (0.006944444444444444d0) & + + in(i+2,j+2) * (0.020833333333333332d0) & + + in(i+3,j+2) * (0.002777777777777778d0) & 
+ + in(i+4,j+2) * (0.001488095238095238d0) & + + in(i+5,j+2) * (0.000925925925925926d0) & + + in(i+6,j+2) * (0.0006313131313131314d0) & + + in(i-6,j+3) * (-0.0006313131313131314d0) & + + in(i-5,j+3) * (-0.000925925925925926d0) & + + in(i-4,j+3) * (-0.001488095238095238d0) & + + in(i-3,j+3) * (-0.002777777777777778d0) & + + in(i-2,j+3) * (-0.006944444444444444d0) & + + in(i-1,j+3) * (-0.041666666666666664d0) & + + in(i+1,j+3) * (0.002777777777777778d0) & + + in(i+2,j+3) * (0.002777777777777778d0) & + + in(i+3,j+3) * (0.013888888888888888d0) & + + in(i+4,j+3) * (0.001488095238095238d0) & + + in(i+5,j+3) * (0.000925925925925926d0) & + + in(i+6,j+3) * (0.0006313131313131314d0) & + + in(i-6,j+4) * (-0.0006313131313131314d0) & + + in(i-5,j+4) * (-0.000925925925925926d0) & + + in(i-4,j+4) * (-0.001488095238095238d0) & + + in(i-3,j+4) * (-0.002777777777777778d0) & + + in(i-2,j+4) * (-0.006944444444444444d0) & + + in(i-1,j+4) * (-0.041666666666666664d0) & + + in(i+1,j+4) * (0.001488095238095238d0) & + + in(i+2,j+4) * (0.001488095238095238d0) & + + in(i+3,j+4) * (0.001488095238095238d0) & + + in(i+4,j+4) * (0.010416666666666666d0) & + + in(i+5,j+4) * (0.000925925925925926d0) & + + in(i+6,j+4) * (0.0006313131313131314d0) & + + in(i-6,j+5) * (-0.0006313131313131314d0) & + + in(i-5,j+5) * (-0.000925925925925926d0) & + + in(i-4,j+5) * (-0.001488095238095238d0) & + + in(i-3,j+5) * (-0.002777777777777778d0) & + + in(i-2,j+5) * (-0.006944444444444444d0) & + + in(i-1,j+5) * (-0.041666666666666664d0) & + + in(i+1,j+5) * (0.000925925925925926d0) & + + in(i+2,j+5) * (0.000925925925925926d0) & + + in(i+3,j+5) * (0.000925925925925926d0) & + + in(i+4,j+5) * (0.000925925925925926d0) & + + in(i+5,j+5) * (0.008333333333333333d0) & + + in(i+6,j+5) * (0.0006313131313131314d0) & + + in(i-6,j+6) * (-0.0006313131313131314d0) & + + in(i-5,j+6) * (-0.000925925925925926d0) & + + in(i-4,j+6) * (-0.001488095238095238d0) & + + in(i-3,j+6) * (-0.002777777777777778d0) & + + in(i-2,j+6) * 
(-0.006944444444444444d0) & + + in(i-1,j+6) * (-0.041666666666666664d0) & + + in(i+1,j+6) * (0.0006313131313131314d0) & + + in(i+2,j+6) * (0.0006313131313131314d0) & + + in(i+3,j+6) * (0.0006313131313131314d0) & + + in(i+4,j+6) * (0.0006313131313131314d0) & + + in(i+5,j+6) * (0.0006313131313131314d0) & + + in(i+6,j+6) * (0.006944444444444444d0) & +0.0 end do !$omp end simd @@ -767,162 +797,164 @@ subroutine grid7(n, in, out) !$omp do do i=7,n-7-1 !$omp simd + do j=7,n-7-1 + do i=7,n-7-1 do j=7,n-7-1 out(i,j) = out(i,j) & - + in(i-7,j-7) * (-0.00510204081632653) & - + in(i+1,j-7) * (-0.0003924646781789639) & - + in(i+2,j-7) * (-0.0003924646781789639) & - + in(i+3,j-7) * (-0.0003924646781789639) & - + in(i+4,j-7) * (-0.0003924646781789639) & - + in(i+5,j-7) * (-0.0003924646781789639) & - + in(i+6,j-7) * (-0.0003924646781789639) & - + in(i+7,j-7) * (-0.0003924646781789639) & - + in(i-6,j-6) * (-0.005952380952380952) & - + in(i+1,j-6) * (-0.0005411255411255411) & - + in(i+2,j-6) * (-0.0005411255411255411) & - + in(i+3,j-6) * (-0.0005411255411255411) & - + in(i+4,j-6) * (-0.0005411255411255411) & - + in(i+5,j-6) * (-0.0005411255411255411) & - + in(i+6,j-6) * (-0.0005411255411255411) & - + in(i+7,j-6) * (-0.0005411255411255411) & - + in(i-5,j-5) * (-0.007142857142857143) & - + in(i+1,j-5) * (-0.0007936507936507937) & - + in(i+2,j-5) * (-0.0007936507936507937) & - + in(i+3,j-5) * (-0.0007936507936507937) & - + in(i+4,j-5) * (-0.0007936507936507937) & - + in(i+5,j-5) * (-0.0007936507936507937) & - + in(i+6,j-5) * (-0.0007936507936507937) & - + in(i+7,j-5) * (-0.0007936507936507937) & - + in(i-4,j-4) * (-0.008928571428571428) & - + in(i+1,j-4) * (-0.0012755102040816326) & - + in(i+2,j-4) * (-0.0012755102040816326) & - + in(i+3,j-4) * (-0.0012755102040816326) & - + in(i+4,j-4) * (-0.0012755102040816326) & - + in(i+5,j-4) * (-0.0012755102040816326) & - + in(i+6,j-4) * (-0.0012755102040816326) & - + in(i+7,j-4) * (-0.0012755102040816326) & - + in(i-3,j-3) * 
(-0.011904761904761904) & - + in(i+1,j-3) * (-0.002380952380952381) & - + in(i+2,j-3) * (-0.002380952380952381) & - + in(i+3,j-3) * (-0.002380952380952381) & - + in(i+4,j-3) * (-0.002380952380952381) & - + in(i+5,j-3) * (-0.002380952380952381) & - + in(i+6,j-3) * (-0.002380952380952381) & - + in(i+7,j-3) * (-0.002380952380952381) & - + in(i-2,j-2) * (-0.017857142857142856) & - + in(i+1,j-2) * (-0.005952380952380952) & - + in(i+2,j-2) * (-0.005952380952380952) & - + in(i+3,j-2) * (-0.005952380952380952) & - + in(i+4,j-2) * (-0.005952380952380952) & - + in(i+5,j-2) * (-0.005952380952380952) & - + in(i+6,j-2) * (-0.005952380952380952) & - + in(i+7,j-2) * (-0.005952380952380952) & - + in(i-1,j-1) * (-0.03571428571428571) & - + in(i+1,j-1) * (-0.03571428571428571) & - + in(i+2,j-1) * (-0.03571428571428571) & - + in(i+3,j-1) * (-0.03571428571428571) & - + in(i+4,j-1) * (-0.03571428571428571) & - + in(i+5,j-1) * (-0.03571428571428571) & - + in(i+6,j-1) * (-0.03571428571428571) & - + in(i+7,j-1) * (-0.03571428571428571) & - + in(i-7,j+1) * (-0.0003924646781789639) & - + in(i-6,j+1) * (-0.0005411255411255411) & - + in(i-5,j+1) * (-0.0007936507936507937) & - + in(i-4,j+1) * (-0.0012755102040816326) & - + in(i-3,j+1) * (-0.002380952380952381) & - + in(i-2,j+1) * (-0.005952380952380952) & - + in(i-1,j+1) * (-0.03571428571428571) & - + in(i+1,j+1) * (0.03571428571428571) & - + in(i+2,j+1) * (0.005952380952380952) & - + in(i+3,j+1) * (0.002380952380952381) & - + in(i+4,j+1) * (0.0012755102040816326) & - + in(i+5,j+1) * (0.0007936507936507937) & - + in(i+6,j+1) * (0.0005411255411255411) & - + in(i+7,j+1) * (0.0003924646781789639) & - + in(i-7,j+2) * (-0.0003924646781789639) & - + in(i-6,j+2) * (-0.0005411255411255411) & - + in(i-5,j+2) * (-0.0007936507936507937) & - + in(i-4,j+2) * (-0.0012755102040816326) & - + in(i-3,j+2) * (-0.002380952380952381) & - + in(i-2,j+2) * (-0.005952380952380952) & - + in(i-1,j+2) * (-0.03571428571428571) & - + in(i+1,j+2) * (0.005952380952380952) & 
- + in(i+2,j+2) * (0.017857142857142856) & - + in(i+3,j+2) * (0.002380952380952381) & - + in(i+4,j+2) * (0.0012755102040816326) & - + in(i+5,j+2) * (0.0007936507936507937) & - + in(i+6,j+2) * (0.0005411255411255411) & - + in(i+7,j+2) * (0.0003924646781789639) & - + in(i-7,j+3) * (-0.0003924646781789639) & - + in(i-6,j+3) * (-0.0005411255411255411) & - + in(i-5,j+3) * (-0.0007936507936507937) & - + in(i-4,j+3) * (-0.0012755102040816326) & - + in(i-3,j+3) * (-0.002380952380952381) & - + in(i-2,j+3) * (-0.005952380952380952) & - + in(i-1,j+3) * (-0.03571428571428571) & - + in(i+1,j+3) * (0.002380952380952381) & - + in(i+2,j+3) * (0.002380952380952381) & - + in(i+3,j+3) * (0.011904761904761904) & - + in(i+4,j+3) * (0.0012755102040816326) & - + in(i+5,j+3) * (0.0007936507936507937) & - + in(i+6,j+3) * (0.0005411255411255411) & - + in(i+7,j+3) * (0.0003924646781789639) & - + in(i-7,j+4) * (-0.0003924646781789639) & - + in(i-6,j+4) * (-0.0005411255411255411) & - + in(i-5,j+4) * (-0.0007936507936507937) & - + in(i-4,j+4) * (-0.0012755102040816326) & - + in(i-3,j+4) * (-0.002380952380952381) & - + in(i-2,j+4) * (-0.005952380952380952) & - + in(i-1,j+4) * (-0.03571428571428571) & - + in(i+1,j+4) * (0.0012755102040816326) & - + in(i+2,j+4) * (0.0012755102040816326) & - + in(i+3,j+4) * (0.0012755102040816326) & - + in(i+4,j+4) * (0.008928571428571428) & - + in(i+5,j+4) * (0.0007936507936507937) & - + in(i+6,j+4) * (0.0005411255411255411) & - + in(i+7,j+4) * (0.0003924646781789639) & - + in(i-7,j+5) * (-0.0003924646781789639) & - + in(i-6,j+5) * (-0.0005411255411255411) & - + in(i-5,j+5) * (-0.0007936507936507937) & - + in(i-4,j+5) * (-0.0012755102040816326) & - + in(i-3,j+5) * (-0.002380952380952381) & - + in(i-2,j+5) * (-0.005952380952380952) & - + in(i-1,j+5) * (-0.03571428571428571) & - + in(i+1,j+5) * (0.0007936507936507937) & - + in(i+2,j+5) * (0.0007936507936507937) & - + in(i+3,j+5) * (0.0007936507936507937) & - + in(i+4,j+5) * (0.0007936507936507937) & - + in(i+5,j+5) 
* (0.007142857142857143) & - + in(i+6,j+5) * (0.0005411255411255411) & - + in(i+7,j+5) * (0.0003924646781789639) & - + in(i-7,j+6) * (-0.0003924646781789639) & - + in(i-6,j+6) * (-0.0005411255411255411) & - + in(i-5,j+6) * (-0.0007936507936507937) & - + in(i-4,j+6) * (-0.0012755102040816326) & - + in(i-3,j+6) * (-0.002380952380952381) & - + in(i-2,j+6) * (-0.005952380952380952) & - + in(i-1,j+6) * (-0.03571428571428571) & - + in(i+1,j+6) * (0.0005411255411255411) & - + in(i+2,j+6) * (0.0005411255411255411) & - + in(i+3,j+6) * (0.0005411255411255411) & - + in(i+4,j+6) * (0.0005411255411255411) & - + in(i+5,j+6) * (0.0005411255411255411) & - + in(i+6,j+6) * (0.005952380952380952) & - + in(i+7,j+6) * (0.0003924646781789639) & - + in(i-7,j+7) * (-0.0003924646781789639) & - + in(i-6,j+7) * (-0.0005411255411255411) & - + in(i-5,j+7) * (-0.0007936507936507937) & - + in(i-4,j+7) * (-0.0012755102040816326) & - + in(i-3,j+7) * (-0.002380952380952381) & - + in(i-2,j+7) * (-0.005952380952380952) & - + in(i-1,j+7) * (-0.03571428571428571) & - + in(i+1,j+7) * (0.0003924646781789639) & - + in(i+2,j+7) * (0.0003924646781789639) & - + in(i+3,j+7) * (0.0003924646781789639) & - + in(i+4,j+7) * (0.0003924646781789639) & - + in(i+5,j+7) * (0.0003924646781789639) & - + in(i+6,j+7) * (0.0003924646781789639) & - + in(i+7,j+7) * (0.00510204081632653) & + + in(i-7,j-7) * (-0.00510204081632653d0) & + + in(i+1,j-7) * (-0.0003924646781789639d0) & + + in(i+2,j-7) * (-0.0003924646781789639d0) & + + in(i+3,j-7) * (-0.0003924646781789639d0) & + + in(i+4,j-7) * (-0.0003924646781789639d0) & + + in(i+5,j-7) * (-0.0003924646781789639d0) & + + in(i+6,j-7) * (-0.0003924646781789639d0) & + + in(i+7,j-7) * (-0.0003924646781789639d0) & + + in(i-6,j-6) * (-0.005952380952380952d0) & + + in(i+1,j-6) * (-0.0005411255411255411d0) & + + in(i+2,j-6) * (-0.0005411255411255411d0) & + + in(i+3,j-6) * (-0.0005411255411255411d0) & + + in(i+4,j-6) * (-0.0005411255411255411d0) & + + in(i+5,j-6) * 
(-0.0005411255411255411d0) & + + in(i+6,j-6) * (-0.0005411255411255411d0) & + + in(i+7,j-6) * (-0.0005411255411255411d0) & + + in(i-5,j-5) * (-0.007142857142857143d0) & + + in(i+1,j-5) * (-0.0007936507936507937d0) & + + in(i+2,j-5) * (-0.0007936507936507937d0) & + + in(i+3,j-5) * (-0.0007936507936507937d0) & + + in(i+4,j-5) * (-0.0007936507936507937d0) & + + in(i+5,j-5) * (-0.0007936507936507937d0) & + + in(i+6,j-5) * (-0.0007936507936507937d0) & + + in(i+7,j-5) * (-0.0007936507936507937d0) & + + in(i-4,j-4) * (-0.008928571428571428d0) & + + in(i+1,j-4) * (-0.0012755102040816326d0) & + + in(i+2,j-4) * (-0.0012755102040816326d0) & + + in(i+3,j-4) * (-0.0012755102040816326d0) & + + in(i+4,j-4) * (-0.0012755102040816326d0) & + + in(i+5,j-4) * (-0.0012755102040816326d0) & + + in(i+6,j-4) * (-0.0012755102040816326d0) & + + in(i+7,j-4) * (-0.0012755102040816326d0) & + + in(i-3,j-3) * (-0.011904761904761904d0) & + + in(i+1,j-3) * (-0.002380952380952381d0) & + + in(i+2,j-3) * (-0.002380952380952381d0) & + + in(i+3,j-3) * (-0.002380952380952381d0) & + + in(i+4,j-3) * (-0.002380952380952381d0) & + + in(i+5,j-3) * (-0.002380952380952381d0) & + + in(i+6,j-3) * (-0.002380952380952381d0) & + + in(i+7,j-3) * (-0.002380952380952381d0) & + + in(i-2,j-2) * (-0.017857142857142856d0) & + + in(i+1,j-2) * (-0.005952380952380952d0) & + + in(i+2,j-2) * (-0.005952380952380952d0) & + + in(i+3,j-2) * (-0.005952380952380952d0) & + + in(i+4,j-2) * (-0.005952380952380952d0) & + + in(i+5,j-2) * (-0.005952380952380952d0) & + + in(i+6,j-2) * (-0.005952380952380952d0) & + + in(i+7,j-2) * (-0.005952380952380952d0) & + + in(i-1,j-1) * (-0.03571428571428571d0) & + + in(i+1,j-1) * (-0.03571428571428571d0) & + + in(i+2,j-1) * (-0.03571428571428571d0) & + + in(i+3,j-1) * (-0.03571428571428571d0) & + + in(i+4,j-1) * (-0.03571428571428571d0) & + + in(i+5,j-1) * (-0.03571428571428571d0) & + + in(i+6,j-1) * (-0.03571428571428571d0) & + + in(i+7,j-1) * (-0.03571428571428571d0) & + + in(i-7,j+1) * 
(-0.0003924646781789639d0) & + + in(i-6,j+1) * (-0.0005411255411255411d0) & + + in(i-5,j+1) * (-0.0007936507936507937d0) & + + in(i-4,j+1) * (-0.0012755102040816326d0) & + + in(i-3,j+1) * (-0.002380952380952381d0) & + + in(i-2,j+1) * (-0.005952380952380952d0) & + + in(i-1,j+1) * (-0.03571428571428571d0) & + + in(i+1,j+1) * (0.03571428571428571d0) & + + in(i+2,j+1) * (0.005952380952380952d0) & + + in(i+3,j+1) * (0.002380952380952381d0) & + + in(i+4,j+1) * (0.0012755102040816326d0) & + + in(i+5,j+1) * (0.0007936507936507937d0) & + + in(i+6,j+1) * (0.0005411255411255411d0) & + + in(i+7,j+1) * (0.0003924646781789639d0) & + + in(i-7,j+2) * (-0.0003924646781789639d0) & + + in(i-6,j+2) * (-0.0005411255411255411d0) & + + in(i-5,j+2) * (-0.0007936507936507937d0) & + + in(i-4,j+2) * (-0.0012755102040816326d0) & + + in(i-3,j+2) * (-0.002380952380952381d0) & + + in(i-2,j+2) * (-0.005952380952380952d0) & + + in(i-1,j+2) * (-0.03571428571428571d0) & + + in(i+1,j+2) * (0.005952380952380952d0) & + + in(i+2,j+2) * (0.017857142857142856d0) & + + in(i+3,j+2) * (0.002380952380952381d0) & + + in(i+4,j+2) * (0.0012755102040816326d0) & + + in(i+5,j+2) * (0.0007936507936507937d0) & + + in(i+6,j+2) * (0.0005411255411255411d0) & + + in(i+7,j+2) * (0.0003924646781789639d0) & + + in(i-7,j+3) * (-0.0003924646781789639d0) & + + in(i-6,j+3) * (-0.0005411255411255411d0) & + + in(i-5,j+3) * (-0.0007936507936507937d0) & + + in(i-4,j+3) * (-0.0012755102040816326d0) & + + in(i-3,j+3) * (-0.002380952380952381d0) & + + in(i-2,j+3) * (-0.005952380952380952d0) & + + in(i-1,j+3) * (-0.03571428571428571d0) & + + in(i+1,j+3) * (0.002380952380952381d0) & + + in(i+2,j+3) * (0.002380952380952381d0) & + + in(i+3,j+3) * (0.011904761904761904d0) & + + in(i+4,j+3) * (0.0012755102040816326d0) & + + in(i+5,j+3) * (0.0007936507936507937d0) & + + in(i+6,j+3) * (0.0005411255411255411d0) & + + in(i+7,j+3) * (0.0003924646781789639d0) & + + in(i-7,j+4) * (-0.0003924646781789639d0) & + + in(i-6,j+4) * 
(-0.0005411255411255411d0) & + + in(i-5,j+4) * (-0.0007936507936507937d0) & + + in(i-4,j+4) * (-0.0012755102040816326d0) & + + in(i-3,j+4) * (-0.002380952380952381d0) & + + in(i-2,j+4) * (-0.005952380952380952d0) & + + in(i-1,j+4) * (-0.03571428571428571d0) & + + in(i+1,j+4) * (0.0012755102040816326d0) & + + in(i+2,j+4) * (0.0012755102040816326d0) & + + in(i+3,j+4) * (0.0012755102040816326d0) & + + in(i+4,j+4) * (0.008928571428571428d0) & + + in(i+5,j+4) * (0.0007936507936507937d0) & + + in(i+6,j+4) * (0.0005411255411255411d0) & + + in(i+7,j+4) * (0.0003924646781789639d0) & + + in(i-7,j+5) * (-0.0003924646781789639d0) & + + in(i-6,j+5) * (-0.0005411255411255411d0) & + + in(i-5,j+5) * (-0.0007936507936507937d0) & + + in(i-4,j+5) * (-0.0012755102040816326d0) & + + in(i-3,j+5) * (-0.002380952380952381d0) & + + in(i-2,j+5) * (-0.005952380952380952d0) & + + in(i-1,j+5) * (-0.03571428571428571d0) & + + in(i+1,j+5) * (0.0007936507936507937d0) & + + in(i+2,j+5) * (0.0007936507936507937d0) & + + in(i+3,j+5) * (0.0007936507936507937d0) & + + in(i+4,j+5) * (0.0007936507936507937d0) & + + in(i+5,j+5) * (0.007142857142857143d0) & + + in(i+6,j+5) * (0.0005411255411255411d0) & + + in(i+7,j+5) * (0.0003924646781789639d0) & + + in(i-7,j+6) * (-0.0003924646781789639d0) & + + in(i-6,j+6) * (-0.0005411255411255411d0) & + + in(i-5,j+6) * (-0.0007936507936507937d0) & + + in(i-4,j+6) * (-0.0012755102040816326d0) & + + in(i-3,j+6) * (-0.002380952380952381d0) & + + in(i-2,j+6) * (-0.005952380952380952d0) & + + in(i-1,j+6) * (-0.03571428571428571d0) & + + in(i+1,j+6) * (0.0005411255411255411d0) & + + in(i+2,j+6) * (0.0005411255411255411d0) & + + in(i+3,j+6) * (0.0005411255411255411d0) & + + in(i+4,j+6) * (0.0005411255411255411d0) & + + in(i+5,j+6) * (0.0005411255411255411d0) & + + in(i+6,j+6) * (0.005952380952380952d0) & + + in(i+7,j+6) * (0.0003924646781789639d0) & + + in(i-7,j+7) * (-0.0003924646781789639d0) & + + in(i-6,j+7) * (-0.0005411255411255411d0) & + + in(i-5,j+7) * 
(-0.0007936507936507937d0) & + + in(i-4,j+7) * (-0.0012755102040816326d0) & + + in(i-3,j+7) * (-0.002380952380952381d0) & + + in(i-2,j+7) * (-0.005952380952380952d0) & + + in(i-1,j+7) * (-0.03571428571428571d0) & + + in(i+1,j+7) * (0.0003924646781789639d0) & + + in(i+2,j+7) * (0.0003924646781789639d0) & + + in(i+3,j+7) * (0.0003924646781789639d0) & + + in(i+4,j+7) * (0.0003924646781789639d0) & + + in(i+5,j+7) * (0.0003924646781789639d0) & + + in(i+6,j+7) * (0.0003924646781789639d0) & + + in(i+7,j+7) * (0.00510204081632653d0) & +0.0 end do !$omp end simd @@ -940,208 +972,210 @@ subroutine grid8(n, in, out) !$omp do do i=8,n-8-1 !$omp simd + do j=8,n-8-1 + do i=8,n-8-1 do j=8,n-8-1 out(i,j) = out(i,j) & - + in(i-8,j-8) * (-0.00390625) & - + in(i+1,j-8) * (-0.00026041666666666666) & - + in(i+2,j-8) * (-0.00026041666666666666) & - + in(i+3,j-8) * (-0.00026041666666666666) & - + in(i+4,j-8) * (-0.00026041666666666666) & - + in(i+5,j-8) * (-0.00026041666666666666) & - + in(i+6,j-8) * (-0.00026041666666666666) & - + in(i+7,j-8) * (-0.00026041666666666666) & - + in(i+8,j-8) * (-0.00026041666666666666) & - + in(i-7,j-7) * (-0.004464285714285714) & - + in(i+1,j-7) * (-0.00034340659340659343) & - + in(i+2,j-7) * (-0.00034340659340659343) & - + in(i+3,j-7) * (-0.00034340659340659343) & - + in(i+4,j-7) * (-0.00034340659340659343) & - + in(i+5,j-7) * (-0.00034340659340659343) & - + in(i+6,j-7) * (-0.00034340659340659343) & - + in(i+7,j-7) * (-0.00034340659340659343) & - + in(i+8,j-7) * (-0.00034340659340659343) & - + in(i-6,j-6) * (-0.005208333333333333) & - + in(i+1,j-6) * (-0.0004734848484848485) & - + in(i+2,j-6) * (-0.0004734848484848485) & - + in(i+3,j-6) * (-0.0004734848484848485) & - + in(i+4,j-6) * (-0.0004734848484848485) & - + in(i+5,j-6) * (-0.0004734848484848485) & - + in(i+6,j-6) * (-0.0004734848484848485) & - + in(i+7,j-6) * (-0.0004734848484848485) & - + in(i+8,j-6) * (-0.0004734848484848485) & - + in(i-5,j-5) * (-0.00625) & - + in(i+1,j-5) * 
(-0.0006944444444444445) & - + in(i+2,j-5) * (-0.0006944444444444445) & - + in(i+3,j-5) * (-0.0006944444444444445) & - + in(i+4,j-5) * (-0.0006944444444444445) & - + in(i+5,j-5) * (-0.0006944444444444445) & - + in(i+6,j-5) * (-0.0006944444444444445) & - + in(i+7,j-5) * (-0.0006944444444444445) & - + in(i+8,j-5) * (-0.0006944444444444445) & - + in(i-4,j-4) * (-0.0078125) & - + in(i+1,j-4) * (-0.0011160714285714285) & - + in(i+2,j-4) * (-0.0011160714285714285) & - + in(i+3,j-4) * (-0.0011160714285714285) & - + in(i+4,j-4) * (-0.0011160714285714285) & - + in(i+5,j-4) * (-0.0011160714285714285) & - + in(i+6,j-4) * (-0.0011160714285714285) & - + in(i+7,j-4) * (-0.0011160714285714285) & - + in(i+8,j-4) * (-0.0011160714285714285) & - + in(i-3,j-3) * (-0.010416666666666666) & - + in(i+1,j-3) * (-0.0020833333333333333) & - + in(i+2,j-3) * (-0.0020833333333333333) & - + in(i+3,j-3) * (-0.0020833333333333333) & - + in(i+4,j-3) * (-0.0020833333333333333) & - + in(i+5,j-3) * (-0.0020833333333333333) & - + in(i+6,j-3) * (-0.0020833333333333333) & - + in(i+7,j-3) * (-0.0020833333333333333) & - + in(i+8,j-3) * (-0.0020833333333333333) & - + in(i-2,j-2) * (-0.015625) & - + in(i+1,j-2) * (-0.005208333333333333) & - + in(i+2,j-2) * (-0.005208333333333333) & - + in(i+3,j-2) * (-0.005208333333333333) & - + in(i+4,j-2) * (-0.005208333333333333) & - + in(i+5,j-2) * (-0.005208333333333333) & - + in(i+6,j-2) * (-0.005208333333333333) & - + in(i+7,j-2) * (-0.005208333333333333) & - + in(i+8,j-2) * (-0.005208333333333333) & - + in(i-1,j-1) * (-0.03125) & - + in(i+1,j-1) * (-0.03125) & - + in(i+2,j-1) * (-0.03125) & - + in(i+3,j-1) * (-0.03125) & - + in(i+4,j-1) * (-0.03125) & - + in(i+5,j-1) * (-0.03125) & - + in(i+6,j-1) * (-0.03125) & - + in(i+7,j-1) * (-0.03125) & - + in(i+8,j-1) * (-0.03125) & - + in(i-8,j+1) * (-0.00026041666666666666) & - + in(i-7,j+1) * (-0.00034340659340659343) & - + in(i-6,j+1) * (-0.0004734848484848485) & - + in(i-5,j+1) * (-0.0006944444444444445) & - + in(i-4,j+1) 
* (-0.0011160714285714285) & - + in(i-3,j+1) * (-0.0020833333333333333) & - + in(i-2,j+1) * (-0.005208333333333333) & - + in(i-1,j+1) * (-0.03125) & - + in(i+1,j+1) * (0.03125) & - + in(i+2,j+1) * (0.005208333333333333) & - + in(i+3,j+1) * (0.0020833333333333333) & - + in(i+4,j+1) * (0.0011160714285714285) & - + in(i+5,j+1) * (0.0006944444444444445) & - + in(i+6,j+1) * (0.0004734848484848485) & - + in(i+7,j+1) * (0.00034340659340659343) & - + in(i+8,j+1) * (0.00026041666666666666) & - + in(i-8,j+2) * (-0.00026041666666666666) & - + in(i-7,j+2) * (-0.00034340659340659343) & - + in(i-6,j+2) * (-0.0004734848484848485) & - + in(i-5,j+2) * (-0.0006944444444444445) & - + in(i-4,j+2) * (-0.0011160714285714285) & - + in(i-3,j+2) * (-0.0020833333333333333) & - + in(i-2,j+2) * (-0.005208333333333333) & - + in(i-1,j+2) * (-0.03125) & - + in(i+1,j+2) * (0.005208333333333333) & - + in(i+2,j+2) * (0.015625) & - + in(i+3,j+2) * (0.0020833333333333333) & - + in(i+4,j+2) * (0.0011160714285714285) & - + in(i+5,j+2) * (0.0006944444444444445) & - + in(i+6,j+2) * (0.0004734848484848485) & - + in(i+7,j+2) * (0.00034340659340659343) & - + in(i+8,j+2) * (0.00026041666666666666) & - + in(i-8,j+3) * (-0.00026041666666666666) & - + in(i-7,j+3) * (-0.00034340659340659343) & - + in(i-6,j+3) * (-0.0004734848484848485) & - + in(i-5,j+3) * (-0.0006944444444444445) & - + in(i-4,j+3) * (-0.0011160714285714285) & - + in(i-3,j+3) * (-0.0020833333333333333) & - + in(i-2,j+3) * (-0.005208333333333333) & - + in(i-1,j+3) * (-0.03125) & - + in(i+1,j+3) * (0.0020833333333333333) & - + in(i+2,j+3) * (0.0020833333333333333) & - + in(i+3,j+3) * (0.010416666666666666) & - + in(i+4,j+3) * (0.0011160714285714285) & - + in(i+5,j+3) * (0.0006944444444444445) & - + in(i+6,j+3) * (0.0004734848484848485) & - + in(i+7,j+3) * (0.00034340659340659343) & - + in(i+8,j+3) * (0.00026041666666666666) & - + in(i-8,j+4) * (-0.00026041666666666666) & - + in(i-7,j+4) * (-0.00034340659340659343) & - + in(i-6,j+4) * 
(-0.0004734848484848485) & - + in(i-5,j+4) * (-0.0006944444444444445) & - + in(i-4,j+4) * (-0.0011160714285714285) & - + in(i-3,j+4) * (-0.0020833333333333333) & - + in(i-2,j+4) * (-0.005208333333333333) & - + in(i-1,j+4) * (-0.03125) & - + in(i+1,j+4) * (0.0011160714285714285) & - + in(i+2,j+4) * (0.0011160714285714285) & - + in(i+3,j+4) * (0.0011160714285714285) & - + in(i+4,j+4) * (0.0078125) & - + in(i+5,j+4) * (0.0006944444444444445) & - + in(i+6,j+4) * (0.0004734848484848485) & - + in(i+7,j+4) * (0.00034340659340659343) & - + in(i+8,j+4) * (0.00026041666666666666) & - + in(i-8,j+5) * (-0.00026041666666666666) & - + in(i-7,j+5) * (-0.00034340659340659343) & - + in(i-6,j+5) * (-0.0004734848484848485) & - + in(i-5,j+5) * (-0.0006944444444444445) & - + in(i-4,j+5) * (-0.0011160714285714285) & - + in(i-3,j+5) * (-0.0020833333333333333) & - + in(i-2,j+5) * (-0.005208333333333333) & - + in(i-1,j+5) * (-0.03125) & - + in(i+1,j+5) * (0.0006944444444444445) & - + in(i+2,j+5) * (0.0006944444444444445) & - + in(i+3,j+5) * (0.0006944444444444445) & - + in(i+4,j+5) * (0.0006944444444444445) & - + in(i+5,j+5) * (0.00625) & - + in(i+6,j+5) * (0.0004734848484848485) & - + in(i+7,j+5) * (0.00034340659340659343) & - + in(i+8,j+5) * (0.00026041666666666666) & - + in(i-8,j+6) * (-0.00026041666666666666) & - + in(i-7,j+6) * (-0.00034340659340659343) & - + in(i-6,j+6) * (-0.0004734848484848485) & - + in(i-5,j+6) * (-0.0006944444444444445) & - + in(i-4,j+6) * (-0.0011160714285714285) & - + in(i-3,j+6) * (-0.0020833333333333333) & - + in(i-2,j+6) * (-0.005208333333333333) & - + in(i-1,j+6) * (-0.03125) & - + in(i+1,j+6) * (0.0004734848484848485) & - + in(i+2,j+6) * (0.0004734848484848485) & - + in(i+3,j+6) * (0.0004734848484848485) & - + in(i+4,j+6) * (0.0004734848484848485) & - + in(i+5,j+6) * (0.0004734848484848485) & - + in(i+6,j+6) * (0.005208333333333333) & - + in(i+7,j+6) * (0.00034340659340659343) & - + in(i+8,j+6) * (0.00026041666666666666) & - + in(i-8,j+7) * 
(-0.00026041666666666666) & - + in(i-7,j+7) * (-0.00034340659340659343) & - + in(i-6,j+7) * (-0.0004734848484848485) & - + in(i-5,j+7) * (-0.0006944444444444445) & - + in(i-4,j+7) * (-0.0011160714285714285) & - + in(i-3,j+7) * (-0.0020833333333333333) & - + in(i-2,j+7) * (-0.005208333333333333) & - + in(i-1,j+7) * (-0.03125) & - + in(i+1,j+7) * (0.00034340659340659343) & - + in(i+2,j+7) * (0.00034340659340659343) & - + in(i+3,j+7) * (0.00034340659340659343) & - + in(i+4,j+7) * (0.00034340659340659343) & - + in(i+5,j+7) * (0.00034340659340659343) & - + in(i+6,j+7) * (0.00034340659340659343) & - + in(i+7,j+7) * (0.004464285714285714) & - + in(i+8,j+7) * (0.00026041666666666666) & - + in(i-8,j+8) * (-0.00026041666666666666) & - + in(i-7,j+8) * (-0.00034340659340659343) & - + in(i-6,j+8) * (-0.0004734848484848485) & - + in(i-5,j+8) * (-0.0006944444444444445) & - + in(i-4,j+8) * (-0.0011160714285714285) & - + in(i-3,j+8) * (-0.0020833333333333333) & - + in(i-2,j+8) * (-0.005208333333333333) & - + in(i-1,j+8) * (-0.03125) & - + in(i+1,j+8) * (0.00026041666666666666) & - + in(i+2,j+8) * (0.00026041666666666666) & - + in(i+3,j+8) * (0.00026041666666666666) & - + in(i+4,j+8) * (0.00026041666666666666) & - + in(i+5,j+8) * (0.00026041666666666666) & - + in(i+6,j+8) * (0.00026041666666666666) & - + in(i+7,j+8) * (0.00026041666666666666) & - + in(i+8,j+8) * (0.00390625) & + + in(i-8,j-8) * (-0.00390625d0) & + + in(i+1,j-8) * (-0.00026041666666666666d0) & + + in(i+2,j-8) * (-0.00026041666666666666d0) & + + in(i+3,j-8) * (-0.00026041666666666666d0) & + + in(i+4,j-8) * (-0.00026041666666666666d0) & + + in(i+5,j-8) * (-0.00026041666666666666d0) & + + in(i+6,j-8) * (-0.00026041666666666666d0) & + + in(i+7,j-8) * (-0.00026041666666666666d0) & + + in(i+8,j-8) * (-0.00026041666666666666d0) & + + in(i-7,j-7) * (-0.004464285714285714d0) & + + in(i+1,j-7) * (-0.00034340659340659343d0) & + + in(i+2,j-7) * (-0.00034340659340659343d0) & + + in(i+3,j-7) * (-0.00034340659340659343d0) & + + 
in(i+4,j-7) * (-0.00034340659340659343d0) & + + in(i+5,j-7) * (-0.00034340659340659343d0) & + + in(i+6,j-7) * (-0.00034340659340659343d0) & + + in(i+7,j-7) * (-0.00034340659340659343d0) & + + in(i+8,j-7) * (-0.00034340659340659343d0) & + + in(i-6,j-6) * (-0.005208333333333333d0) & + + in(i+1,j-6) * (-0.0004734848484848485d0) & + + in(i+2,j-6) * (-0.0004734848484848485d0) & + + in(i+3,j-6) * (-0.0004734848484848485d0) & + + in(i+4,j-6) * (-0.0004734848484848485d0) & + + in(i+5,j-6) * (-0.0004734848484848485d0) & + + in(i+6,j-6) * (-0.0004734848484848485d0) & + + in(i+7,j-6) * (-0.0004734848484848485d0) & + + in(i+8,j-6) * (-0.0004734848484848485d0) & + + in(i-5,j-5) * (-0.00625d0) & + + in(i+1,j-5) * (-0.0006944444444444445d0) & + + in(i+2,j-5) * (-0.0006944444444444445d0) & + + in(i+3,j-5) * (-0.0006944444444444445d0) & + + in(i+4,j-5) * (-0.0006944444444444445d0) & + + in(i+5,j-5) * (-0.0006944444444444445d0) & + + in(i+6,j-5) * (-0.0006944444444444445d0) & + + in(i+7,j-5) * (-0.0006944444444444445d0) & + + in(i+8,j-5) * (-0.0006944444444444445d0) & + + in(i-4,j-4) * (-0.0078125d0) & + + in(i+1,j-4) * (-0.0011160714285714285d0) & + + in(i+2,j-4) * (-0.0011160714285714285d0) & + + in(i+3,j-4) * (-0.0011160714285714285d0) & + + in(i+4,j-4) * (-0.0011160714285714285d0) & + + in(i+5,j-4) * (-0.0011160714285714285d0) & + + in(i+6,j-4) * (-0.0011160714285714285d0) & + + in(i+7,j-4) * (-0.0011160714285714285d0) & + + in(i+8,j-4) * (-0.0011160714285714285d0) & + + in(i-3,j-3) * (-0.010416666666666666d0) & + + in(i+1,j-3) * (-0.0020833333333333333d0) & + + in(i+2,j-3) * (-0.0020833333333333333d0) & + + in(i+3,j-3) * (-0.0020833333333333333d0) & + + in(i+4,j-3) * (-0.0020833333333333333d0) & + + in(i+5,j-3) * (-0.0020833333333333333d0) & + + in(i+6,j-3) * (-0.0020833333333333333d0) & + + in(i+7,j-3) * (-0.0020833333333333333d0) & + + in(i+8,j-3) * (-0.0020833333333333333d0) & + + in(i-2,j-2) * (-0.015625d0) & + + in(i+1,j-2) * (-0.005208333333333333d0) & + + in(i+2,j-2) * 
(-0.005208333333333333d0) & + + in(i+3,j-2) * (-0.005208333333333333d0) & + + in(i+4,j-2) * (-0.005208333333333333d0) & + + in(i+5,j-2) * (-0.005208333333333333d0) & + + in(i+6,j-2) * (-0.005208333333333333d0) & + + in(i+7,j-2) * (-0.005208333333333333d0) & + + in(i+8,j-2) * (-0.005208333333333333d0) & + + in(i-1,j-1) * (-0.03125d0) & + + in(i+1,j-1) * (-0.03125d0) & + + in(i+2,j-1) * (-0.03125d0) & + + in(i+3,j-1) * (-0.03125d0) & + + in(i+4,j-1) * (-0.03125d0) & + + in(i+5,j-1) * (-0.03125d0) & + + in(i+6,j-1) * (-0.03125d0) & + + in(i+7,j-1) * (-0.03125d0) & + + in(i+8,j-1) * (-0.03125d0) & + + in(i-8,j+1) * (-0.00026041666666666666d0) & + + in(i-7,j+1) * (-0.00034340659340659343d0) & + + in(i-6,j+1) * (-0.0004734848484848485d0) & + + in(i-5,j+1) * (-0.0006944444444444445d0) & + + in(i-4,j+1) * (-0.0011160714285714285d0) & + + in(i-3,j+1) * (-0.0020833333333333333d0) & + + in(i-2,j+1) * (-0.005208333333333333d0) & + + in(i-1,j+1) * (-0.03125d0) & + + in(i+1,j+1) * (0.03125d0) & + + in(i+2,j+1) * (0.005208333333333333d0) & + + in(i+3,j+1) * (0.0020833333333333333d0) & + + in(i+4,j+1) * (0.0011160714285714285d0) & + + in(i+5,j+1) * (0.0006944444444444445d0) & + + in(i+6,j+1) * (0.0004734848484848485d0) & + + in(i+7,j+1) * (0.00034340659340659343d0) & + + in(i+8,j+1) * (0.00026041666666666666d0) & + + in(i-8,j+2) * (-0.00026041666666666666d0) & + + in(i-7,j+2) * (-0.00034340659340659343d0) & + + in(i-6,j+2) * (-0.0004734848484848485d0) & + + in(i-5,j+2) * (-0.0006944444444444445d0) & + + in(i-4,j+2) * (-0.0011160714285714285d0) & + + in(i-3,j+2) * (-0.0020833333333333333d0) & + + in(i-2,j+2) * (-0.005208333333333333d0) & + + in(i-1,j+2) * (-0.03125d0) & + + in(i+1,j+2) * (0.005208333333333333d0) & + + in(i+2,j+2) * (0.015625d0) & + + in(i+3,j+2) * (0.0020833333333333333d0) & + + in(i+4,j+2) * (0.0011160714285714285d0) & + + in(i+5,j+2) * (0.0006944444444444445d0) & + + in(i+6,j+2) * (0.0004734848484848485d0) & + + in(i+7,j+2) * (0.00034340659340659343d0) & + + 
in(i+8,j+2) * (0.00026041666666666666d0) & + + in(i-8,j+3) * (-0.00026041666666666666d0) & + + in(i-7,j+3) * (-0.00034340659340659343d0) & + + in(i-6,j+3) * (-0.0004734848484848485d0) & + + in(i-5,j+3) * (-0.0006944444444444445d0) & + + in(i-4,j+3) * (-0.0011160714285714285d0) & + + in(i-3,j+3) * (-0.0020833333333333333d0) & + + in(i-2,j+3) * (-0.005208333333333333d0) & + + in(i-1,j+3) * (-0.03125d0) & + + in(i+1,j+3) * (0.0020833333333333333d0) & + + in(i+2,j+3) * (0.0020833333333333333d0) & + + in(i+3,j+3) * (0.010416666666666666d0) & + + in(i+4,j+3) * (0.0011160714285714285d0) & + + in(i+5,j+3) * (0.0006944444444444445d0) & + + in(i+6,j+3) * (0.0004734848484848485d0) & + + in(i+7,j+3) * (0.00034340659340659343d0) & + + in(i+8,j+3) * (0.00026041666666666666d0) & + + in(i-8,j+4) * (-0.00026041666666666666d0) & + + in(i-7,j+4) * (-0.00034340659340659343d0) & + + in(i-6,j+4) * (-0.0004734848484848485d0) & + + in(i-5,j+4) * (-0.0006944444444444445d0) & + + in(i-4,j+4) * (-0.0011160714285714285d0) & + + in(i-3,j+4) * (-0.0020833333333333333d0) & + + in(i-2,j+4) * (-0.005208333333333333d0) & + + in(i-1,j+4) * (-0.03125d0) & + + in(i+1,j+4) * (0.0011160714285714285d0) & + + in(i+2,j+4) * (0.0011160714285714285d0) & + + in(i+3,j+4) * (0.0011160714285714285d0) & + + in(i+4,j+4) * (0.0078125d0) & + + in(i+5,j+4) * (0.0006944444444444445d0) & + + in(i+6,j+4) * (0.0004734848484848485d0) & + + in(i+7,j+4) * (0.00034340659340659343d0) & + + in(i+8,j+4) * (0.00026041666666666666d0) & + + in(i-8,j+5) * (-0.00026041666666666666d0) & + + in(i-7,j+5) * (-0.00034340659340659343d0) & + + in(i-6,j+5) * (-0.0004734848484848485d0) & + + in(i-5,j+5) * (-0.0006944444444444445d0) & + + in(i-4,j+5) * (-0.0011160714285714285d0) & + + in(i-3,j+5) * (-0.0020833333333333333d0) & + + in(i-2,j+5) * (-0.005208333333333333d0) & + + in(i-1,j+5) * (-0.03125d0) & + + in(i+1,j+5) * (0.0006944444444444445d0) & + + in(i+2,j+5) * (0.0006944444444444445d0) & + + in(i+3,j+5) * (0.0006944444444444445d0) & + 
+ in(i+4,j+5) * (0.0006944444444444445d0) & + + in(i+5,j+5) * (0.00625d0) & + + in(i+6,j+5) * (0.0004734848484848485d0) & + + in(i+7,j+5) * (0.00034340659340659343d0) & + + in(i+8,j+5) * (0.00026041666666666666d0) & + + in(i-8,j+6) * (-0.00026041666666666666d0) & + + in(i-7,j+6) * (-0.00034340659340659343d0) & + + in(i-6,j+6) * (-0.0004734848484848485d0) & + + in(i-5,j+6) * (-0.0006944444444444445d0) & + + in(i-4,j+6) * (-0.0011160714285714285d0) & + + in(i-3,j+6) * (-0.0020833333333333333d0) & + + in(i-2,j+6) * (-0.005208333333333333d0) & + + in(i-1,j+6) * (-0.03125d0) & + + in(i+1,j+6) * (0.0004734848484848485d0) & + + in(i+2,j+6) * (0.0004734848484848485d0) & + + in(i+3,j+6) * (0.0004734848484848485d0) & + + in(i+4,j+6) * (0.0004734848484848485d0) & + + in(i+5,j+6) * (0.0004734848484848485d0) & + + in(i+6,j+6) * (0.005208333333333333d0) & + + in(i+7,j+6) * (0.00034340659340659343d0) & + + in(i+8,j+6) * (0.00026041666666666666d0) & + + in(i-8,j+7) * (-0.00026041666666666666d0) & + + in(i-7,j+7) * (-0.00034340659340659343d0) & + + in(i-6,j+7) * (-0.0004734848484848485d0) & + + in(i-5,j+7) * (-0.0006944444444444445d0) & + + in(i-4,j+7) * (-0.0011160714285714285d0) & + + in(i-3,j+7) * (-0.0020833333333333333d0) & + + in(i-2,j+7) * (-0.005208333333333333d0) & + + in(i-1,j+7) * (-0.03125d0) & + + in(i+1,j+7) * (0.00034340659340659343d0) & + + in(i+2,j+7) * (0.00034340659340659343d0) & + + in(i+3,j+7) * (0.00034340659340659343d0) & + + in(i+4,j+7) * (0.00034340659340659343d0) & + + in(i+5,j+7) * (0.00034340659340659343d0) & + + in(i+6,j+7) * (0.00034340659340659343d0) & + + in(i+7,j+7) * (0.004464285714285714d0) & + + in(i+8,j+7) * (0.00026041666666666666d0) & + + in(i-8,j+8) * (-0.00026041666666666666d0) & + + in(i-7,j+8) * (-0.00034340659340659343d0) & + + in(i-6,j+8) * (-0.0004734848484848485d0) & + + in(i-5,j+8) * (-0.0006944444444444445d0) & + + in(i-4,j+8) * (-0.0011160714285714285d0) & + + in(i-3,j+8) * (-0.0020833333333333333d0) & + + in(i-2,j+8) * 
(-0.005208333333333333d0) & + + in(i-1,j+8) * (-0.03125d0) & + + in(i+1,j+8) * (0.00026041666666666666d0) & + + in(i+2,j+8) * (0.00026041666666666666d0) & + + in(i+3,j+8) * (0.00026041666666666666d0) & + + in(i+4,j+8) * (0.00026041666666666666d0) & + + in(i+5,j+8) * (0.00026041666666666666d0) & + + in(i+6,j+8) * (0.00026041666666666666d0) & + + in(i+7,j+8) * (0.00026041666666666666d0) & + + in(i+8,j+8) * (0.00390625d0) & +0.0 end do !$omp end simd @@ -1159,260 +1193,262 @@ subroutine grid9(n, in, out) !$omp do do i=9,n-9-1 !$omp simd + do j=9,n-9-1 + do i=9,n-9-1 do j=9,n-9-1 out(i,j) = out(i,j) & - + in(i-9,j-9) * (-0.0030864197530864196) & - + in(i+1,j-9) * (-0.00018155410312273057) & - + in(i+2,j-9) * (-0.00018155410312273057) & - + in(i+3,j-9) * (-0.00018155410312273057) & - + in(i+4,j-9) * (-0.00018155410312273057) & - + in(i+5,j-9) * (-0.00018155410312273057) & - + in(i+6,j-9) * (-0.00018155410312273057) & - + in(i+7,j-9) * (-0.00018155410312273057) & - + in(i+8,j-9) * (-0.00018155410312273057) & - + in(i+9,j-9) * (-0.00018155410312273057) & - + in(i-8,j-8) * (-0.003472222222222222) & - + in(i+1,j-8) * (-0.0002314814814814815) & - + in(i+2,j-8) * (-0.0002314814814814815) & - + in(i+3,j-8) * (-0.0002314814814814815) & - + in(i+4,j-8) * (-0.0002314814814814815) & - + in(i+5,j-8) * (-0.0002314814814814815) & - + in(i+6,j-8) * (-0.0002314814814814815) & - + in(i+7,j-8) * (-0.0002314814814814815) & - + in(i+8,j-8) * (-0.0002314814814814815) & - + in(i+9,j-8) * (-0.0002314814814814815) & - + in(i-7,j-7) * (-0.003968253968253968) & - + in(i+1,j-7) * (-0.00030525030525030525) & - + in(i+2,j-7) * (-0.00030525030525030525) & - + in(i+3,j-7) * (-0.00030525030525030525) & - + in(i+4,j-7) * (-0.00030525030525030525) & - + in(i+5,j-7) * (-0.00030525030525030525) & - + in(i+6,j-7) * (-0.00030525030525030525) & - + in(i+7,j-7) * (-0.00030525030525030525) & - + in(i+8,j-7) * (-0.00030525030525030525) & - + in(i+9,j-7) * (-0.00030525030525030525) & - + in(i-6,j-6) * 
(-0.004629629629629629) & - + in(i+1,j-6) * (-0.00042087542087542086) & - + in(i+2,j-6) * (-0.00042087542087542086) & - + in(i+3,j-6) * (-0.00042087542087542086) & - + in(i+4,j-6) * (-0.00042087542087542086) & - + in(i+5,j-6) * (-0.00042087542087542086) & - + in(i+6,j-6) * (-0.00042087542087542086) & - + in(i+7,j-6) * (-0.00042087542087542086) & - + in(i+8,j-6) * (-0.00042087542087542086) & - + in(i+9,j-6) * (-0.00042087542087542086) & - + in(i-5,j-5) * (-0.005555555555555556) & - + in(i+1,j-5) * (-0.0006172839506172839) & - + in(i+2,j-5) * (-0.0006172839506172839) & - + in(i+3,j-5) * (-0.0006172839506172839) & - + in(i+4,j-5) * (-0.0006172839506172839) & - + in(i+5,j-5) * (-0.0006172839506172839) & - + in(i+6,j-5) * (-0.0006172839506172839) & - + in(i+7,j-5) * (-0.0006172839506172839) & - + in(i+8,j-5) * (-0.0006172839506172839) & - + in(i+9,j-5) * (-0.0006172839506172839) & - + in(i-4,j-4) * (-0.006944444444444444) & - + in(i+1,j-4) * (-0.000992063492063492) & - + in(i+2,j-4) * (-0.000992063492063492) & - + in(i+3,j-4) * (-0.000992063492063492) & - + in(i+4,j-4) * (-0.000992063492063492) & - + in(i+5,j-4) * (-0.000992063492063492) & - + in(i+6,j-4) * (-0.000992063492063492) & - + in(i+7,j-4) * (-0.000992063492063492) & - + in(i+8,j-4) * (-0.000992063492063492) & - + in(i+9,j-4) * (-0.000992063492063492) & - + in(i-3,j-3) * (-0.009259259259259259) & - + in(i+1,j-3) * (-0.001851851851851852) & - + in(i+2,j-3) * (-0.001851851851851852) & - + in(i+3,j-3) * (-0.001851851851851852) & - + in(i+4,j-3) * (-0.001851851851851852) & - + in(i+5,j-3) * (-0.001851851851851852) & - + in(i+6,j-3) * (-0.001851851851851852) & - + in(i+7,j-3) * (-0.001851851851851852) & - + in(i+8,j-3) * (-0.001851851851851852) & - + in(i+9,j-3) * (-0.001851851851851852) & - + in(i-2,j-2) * (-0.013888888888888888) & - + in(i+1,j-2) * (-0.004629629629629629) & - + in(i+2,j-2) * (-0.004629629629629629) & - + in(i+3,j-2) * (-0.004629629629629629) & - + in(i+4,j-2) * (-0.004629629629629629) & - + 
in(i+5,j-2) * (-0.004629629629629629) & - + in(i+6,j-2) * (-0.004629629629629629) & - + in(i+7,j-2) * (-0.004629629629629629) & - + in(i+8,j-2) * (-0.004629629629629629) & - + in(i+9,j-2) * (-0.004629629629629629) & - + in(i-1,j-1) * (-0.027777777777777776) & - + in(i+1,j-1) * (-0.027777777777777776) & - + in(i+2,j-1) * (-0.027777777777777776) & - + in(i+3,j-1) * (-0.027777777777777776) & - + in(i+4,j-1) * (-0.027777777777777776) & - + in(i+5,j-1) * (-0.027777777777777776) & - + in(i+6,j-1) * (-0.027777777777777776) & - + in(i+7,j-1) * (-0.027777777777777776) & - + in(i+8,j-1) * (-0.027777777777777776) & - + in(i+9,j-1) * (-0.027777777777777776) & - + in(i-9,j+1) * (-0.00018155410312273057) & - + in(i-8,j+1) * (-0.0002314814814814815) & - + in(i-7,j+1) * (-0.00030525030525030525) & - + in(i-6,j+1) * (-0.00042087542087542086) & - + in(i-5,j+1) * (-0.0006172839506172839) & - + in(i-4,j+1) * (-0.000992063492063492) & - + in(i-3,j+1) * (-0.001851851851851852) & - + in(i-2,j+1) * (-0.004629629629629629) & - + in(i-1,j+1) * (-0.027777777777777776) & - + in(i+1,j+1) * (0.027777777777777776) & - + in(i+2,j+1) * (0.004629629629629629) & - + in(i+3,j+1) * (0.001851851851851852) & - + in(i+4,j+1) * (0.000992063492063492) & - + in(i+5,j+1) * (0.0006172839506172839) & - + in(i+6,j+1) * (0.00042087542087542086) & - + in(i+7,j+1) * (0.00030525030525030525) & - + in(i+8,j+1) * (0.0002314814814814815) & - + in(i+9,j+1) * (0.00018155410312273057) & - + in(i-9,j+2) * (-0.00018155410312273057) & - + in(i-8,j+2) * (-0.0002314814814814815) & - + in(i-7,j+2) * (-0.00030525030525030525) & - + in(i-6,j+2) * (-0.00042087542087542086) & - + in(i-5,j+2) * (-0.0006172839506172839) & - + in(i-4,j+2) * (-0.000992063492063492) & - + in(i-3,j+2) * (-0.001851851851851852) & - + in(i-2,j+2) * (-0.004629629629629629) & - + in(i-1,j+2) * (-0.027777777777777776) & - + in(i+1,j+2) * (0.004629629629629629) & - + in(i+2,j+2) * (0.013888888888888888) & - + in(i+3,j+2) * (0.001851851851851852) & - + 
in(i+4,j+2) * (0.000992063492063492) & - + in(i+5,j+2) * (0.0006172839506172839) & - + in(i+6,j+2) * (0.00042087542087542086) & - + in(i+7,j+2) * (0.00030525030525030525) & - + in(i+8,j+2) * (0.0002314814814814815) & - + in(i+9,j+2) * (0.00018155410312273057) & - + in(i-9,j+3) * (-0.00018155410312273057) & - + in(i-8,j+3) * (-0.0002314814814814815) & - + in(i-7,j+3) * (-0.00030525030525030525) & - + in(i-6,j+3) * (-0.00042087542087542086) & - + in(i-5,j+3) * (-0.0006172839506172839) & - + in(i-4,j+3) * (-0.000992063492063492) & - + in(i-3,j+3) * (-0.001851851851851852) & - + in(i-2,j+3) * (-0.004629629629629629) & - + in(i-1,j+3) * (-0.027777777777777776) & - + in(i+1,j+3) * (0.001851851851851852) & - + in(i+2,j+3) * (0.001851851851851852) & - + in(i+3,j+3) * (0.009259259259259259) & - + in(i+4,j+3) * (0.000992063492063492) & - + in(i+5,j+3) * (0.0006172839506172839) & - + in(i+6,j+3) * (0.00042087542087542086) & - + in(i+7,j+3) * (0.00030525030525030525) & - + in(i+8,j+3) * (0.0002314814814814815) & - + in(i+9,j+3) * (0.00018155410312273057) & - + in(i-9,j+4) * (-0.00018155410312273057) & - + in(i-8,j+4) * (-0.0002314814814814815) & - + in(i-7,j+4) * (-0.00030525030525030525) & - + in(i-6,j+4) * (-0.00042087542087542086) & - + in(i-5,j+4) * (-0.0006172839506172839) & - + in(i-4,j+4) * (-0.000992063492063492) & - + in(i-3,j+4) * (-0.001851851851851852) & - + in(i-2,j+4) * (-0.004629629629629629) & - + in(i-1,j+4) * (-0.027777777777777776) & - + in(i+1,j+4) * (0.000992063492063492) & - + in(i+2,j+4) * (0.000992063492063492) & - + in(i+3,j+4) * (0.000992063492063492) & - + in(i+4,j+4) * (0.006944444444444444) & - + in(i+5,j+4) * (0.0006172839506172839) & - + in(i+6,j+4) * (0.00042087542087542086) & - + in(i+7,j+4) * (0.00030525030525030525) & - + in(i+8,j+4) * (0.0002314814814814815) & - + in(i+9,j+4) * (0.00018155410312273057) & - + in(i-9,j+5) * (-0.00018155410312273057) & - + in(i-8,j+5) * (-0.0002314814814814815) & - + in(i-7,j+5) * (-0.00030525030525030525) & - 
+ in(i-6,j+5) * (-0.00042087542087542086) & - + in(i-5,j+5) * (-0.0006172839506172839) & - + in(i-4,j+5) * (-0.000992063492063492) & - + in(i-3,j+5) * (-0.001851851851851852) & - + in(i-2,j+5) * (-0.004629629629629629) & - + in(i-1,j+5) * (-0.027777777777777776) & - + in(i+1,j+5) * (0.0006172839506172839) & - + in(i+2,j+5) * (0.0006172839506172839) & - + in(i+3,j+5) * (0.0006172839506172839) & - + in(i+4,j+5) * (0.0006172839506172839) & - + in(i+5,j+5) * (0.005555555555555556) & - + in(i+6,j+5) * (0.00042087542087542086) & - + in(i+7,j+5) * (0.00030525030525030525) & - + in(i+8,j+5) * (0.0002314814814814815) & - + in(i+9,j+5) * (0.00018155410312273057) & - + in(i-9,j+6) * (-0.00018155410312273057) & - + in(i-8,j+6) * (-0.0002314814814814815) & - + in(i-7,j+6) * (-0.00030525030525030525) & - + in(i-6,j+6) * (-0.00042087542087542086) & - + in(i-5,j+6) * (-0.0006172839506172839) & - + in(i-4,j+6) * (-0.000992063492063492) & - + in(i-3,j+6) * (-0.001851851851851852) & - + in(i-2,j+6) * (-0.004629629629629629) & - + in(i-1,j+6) * (-0.027777777777777776) & - + in(i+1,j+6) * (0.00042087542087542086) & - + in(i+2,j+6) * (0.00042087542087542086) & - + in(i+3,j+6) * (0.00042087542087542086) & - + in(i+4,j+6) * (0.00042087542087542086) & - + in(i+5,j+6) * (0.00042087542087542086) & - + in(i+6,j+6) * (0.004629629629629629) & - + in(i+7,j+6) * (0.00030525030525030525) & - + in(i+8,j+6) * (0.0002314814814814815) & - + in(i+9,j+6) * (0.00018155410312273057) & - + in(i-9,j+7) * (-0.00018155410312273057) & - + in(i-8,j+7) * (-0.0002314814814814815) & - + in(i-7,j+7) * (-0.00030525030525030525) & - + in(i-6,j+7) * (-0.00042087542087542086) & - + in(i-5,j+7) * (-0.0006172839506172839) & - + in(i-4,j+7) * (-0.000992063492063492) & - + in(i-3,j+7) * (-0.001851851851851852) & - + in(i-2,j+7) * (-0.004629629629629629) & - + in(i-1,j+7) * (-0.027777777777777776) & - + in(i+1,j+7) * (0.00030525030525030525) & - + in(i+2,j+7) * (0.00030525030525030525) & - + in(i+3,j+7) * 
(0.00030525030525030525) & - + in(i+4,j+7) * (0.00030525030525030525) & - + in(i+5,j+7) * (0.00030525030525030525) & - + in(i+6,j+7) * (0.00030525030525030525) & - + in(i+7,j+7) * (0.003968253968253968) & - + in(i+8,j+7) * (0.0002314814814814815) & - + in(i+9,j+7) * (0.00018155410312273057) & - + in(i-9,j+8) * (-0.00018155410312273057) & - + in(i-8,j+8) * (-0.0002314814814814815) & - + in(i-7,j+8) * (-0.00030525030525030525) & - + in(i-6,j+8) * (-0.00042087542087542086) & - + in(i-5,j+8) * (-0.0006172839506172839) & - + in(i-4,j+8) * (-0.000992063492063492) & - + in(i-3,j+8) * (-0.001851851851851852) & - + in(i-2,j+8) * (-0.004629629629629629) & - + in(i-1,j+8) * (-0.027777777777777776) & - + in(i+1,j+8) * (0.0002314814814814815) & - + in(i+2,j+8) * (0.0002314814814814815) & - + in(i+3,j+8) * (0.0002314814814814815) & - + in(i+4,j+8) * (0.0002314814814814815) & - + in(i+5,j+8) * (0.0002314814814814815) & - + in(i+6,j+8) * (0.0002314814814814815) & - + in(i+7,j+8) * (0.0002314814814814815) & - + in(i+8,j+8) * (0.003472222222222222) & - + in(i+9,j+8) * (0.00018155410312273057) & - + in(i-9,j+9) * (-0.00018155410312273057) & - + in(i-8,j+9) * (-0.0002314814814814815) & - + in(i-7,j+9) * (-0.00030525030525030525) & - + in(i-6,j+9) * (-0.00042087542087542086) & - + in(i-5,j+9) * (-0.0006172839506172839) & - + in(i-4,j+9) * (-0.000992063492063492) & - + in(i-3,j+9) * (-0.001851851851851852) & - + in(i-2,j+9) * (-0.004629629629629629) & - + in(i-1,j+9) * (-0.027777777777777776) & - + in(i+1,j+9) * (0.00018155410312273057) & - + in(i+2,j+9) * (0.00018155410312273057) & - + in(i+3,j+9) * (0.00018155410312273057) & - + in(i+4,j+9) * (0.00018155410312273057) & - + in(i+5,j+9) * (0.00018155410312273057) & - + in(i+6,j+9) * (0.00018155410312273057) & - + in(i+7,j+9) * (0.00018155410312273057) & - + in(i+8,j+9) * (0.00018155410312273057) & - + in(i+9,j+9) * (0.0030864197530864196) & + + in(i-9,j-9) * (-0.0030864197530864196d0) & + + in(i+1,j-9) * (-0.00018155410312273057d0) & + 
+ in(i+2,j-9) * (-0.00018155410312273057d0) & + + in(i+3,j-9) * (-0.00018155410312273057d0) & + + in(i+4,j-9) * (-0.00018155410312273057d0) & + + in(i+5,j-9) * (-0.00018155410312273057d0) & + + in(i+6,j-9) * (-0.00018155410312273057d0) & + + in(i+7,j-9) * (-0.00018155410312273057d0) & + + in(i+8,j-9) * (-0.00018155410312273057d0) & + + in(i+9,j-9) * (-0.00018155410312273057d0) & + + in(i-8,j-8) * (-0.003472222222222222d0) & + + in(i+1,j-8) * (-0.0002314814814814815d0) & + + in(i+2,j-8) * (-0.0002314814814814815d0) & + + in(i+3,j-8) * (-0.0002314814814814815d0) & + + in(i+4,j-8) * (-0.0002314814814814815d0) & + + in(i+5,j-8) * (-0.0002314814814814815d0) & + + in(i+6,j-8) * (-0.0002314814814814815d0) & + + in(i+7,j-8) * (-0.0002314814814814815d0) & + + in(i+8,j-8) * (-0.0002314814814814815d0) & + + in(i+9,j-8) * (-0.0002314814814814815d0) & + + in(i-7,j-7) * (-0.003968253968253968d0) & + + in(i+1,j-7) * (-0.00030525030525030525d0) & + + in(i+2,j-7) * (-0.00030525030525030525d0) & + + in(i+3,j-7) * (-0.00030525030525030525d0) & + + in(i+4,j-7) * (-0.00030525030525030525d0) & + + in(i+5,j-7) * (-0.00030525030525030525d0) & + + in(i+6,j-7) * (-0.00030525030525030525d0) & + + in(i+7,j-7) * (-0.00030525030525030525d0) & + + in(i+8,j-7) * (-0.00030525030525030525d0) & + + in(i+9,j-7) * (-0.00030525030525030525d0) & + + in(i-6,j-6) * (-0.004629629629629629d0) & + + in(i+1,j-6) * (-0.00042087542087542086d0) & + + in(i+2,j-6) * (-0.00042087542087542086d0) & + + in(i+3,j-6) * (-0.00042087542087542086d0) & + + in(i+4,j-6) * (-0.00042087542087542086d0) & + + in(i+5,j-6) * (-0.00042087542087542086d0) & + + in(i+6,j-6) * (-0.00042087542087542086d0) & + + in(i+7,j-6) * (-0.00042087542087542086d0) & + + in(i+8,j-6) * (-0.00042087542087542086d0) & + + in(i+9,j-6) * (-0.00042087542087542086d0) & + + in(i-5,j-5) * (-0.005555555555555556d0) & + + in(i+1,j-5) * (-0.0006172839506172839d0) & + + in(i+2,j-5) * (-0.0006172839506172839d0) & + + in(i+3,j-5) * (-0.0006172839506172839d0) & + + 
in(i+4,j-5) * (-0.0006172839506172839d0) & + + in(i+5,j-5) * (-0.0006172839506172839d0) & + + in(i+6,j-5) * (-0.0006172839506172839d0) & + + in(i+7,j-5) * (-0.0006172839506172839d0) & + + in(i+8,j-5) * (-0.0006172839506172839d0) & + + in(i+9,j-5) * (-0.0006172839506172839d0) & + + in(i-4,j-4) * (-0.006944444444444444d0) & + + in(i+1,j-4) * (-0.000992063492063492d0) & + + in(i+2,j-4) * (-0.000992063492063492d0) & + + in(i+3,j-4) * (-0.000992063492063492d0) & + + in(i+4,j-4) * (-0.000992063492063492d0) & + + in(i+5,j-4) * (-0.000992063492063492d0) & + + in(i+6,j-4) * (-0.000992063492063492d0) & + + in(i+7,j-4) * (-0.000992063492063492d0) & + + in(i+8,j-4) * (-0.000992063492063492d0) & + + in(i+9,j-4) * (-0.000992063492063492d0) & + + in(i-3,j-3) * (-0.009259259259259259d0) & + + in(i+1,j-3) * (-0.001851851851851852d0) & + + in(i+2,j-3) * (-0.001851851851851852d0) & + + in(i+3,j-3) * (-0.001851851851851852d0) & + + in(i+4,j-3) * (-0.001851851851851852d0) & + + in(i+5,j-3) * (-0.001851851851851852d0) & + + in(i+6,j-3) * (-0.001851851851851852d0) & + + in(i+7,j-3) * (-0.001851851851851852d0) & + + in(i+8,j-3) * (-0.001851851851851852d0) & + + in(i+9,j-3) * (-0.001851851851851852d0) & + + in(i-2,j-2) * (-0.013888888888888888d0) & + + in(i+1,j-2) * (-0.004629629629629629d0) & + + in(i+2,j-2) * (-0.004629629629629629d0) & + + in(i+3,j-2) * (-0.004629629629629629d0) & + + in(i+4,j-2) * (-0.004629629629629629d0) & + + in(i+5,j-2) * (-0.004629629629629629d0) & + + in(i+6,j-2) * (-0.004629629629629629d0) & + + in(i+7,j-2) * (-0.004629629629629629d0) & + + in(i+8,j-2) * (-0.004629629629629629d0) & + + in(i+9,j-2) * (-0.004629629629629629d0) & + + in(i-1,j-1) * (-0.027777777777777776d0) & + + in(i+1,j-1) * (-0.027777777777777776d0) & + + in(i+2,j-1) * (-0.027777777777777776d0) & + + in(i+3,j-1) * (-0.027777777777777776d0) & + + in(i+4,j-1) * (-0.027777777777777776d0) & + + in(i+5,j-1) * (-0.027777777777777776d0) & + + in(i+6,j-1) * (-0.027777777777777776d0) & + + in(i+7,j-1) * 
(-0.027777777777777776d0) & + + in(i+8,j-1) * (-0.027777777777777776d0) & + + in(i+9,j-1) * (-0.027777777777777776d0) & + + in(i-9,j+1) * (-0.00018155410312273057d0) & + + in(i-8,j+1) * (-0.0002314814814814815d0) & + + in(i-7,j+1) * (-0.00030525030525030525d0) & + + in(i-6,j+1) * (-0.00042087542087542086d0) & + + in(i-5,j+1) * (-0.0006172839506172839d0) & + + in(i-4,j+1) * (-0.000992063492063492d0) & + + in(i-3,j+1) * (-0.001851851851851852d0) & + + in(i-2,j+1) * (-0.004629629629629629d0) & + + in(i-1,j+1) * (-0.027777777777777776d0) & + + in(i+1,j+1) * (0.027777777777777776d0) & + + in(i+2,j+1) * (0.004629629629629629d0) & + + in(i+3,j+1) * (0.001851851851851852d0) & + + in(i+4,j+1) * (0.000992063492063492d0) & + + in(i+5,j+1) * (0.0006172839506172839d0) & + + in(i+6,j+1) * (0.00042087542087542086d0) & + + in(i+7,j+1) * (0.00030525030525030525d0) & + + in(i+8,j+1) * (0.0002314814814814815d0) & + + in(i+9,j+1) * (0.00018155410312273057d0) & + + in(i-9,j+2) * (-0.00018155410312273057d0) & + + in(i-8,j+2) * (-0.0002314814814814815d0) & + + in(i-7,j+2) * (-0.00030525030525030525d0) & + + in(i-6,j+2) * (-0.00042087542087542086d0) & + + in(i-5,j+2) * (-0.0006172839506172839d0) & + + in(i-4,j+2) * (-0.000992063492063492d0) & + + in(i-3,j+2) * (-0.001851851851851852d0) & + + in(i-2,j+2) * (-0.004629629629629629d0) & + + in(i-1,j+2) * (-0.027777777777777776d0) & + + in(i+1,j+2) * (0.004629629629629629d0) & + + in(i+2,j+2) * (0.013888888888888888d0) & + + in(i+3,j+2) * (0.001851851851851852d0) & + + in(i+4,j+2) * (0.000992063492063492d0) & + + in(i+5,j+2) * (0.0006172839506172839d0) & + + in(i+6,j+2) * (0.00042087542087542086d0) & + + in(i+7,j+2) * (0.00030525030525030525d0) & + + in(i+8,j+2) * (0.0002314814814814815d0) & + + in(i+9,j+2) * (0.00018155410312273057d0) & + + in(i-9,j+3) * (-0.00018155410312273057d0) & + + in(i-8,j+3) * (-0.0002314814814814815d0) & + + in(i-7,j+3) * (-0.00030525030525030525d0) & + + in(i-6,j+3) * (-0.00042087542087542086d0) & + + in(i-5,j+3) * 
(-0.0006172839506172839d0) & + + in(i-4,j+3) * (-0.000992063492063492d0) & + + in(i-3,j+3) * (-0.001851851851851852d0) & + + in(i-2,j+3) * (-0.004629629629629629d0) & + + in(i-1,j+3) * (-0.027777777777777776d0) & + + in(i+1,j+3) * (0.001851851851851852d0) & + + in(i+2,j+3) * (0.001851851851851852d0) & + + in(i+3,j+3) * (0.009259259259259259d0) & + + in(i+4,j+3) * (0.000992063492063492d0) & + + in(i+5,j+3) * (0.0006172839506172839d0) & + + in(i+6,j+3) * (0.00042087542087542086d0) & + + in(i+7,j+3) * (0.00030525030525030525d0) & + + in(i+8,j+3) * (0.0002314814814814815d0) & + + in(i+9,j+3) * (0.00018155410312273057d0) & + + in(i-9,j+4) * (-0.00018155410312273057d0) & + + in(i-8,j+4) * (-0.0002314814814814815d0) & + + in(i-7,j+4) * (-0.00030525030525030525d0) & + + in(i-6,j+4) * (-0.00042087542087542086d0) & + + in(i-5,j+4) * (-0.0006172839506172839d0) & + + in(i-4,j+4) * (-0.000992063492063492d0) & + + in(i-3,j+4) * (-0.001851851851851852d0) & + + in(i-2,j+4) * (-0.004629629629629629d0) & + + in(i-1,j+4) * (-0.027777777777777776d0) & + + in(i+1,j+4) * (0.000992063492063492d0) & + + in(i+2,j+4) * (0.000992063492063492d0) & + + in(i+3,j+4) * (0.000992063492063492d0) & + + in(i+4,j+4) * (0.006944444444444444d0) & + + in(i+5,j+4) * (0.0006172839506172839d0) & + + in(i+6,j+4) * (0.00042087542087542086d0) & + + in(i+7,j+4) * (0.00030525030525030525d0) & + + in(i+8,j+4) * (0.0002314814814814815d0) & + + in(i+9,j+4) * (0.00018155410312273057d0) & + + in(i-9,j+5) * (-0.00018155410312273057d0) & + + in(i-8,j+5) * (-0.0002314814814814815d0) & + + in(i-7,j+5) * (-0.00030525030525030525d0) & + + in(i-6,j+5) * (-0.00042087542087542086d0) & + + in(i-5,j+5) * (-0.0006172839506172839d0) & + + in(i-4,j+5) * (-0.000992063492063492d0) & + + in(i-3,j+5) * (-0.001851851851851852d0) & + + in(i-2,j+5) * (-0.004629629629629629d0) & + + in(i-1,j+5) * (-0.027777777777777776d0) & + + in(i+1,j+5) * (0.0006172839506172839d0) & + + in(i+2,j+5) * (0.0006172839506172839d0) & + + in(i+3,j+5) * 
(0.0006172839506172839d0) & + + in(i+4,j+5) * (0.0006172839506172839d0) & + + in(i+5,j+5) * (0.005555555555555556d0) & + + in(i+6,j+5) * (0.00042087542087542086d0) & + + in(i+7,j+5) * (0.00030525030525030525d0) & + + in(i+8,j+5) * (0.0002314814814814815d0) & + + in(i+9,j+5) * (0.00018155410312273057d0) & + + in(i-9,j+6) * (-0.00018155410312273057d0) & + + in(i-8,j+6) * (-0.0002314814814814815d0) & + + in(i-7,j+6) * (-0.00030525030525030525d0) & + + in(i-6,j+6) * (-0.00042087542087542086d0) & + + in(i-5,j+6) * (-0.0006172839506172839d0) & + + in(i-4,j+6) * (-0.000992063492063492d0) & + + in(i-3,j+6) * (-0.001851851851851852d0) & + + in(i-2,j+6) * (-0.004629629629629629d0) & + + in(i-1,j+6) * (-0.027777777777777776d0) & + + in(i+1,j+6) * (0.00042087542087542086d0) & + + in(i+2,j+6) * (0.00042087542087542086d0) & + + in(i+3,j+6) * (0.00042087542087542086d0) & + + in(i+4,j+6) * (0.00042087542087542086d0) & + + in(i+5,j+6) * (0.00042087542087542086d0) & + + in(i+6,j+6) * (0.004629629629629629d0) & + + in(i+7,j+6) * (0.00030525030525030525d0) & + + in(i+8,j+6) * (0.0002314814814814815d0) & + + in(i+9,j+6) * (0.00018155410312273057d0) & + + in(i-9,j+7) * (-0.00018155410312273057d0) & + + in(i-8,j+7) * (-0.0002314814814814815d0) & + + in(i-7,j+7) * (-0.00030525030525030525d0) & + + in(i-6,j+7) * (-0.00042087542087542086d0) & + + in(i-5,j+7) * (-0.0006172839506172839d0) & + + in(i-4,j+7) * (-0.000992063492063492d0) & + + in(i-3,j+7) * (-0.001851851851851852d0) & + + in(i-2,j+7) * (-0.004629629629629629d0) & + + in(i-1,j+7) * (-0.027777777777777776d0) & + + in(i+1,j+7) * (0.00030525030525030525d0) & + + in(i+2,j+7) * (0.00030525030525030525d0) & + + in(i+3,j+7) * (0.00030525030525030525d0) & + + in(i+4,j+7) * (0.00030525030525030525d0) & + + in(i+5,j+7) * (0.00030525030525030525d0) & + + in(i+6,j+7) * (0.00030525030525030525d0) & + + in(i+7,j+7) * (0.003968253968253968d0) & + + in(i+8,j+7) * (0.0002314814814814815d0) & + + in(i+9,j+7) * (0.00018155410312273057d0) & + + 
in(i-9,j+8) * (-0.00018155410312273057d0) & + + in(i-8,j+8) * (-0.0002314814814814815d0) & + + in(i-7,j+8) * (-0.00030525030525030525d0) & + + in(i-6,j+8) * (-0.00042087542087542086d0) & + + in(i-5,j+8) * (-0.0006172839506172839d0) & + + in(i-4,j+8) * (-0.000992063492063492d0) & + + in(i-3,j+8) * (-0.001851851851851852d0) & + + in(i-2,j+8) * (-0.004629629629629629d0) & + + in(i-1,j+8) * (-0.027777777777777776d0) & + + in(i+1,j+8) * (0.0002314814814814815d0) & + + in(i+2,j+8) * (0.0002314814814814815d0) & + + in(i+3,j+8) * (0.0002314814814814815d0) & + + in(i+4,j+8) * (0.0002314814814814815d0) & + + in(i+5,j+8) * (0.0002314814814814815d0) & + + in(i+6,j+8) * (0.0002314814814814815d0) & + + in(i+7,j+8) * (0.0002314814814814815d0) & + + in(i+8,j+8) * (0.003472222222222222d0) & + + in(i+9,j+8) * (0.00018155410312273057d0) & + + in(i-9,j+9) * (-0.00018155410312273057d0) & + + in(i-8,j+9) * (-0.0002314814814814815d0) & + + in(i-7,j+9) * (-0.00030525030525030525d0) & + + in(i-6,j+9) * (-0.00042087542087542086d0) & + + in(i-5,j+9) * (-0.0006172839506172839d0) & + + in(i-4,j+9) * (-0.000992063492063492d0) & + + in(i-3,j+9) * (-0.001851851851851852d0) & + + in(i-2,j+9) * (-0.004629629629629629d0) & + + in(i-1,j+9) * (-0.027777777777777776d0) & + + in(i+1,j+9) * (0.00018155410312273057d0) & + + in(i+2,j+9) * (0.00018155410312273057d0) & + + in(i+3,j+9) * (0.00018155410312273057d0) & + + in(i+4,j+9) * (0.00018155410312273057d0) & + + in(i+5,j+9) * (0.00018155410312273057d0) & + + in(i+6,j+9) * (0.00018155410312273057d0) & + + in(i+7,j+9) * (0.00018155410312273057d0) & + + in(i+8,j+9) * (0.00018155410312273057d0) & + + in(i+9,j+9) * (0.0030864197530864196d0) & +0.0 end do !$omp end simd diff --git a/FORTRAN/stencil_pretty.f90 b/FORTRAN/stencil_pretty.f90 index 5e2b50d4e..cb4bf8052 100644 --- a/FORTRAN/stencil_pretty.f90 +++ b/FORTRAN/stencil_pretty.f90 @@ -8,10 +8,10 @@ subroutine star1(n, in, out) do i=1,n-1-1 do j=1,n-1-1 out(i,j) = out(i,j) & - + in(i+0,j-1) * (-0.5) & - + 
in(i-1,j+0) * (-0.5) & - + in(i+1,j+0) * (0.5) & - + in(i+0,j+1) * (0.5) & + + in(i+0,j-1) * (-0.5d0) & + + in(i-1,j+0) * (-0.5d0) & + + in(i+1,j+0) * (0.5d0) & + + in(i+0,j+1) * (0.5d0) & +0.0 end do end do @@ -27,14 +27,14 @@ subroutine star2(n, in, out) do i=2,n-2-1 do j=2,n-2-1 out(i,j) = out(i,j) & - + in(i+0,j-2) * (-0.125) & - + in(i+0,j-1) * (-0.25) & - + in(i-2,j+0) * (-0.125) & - + in(i-1,j+0) * (-0.25) & - + in(i+1,j+0) * (0.25) & - + in(i+2,j+0) * (0.125) & - + in(i+0,j+1) * (0.25) & - + in(i+0,j+2) * (0.125) & + + in(i+0,j-2) * (-0.125d0) & + + in(i+0,j-1) * (-0.25d0) & + + in(i-2,j+0) * (-0.125d0) & + + in(i-1,j+0) * (-0.25d0) & + + in(i+1,j+0) * (0.25d0) & + + in(i+2,j+0) * (0.125d0) & + + in(i+0,j+1) * (0.25d0) & + + in(i+0,j+2) * (0.125d0) & +0.0 end do end do @@ -50,18 +50,18 @@ subroutine star3(n, in, out) do i=3,n-3-1 do j=3,n-3-1 out(i,j) = out(i,j) & - + in(i+0,j-3) * (-0.05555555555555555) & - + in(i+0,j-2) * (-0.08333333333333333) & - + in(i+0,j-1) * (-0.16666666666666666) & - + in(i-3,j+0) * (-0.05555555555555555) & - + in(i-2,j+0) * (-0.08333333333333333) & - + in(i-1,j+0) * (-0.16666666666666666) & - + in(i+1,j+0) * (0.16666666666666666) & - + in(i+2,j+0) * (0.08333333333333333) & - + in(i+3,j+0) * (0.05555555555555555) & - + in(i+0,j+1) * (0.16666666666666666) & - + in(i+0,j+2) * (0.08333333333333333) & - + in(i+0,j+3) * (0.05555555555555555) & + + in(i+0,j-3) * (-0.05555555555555555d0) & + + in(i+0,j-2) * (-0.08333333333333333d0) & + + in(i+0,j-1) * (-0.16666666666666666d0) & + + in(i-3,j+0) * (-0.05555555555555555d0) & + + in(i-2,j+0) * (-0.08333333333333333d0) & + + in(i-1,j+0) * (-0.16666666666666666d0) & + + in(i+1,j+0) * (0.16666666666666666d0) & + + in(i+2,j+0) * (0.08333333333333333d0) & + + in(i+3,j+0) * (0.05555555555555555d0) & + + in(i+0,j+1) * (0.16666666666666666d0) & + + in(i+0,j+2) * (0.08333333333333333d0) & + + in(i+0,j+3) * (0.05555555555555555d0) & +0.0 end do end do @@ -77,22 +77,22 @@ subroutine star4(n, in, out) do 
i=4,n-4-1 do j=4,n-4-1 out(i,j) = out(i,j) & - + in(i+0,j-4) * (-0.03125) & - + in(i+0,j-3) * (-0.041666666666666664) & - + in(i+0,j-2) * (-0.0625) & - + in(i+0,j-1) * (-0.125) & - + in(i-4,j+0) * (-0.03125) & - + in(i-3,j+0) * (-0.041666666666666664) & - + in(i-2,j+0) * (-0.0625) & - + in(i-1,j+0) * (-0.125) & - + in(i+1,j+0) * (0.125) & - + in(i+2,j+0) * (0.0625) & - + in(i+3,j+0) * (0.041666666666666664) & - + in(i+4,j+0) * (0.03125) & - + in(i+0,j+1) * (0.125) & - + in(i+0,j+2) * (0.0625) & - + in(i+0,j+3) * (0.041666666666666664) & - + in(i+0,j+4) * (0.03125) & + + in(i+0,j-4) * (-0.03125d0) & + + in(i+0,j-3) * (-0.041666666666666664d0) & + + in(i+0,j-2) * (-0.0625d0) & + + in(i+0,j-1) * (-0.125d0) & + + in(i-4,j+0) * (-0.03125d0) & + + in(i-3,j+0) * (-0.041666666666666664d0) & + + in(i-2,j+0) * (-0.0625d0) & + + in(i-1,j+0) * (-0.125d0) & + + in(i+1,j+0) * (0.125d0) & + + in(i+2,j+0) * (0.0625d0) & + + in(i+3,j+0) * (0.041666666666666664d0) & + + in(i+4,j+0) * (0.03125d0) & + + in(i+0,j+1) * (0.125d0) & + + in(i+0,j+2) * (0.0625d0) & + + in(i+0,j+3) * (0.041666666666666664d0) & + + in(i+0,j+4) * (0.03125d0) & +0.0 end do end do @@ -108,26 +108,26 @@ subroutine star5(n, in, out) do i=5,n-5-1 do j=5,n-5-1 out(i,j) = out(i,j) & - + in(i+0,j-5) * (-0.02) & - + in(i+0,j-4) * (-0.025) & - + in(i+0,j-3) * (-0.03333333333333333) & - + in(i+0,j-2) * (-0.05) & - + in(i+0,j-1) * (-0.1) & - + in(i-5,j+0) * (-0.02) & - + in(i-4,j+0) * (-0.025) & - + in(i-3,j+0) * (-0.03333333333333333) & - + in(i-2,j+0) * (-0.05) & - + in(i-1,j+0) * (-0.1) & - + in(i+1,j+0) * (0.1) & - + in(i+2,j+0) * (0.05) & - + in(i+3,j+0) * (0.03333333333333333) & - + in(i+4,j+0) * (0.025) & - + in(i+5,j+0) * (0.02) & - + in(i+0,j+1) * (0.1) & - + in(i+0,j+2) * (0.05) & - + in(i+0,j+3) * (0.03333333333333333) & - + in(i+0,j+4) * (0.025) & - + in(i+0,j+5) * (0.02) & + + in(i+0,j-5) * (-0.02d0) & + + in(i+0,j-4) * (-0.025d0) & + + in(i+0,j-3) * (-0.03333333333333333d0) & + + in(i+0,j-2) * (-0.05d0) & + 
+ in(i+0,j-1) * (-0.1d0) & + + in(i-5,j+0) * (-0.02d0) & + + in(i-4,j+0) * (-0.025d0) & + + in(i-3,j+0) * (-0.03333333333333333d0) & + + in(i-2,j+0) * (-0.05d0) & + + in(i-1,j+0) * (-0.1d0) & + + in(i+1,j+0) * (0.1d0) & + + in(i+2,j+0) * (0.05d0) & + + in(i+3,j+0) * (0.03333333333333333d0) & + + in(i+4,j+0) * (0.025d0) & + + in(i+5,j+0) * (0.02d0) & + + in(i+0,j+1) * (0.1d0) & + + in(i+0,j+2) * (0.05d0) & + + in(i+0,j+3) * (0.03333333333333333d0) & + + in(i+0,j+4) * (0.025d0) & + + in(i+0,j+5) * (0.02d0) & +0.0 end do end do @@ -143,30 +143,30 @@ subroutine star6(n, in, out) do i=6,n-6-1 do j=6,n-6-1 out(i,j) = out(i,j) & - + in(i+0,j-6) * (-0.013888888888888888) & - + in(i+0,j-5) * (-0.016666666666666666) & - + in(i+0,j-4) * (-0.020833333333333332) & - + in(i+0,j-3) * (-0.027777777777777776) & - + in(i+0,j-2) * (-0.041666666666666664) & - + in(i+0,j-1) * (-0.08333333333333333) & - + in(i-6,j+0) * (-0.013888888888888888) & - + in(i-5,j+0) * (-0.016666666666666666) & - + in(i-4,j+0) * (-0.020833333333333332) & - + in(i-3,j+0) * (-0.027777777777777776) & - + in(i-2,j+0) * (-0.041666666666666664) & - + in(i-1,j+0) * (-0.08333333333333333) & - + in(i+1,j+0) * (0.08333333333333333) & - + in(i+2,j+0) * (0.041666666666666664) & - + in(i+3,j+0) * (0.027777777777777776) & - + in(i+4,j+0) * (0.020833333333333332) & - + in(i+5,j+0) * (0.016666666666666666) & - + in(i+6,j+0) * (0.013888888888888888) & - + in(i+0,j+1) * (0.08333333333333333) & - + in(i+0,j+2) * (0.041666666666666664) & - + in(i+0,j+3) * (0.027777777777777776) & - + in(i+0,j+4) * (0.020833333333333332) & - + in(i+0,j+5) * (0.016666666666666666) & - + in(i+0,j+6) * (0.013888888888888888) & + + in(i+0,j-6) * (-0.013888888888888888d0) & + + in(i+0,j-5) * (-0.016666666666666666d0) & + + in(i+0,j-4) * (-0.020833333333333332d0) & + + in(i+0,j-3) * (-0.027777777777777776d0) & + + in(i+0,j-2) * (-0.041666666666666664d0) & + + in(i+0,j-1) * (-0.08333333333333333d0) & + + in(i-6,j+0) * (-0.013888888888888888d0) & + + 
in(i-5,j+0) * (-0.016666666666666666d0) & + + in(i-4,j+0) * (-0.020833333333333332d0) & + + in(i-3,j+0) * (-0.027777777777777776d0) & + + in(i-2,j+0) * (-0.041666666666666664d0) & + + in(i-1,j+0) * (-0.08333333333333333d0) & + + in(i+1,j+0) * (0.08333333333333333d0) & + + in(i+2,j+0) * (0.041666666666666664d0) & + + in(i+3,j+0) * (0.027777777777777776d0) & + + in(i+4,j+0) * (0.020833333333333332d0) & + + in(i+5,j+0) * (0.016666666666666666d0) & + + in(i+6,j+0) * (0.013888888888888888d0) & + + in(i+0,j+1) * (0.08333333333333333d0) & + + in(i+0,j+2) * (0.041666666666666664d0) & + + in(i+0,j+3) * (0.027777777777777776d0) & + + in(i+0,j+4) * (0.020833333333333332d0) & + + in(i+0,j+5) * (0.016666666666666666d0) & + + in(i+0,j+6) * (0.013888888888888888d0) & +0.0 end do end do @@ -182,34 +182,34 @@ subroutine star7(n, in, out) do i=7,n-7-1 do j=7,n-7-1 out(i,j) = out(i,j) & - + in(i+0,j-7) * (-0.01020408163265306) & - + in(i+0,j-6) * (-0.011904761904761904) & - + in(i+0,j-5) * (-0.014285714285714285) & - + in(i+0,j-4) * (-0.017857142857142856) & - + in(i+0,j-3) * (-0.023809523809523808) & - + in(i+0,j-2) * (-0.03571428571428571) & - + in(i+0,j-1) * (-0.07142857142857142) & - + in(i-7,j+0) * (-0.01020408163265306) & - + in(i-6,j+0) * (-0.011904761904761904) & - + in(i-5,j+0) * (-0.014285714285714285) & - + in(i-4,j+0) * (-0.017857142857142856) & - + in(i-3,j+0) * (-0.023809523809523808) & - + in(i-2,j+0) * (-0.03571428571428571) & - + in(i-1,j+0) * (-0.07142857142857142) & - + in(i+1,j+0) * (0.07142857142857142) & - + in(i+2,j+0) * (0.03571428571428571) & - + in(i+3,j+0) * (0.023809523809523808) & - + in(i+4,j+0) * (0.017857142857142856) & - + in(i+5,j+0) * (0.014285714285714285) & - + in(i+6,j+0) * (0.011904761904761904) & - + in(i+7,j+0) * (0.01020408163265306) & - + in(i+0,j+1) * (0.07142857142857142) & - + in(i+0,j+2) * (0.03571428571428571) & - + in(i+0,j+3) * (0.023809523809523808) & - + in(i+0,j+4) * (0.017857142857142856) & - + in(i+0,j+5) * (0.014285714285714285) 
& - + in(i+0,j+6) * (0.011904761904761904) & - + in(i+0,j+7) * (0.01020408163265306) & + + in(i+0,j-7) * (-0.01020408163265306d0) & + + in(i+0,j-6) * (-0.011904761904761904d0) & + + in(i+0,j-5) * (-0.014285714285714285d0) & + + in(i+0,j-4) * (-0.017857142857142856d0) & + + in(i+0,j-3) * (-0.023809523809523808d0) & + + in(i+0,j-2) * (-0.03571428571428571d0) & + + in(i+0,j-1) * (-0.07142857142857142d0) & + + in(i-7,j+0) * (-0.01020408163265306d0) & + + in(i-6,j+0) * (-0.011904761904761904d0) & + + in(i-5,j+0) * (-0.014285714285714285d0) & + + in(i-4,j+0) * (-0.017857142857142856d0) & + + in(i-3,j+0) * (-0.023809523809523808d0) & + + in(i-2,j+0) * (-0.03571428571428571d0) & + + in(i-1,j+0) * (-0.07142857142857142d0) & + + in(i+1,j+0) * (0.07142857142857142d0) & + + in(i+2,j+0) * (0.03571428571428571d0) & + + in(i+3,j+0) * (0.023809523809523808d0) & + + in(i+4,j+0) * (0.017857142857142856d0) & + + in(i+5,j+0) * (0.014285714285714285d0) & + + in(i+6,j+0) * (0.011904761904761904d0) & + + in(i+7,j+0) * (0.01020408163265306d0) & + + in(i+0,j+1) * (0.07142857142857142d0) & + + in(i+0,j+2) * (0.03571428571428571d0) & + + in(i+0,j+3) * (0.023809523809523808d0) & + + in(i+0,j+4) * (0.017857142857142856d0) & + + in(i+0,j+5) * (0.014285714285714285d0) & + + in(i+0,j+6) * (0.011904761904761904d0) & + + in(i+0,j+7) * (0.01020408163265306d0) & +0.0 end do end do @@ -225,38 +225,38 @@ subroutine star8(n, in, out) do i=8,n-8-1 do j=8,n-8-1 out(i,j) = out(i,j) & - + in(i+0,j-8) * (-0.0078125) & - + in(i+0,j-7) * (-0.008928571428571428) & - + in(i+0,j-6) * (-0.010416666666666666) & - + in(i+0,j-5) * (-0.0125) & - + in(i+0,j-4) * (-0.015625) & - + in(i+0,j-3) * (-0.020833333333333332) & - + in(i+0,j-2) * (-0.03125) & - + in(i+0,j-1) * (-0.0625) & - + in(i-8,j+0) * (-0.0078125) & - + in(i-7,j+0) * (-0.008928571428571428) & - + in(i-6,j+0) * (-0.010416666666666666) & - + in(i-5,j+0) * (-0.0125) & - + in(i-4,j+0) * (-0.015625) & - + in(i-3,j+0) * (-0.020833333333333332) & - + in(i-2,j+0) * 
(-0.03125) & - + in(i-1,j+0) * (-0.0625) & - + in(i+1,j+0) * (0.0625) & - + in(i+2,j+0) * (0.03125) & - + in(i+3,j+0) * (0.020833333333333332) & - + in(i+4,j+0) * (0.015625) & - + in(i+5,j+0) * (0.0125) & - + in(i+6,j+0) * (0.010416666666666666) & - + in(i+7,j+0) * (0.008928571428571428) & - + in(i+8,j+0) * (0.0078125) & - + in(i+0,j+1) * (0.0625) & - + in(i+0,j+2) * (0.03125) & - + in(i+0,j+3) * (0.020833333333333332) & - + in(i+0,j+4) * (0.015625) & - + in(i+0,j+5) * (0.0125) & - + in(i+0,j+6) * (0.010416666666666666) & - + in(i+0,j+7) * (0.008928571428571428) & - + in(i+0,j+8) * (0.0078125) & + + in(i+0,j-8) * (-0.0078125d0) & + + in(i+0,j-7) * (-0.008928571428571428d0) & + + in(i+0,j-6) * (-0.010416666666666666d0) & + + in(i+0,j-5) * (-0.0125d0) & + + in(i+0,j-4) * (-0.015625d0) & + + in(i+0,j-3) * (-0.020833333333333332d0) & + + in(i+0,j-2) * (-0.03125d0) & + + in(i+0,j-1) * (-0.0625d0) & + + in(i-8,j+0) * (-0.0078125d0) & + + in(i-7,j+0) * (-0.008928571428571428d0) & + + in(i-6,j+0) * (-0.010416666666666666d0) & + + in(i-5,j+0) * (-0.0125d0) & + + in(i-4,j+0) * (-0.015625d0) & + + in(i-3,j+0) * (-0.020833333333333332d0) & + + in(i-2,j+0) * (-0.03125d0) & + + in(i-1,j+0) * (-0.0625d0) & + + in(i+1,j+0) * (0.0625d0) & + + in(i+2,j+0) * (0.03125d0) & + + in(i+3,j+0) * (0.020833333333333332d0) & + + in(i+4,j+0) * (0.015625d0) & + + in(i+5,j+0) * (0.0125d0) & + + in(i+6,j+0) * (0.010416666666666666d0) & + + in(i+7,j+0) * (0.008928571428571428d0) & + + in(i+8,j+0) * (0.0078125d0) & + + in(i+0,j+1) * (0.0625d0) & + + in(i+0,j+2) * (0.03125d0) & + + in(i+0,j+3) * (0.020833333333333332d0) & + + in(i+0,j+4) * (0.015625d0) & + + in(i+0,j+5) * (0.0125d0) & + + in(i+0,j+6) * (0.010416666666666666d0) & + + in(i+0,j+7) * (0.008928571428571428d0) & + + in(i+0,j+8) * (0.0078125d0) & +0.0 end do end do @@ -272,42 +272,42 @@ subroutine star9(n, in, out) do i=9,n-9-1 do j=9,n-9-1 out(i,j) = out(i,j) & - + in(i+0,j-9) * (-0.006172839506172839) & - + in(i+0,j-8) * 
(-0.006944444444444444) & - + in(i+0,j-7) * (-0.007936507936507936) & - + in(i+0,j-6) * (-0.009259259259259259) & - + in(i+0,j-5) * (-0.011111111111111112) & - + in(i+0,j-4) * (-0.013888888888888888) & - + in(i+0,j-3) * (-0.018518518518518517) & - + in(i+0,j-2) * (-0.027777777777777776) & - + in(i+0,j-1) * (-0.05555555555555555) & - + in(i-9,j+0) * (-0.006172839506172839) & - + in(i-8,j+0) * (-0.006944444444444444) & - + in(i-7,j+0) * (-0.007936507936507936) & - + in(i-6,j+0) * (-0.009259259259259259) & - + in(i-5,j+0) * (-0.011111111111111112) & - + in(i-4,j+0) * (-0.013888888888888888) & - + in(i-3,j+0) * (-0.018518518518518517) & - + in(i-2,j+0) * (-0.027777777777777776) & - + in(i-1,j+0) * (-0.05555555555555555) & - + in(i+1,j+0) * (0.05555555555555555) & - + in(i+2,j+0) * (0.027777777777777776) & - + in(i+3,j+0) * (0.018518518518518517) & - + in(i+4,j+0) * (0.013888888888888888) & - + in(i+5,j+0) * (0.011111111111111112) & - + in(i+6,j+0) * (0.009259259259259259) & - + in(i+7,j+0) * (0.007936507936507936) & - + in(i+8,j+0) * (0.006944444444444444) & - + in(i+9,j+0) * (0.006172839506172839) & - + in(i+0,j+1) * (0.05555555555555555) & - + in(i+0,j+2) * (0.027777777777777776) & - + in(i+0,j+3) * (0.018518518518518517) & - + in(i+0,j+4) * (0.013888888888888888) & - + in(i+0,j+5) * (0.011111111111111112) & - + in(i+0,j+6) * (0.009259259259259259) & - + in(i+0,j+7) * (0.007936507936507936) & - + in(i+0,j+8) * (0.006944444444444444) & - + in(i+0,j+9) * (0.006172839506172839) & + + in(i+0,j-9) * (-0.006172839506172839d0) & + + in(i+0,j-8) * (-0.006944444444444444d0) & + + in(i+0,j-7) * (-0.007936507936507936d0) & + + in(i+0,j-6) * (-0.009259259259259259d0) & + + in(i+0,j-5) * (-0.011111111111111112d0) & + + in(i+0,j-4) * (-0.013888888888888888d0) & + + in(i+0,j-3) * (-0.018518518518518517d0) & + + in(i+0,j-2) * (-0.027777777777777776d0) & + + in(i+0,j-1) * (-0.05555555555555555d0) & + + in(i-9,j+0) * (-0.006172839506172839d0) & + + in(i-8,j+0) * 
(-0.006944444444444444d0) & + + in(i-7,j+0) * (-0.007936507936507936d0) & + + in(i-6,j+0) * (-0.009259259259259259d0) & + + in(i-5,j+0) * (-0.011111111111111112d0) & + + in(i-4,j+0) * (-0.013888888888888888d0) & + + in(i-3,j+0) * (-0.018518518518518517d0) & + + in(i-2,j+0) * (-0.027777777777777776d0) & + + in(i-1,j+0) * (-0.05555555555555555d0) & + + in(i+1,j+0) * (0.05555555555555555d0) & + + in(i+2,j+0) * (0.027777777777777776d0) & + + in(i+3,j+0) * (0.018518518518518517d0) & + + in(i+4,j+0) * (0.013888888888888888d0) & + + in(i+5,j+0) * (0.011111111111111112d0) & + + in(i+6,j+0) * (0.009259259259259259d0) & + + in(i+7,j+0) * (0.007936507936507936d0) & + + in(i+8,j+0) * (0.006944444444444444d0) & + + in(i+9,j+0) * (0.006172839506172839d0) & + + in(i+0,j+1) * (0.05555555555555555d0) & + + in(i+0,j+2) * (0.027777777777777776d0) & + + in(i+0,j+3) * (0.018518518518518517d0) & + + in(i+0,j+4) * (0.013888888888888888d0) & + + in(i+0,j+5) * (0.011111111111111112d0) & + + in(i+0,j+6) * (0.009259259259259259d0) & + + in(i+0,j+7) * (0.007936507936507936d0) & + + in(i+0,j+8) * (0.006944444444444444d0) & + + in(i+0,j+9) * (0.006172839506172839d0) & +0.0 end do end do @@ -323,10 +323,10 @@ subroutine grid1(n, in, out) do i=1,n-1-1 do j=1,n-1-1 out(i,j) = out(i,j) & - + in(i-1,j-1) * (-0.25) & - + in(i+1,j-1) * (-0.25) & - + in(i-1,j+1) * (-0.25) & - + in(i+1,j+1) * (0.25) & + + in(i-1,j-1) * (-0.25d0) & + + in(i+1,j-1) * (-0.25d0) & + + in(i-1,j+1) * (-0.25d0) & + + in(i+1,j+1) * (0.25d0) & +0.0 end do end do @@ -342,20 +342,20 @@ subroutine grid2(n, in, out) do i=2,n-2-1 do j=2,n-2-1 out(i,j) = out(i,j) & - + in(i-2,j-2) * (-0.0625) & - + in(i+1,j-2) * (-0.020833333333333332) & - + in(i+2,j-2) * (-0.020833333333333332) & - + in(i-1,j-1) * (-0.125) & - + in(i+1,j-1) * (-0.125) & - + in(i+2,j-1) * (-0.125) & - + in(i-2,j+1) * (-0.020833333333333332) & - + in(i-1,j+1) * (-0.125) & - + in(i+1,j+1) * (0.125) & - + in(i+2,j+1) * (0.020833333333333332) & - + in(i-2,j+2) * 
(-0.020833333333333332) & - + in(i-1,j+2) * (-0.125) & - + in(i+1,j+2) * (0.020833333333333332) & - + in(i+2,j+2) * (0.0625) & + + in(i-2,j-2) * (-0.0625d0) & + + in(i+1,j-2) * (-0.020833333333333332d0) & + + in(i+2,j-2) * (-0.020833333333333332d0) & + + in(i-1,j-1) * (-0.125d0) & + + in(i+1,j-1) * (-0.125d0) & + + in(i+2,j-1) * (-0.125d0) & + + in(i-2,j+1) * (-0.020833333333333332d0) & + + in(i-1,j+1) * (-0.125d0) & + + in(i+1,j+1) * (0.125d0) & + + in(i+2,j+1) * (0.020833333333333332d0) & + + in(i-2,j+2) * (-0.020833333333333332d0) & + + in(i-1,j+2) * (-0.125d0) & + + in(i+1,j+2) * (0.020833333333333332d0) & + + in(i+2,j+2) * (0.0625d0) & +0.0 end do end do @@ -371,36 +371,36 @@ subroutine grid3(n, in, out) do i=3,n-3-1 do j=3,n-3-1 out(i,j) = out(i,j) & - + in(i-3,j-3) * (-0.027777777777777776) & - + in(i+1,j-3) * (-0.005555555555555556) & - + in(i+2,j-3) * (-0.005555555555555556) & - + in(i+3,j-3) * (-0.005555555555555556) & - + in(i-2,j-2) * (-0.041666666666666664) & - + in(i+1,j-2) * (-0.013888888888888888) & - + in(i+2,j-2) * (-0.013888888888888888) & - + in(i+3,j-2) * (-0.013888888888888888) & - + in(i-1,j-1) * (-0.08333333333333333) & - + in(i+1,j-1) * (-0.08333333333333333) & - + in(i+2,j-1) * (-0.08333333333333333) & - + in(i+3,j-1) * (-0.08333333333333333) & - + in(i-3,j+1) * (-0.005555555555555556) & - + in(i-2,j+1) * (-0.013888888888888888) & - + in(i-1,j+1) * (-0.08333333333333333) & - + in(i+1,j+1) * (0.08333333333333333) & - + in(i+2,j+1) * (0.013888888888888888) & - + in(i+3,j+1) * (0.005555555555555556) & - + in(i-3,j+2) * (-0.005555555555555556) & - + in(i-2,j+2) * (-0.013888888888888888) & - + in(i-1,j+2) * (-0.08333333333333333) & - + in(i+1,j+2) * (0.013888888888888888) & - + in(i+2,j+2) * (0.041666666666666664) & - + in(i+3,j+2) * (0.005555555555555556) & - + in(i-3,j+3) * (-0.005555555555555556) & - + in(i-2,j+3) * (-0.013888888888888888) & - + in(i-1,j+3) * (-0.08333333333333333) & - + in(i+1,j+3) * (0.005555555555555556) & - + in(i+2,j+3) 
* (0.005555555555555556) & - + in(i+3,j+3) * (0.027777777777777776) & + + in(i-3,j-3) * (-0.027777777777777776d0) & + + in(i+1,j-3) * (-0.005555555555555556d0) & + + in(i+2,j-3) * (-0.005555555555555556d0) & + + in(i+3,j-3) * (-0.005555555555555556d0) & + + in(i-2,j-2) * (-0.041666666666666664d0) & + + in(i+1,j-2) * (-0.013888888888888888d0) & + + in(i+2,j-2) * (-0.013888888888888888d0) & + + in(i+3,j-2) * (-0.013888888888888888d0) & + + in(i-1,j-1) * (-0.08333333333333333d0) & + + in(i+1,j-1) * (-0.08333333333333333d0) & + + in(i+2,j-1) * (-0.08333333333333333d0) & + + in(i+3,j-1) * (-0.08333333333333333d0) & + + in(i-3,j+1) * (-0.005555555555555556d0) & + + in(i-2,j+1) * (-0.013888888888888888d0) & + + in(i-1,j+1) * (-0.08333333333333333d0) & + + in(i+1,j+1) * (0.08333333333333333d0) & + + in(i+2,j+1) * (0.013888888888888888d0) & + + in(i+3,j+1) * (0.005555555555555556d0) & + + in(i-3,j+2) * (-0.005555555555555556d0) & + + in(i-2,j+2) * (-0.013888888888888888d0) & + + in(i-1,j+2) * (-0.08333333333333333d0) & + + in(i+1,j+2) * (0.013888888888888888d0) & + + in(i+2,j+2) * (0.041666666666666664d0) & + + in(i+3,j+2) * (0.005555555555555556d0) & + + in(i-3,j+3) * (-0.005555555555555556d0) & + + in(i-2,j+3) * (-0.013888888888888888d0) & + + in(i-1,j+3) * (-0.08333333333333333d0) & + + in(i+1,j+3) * (0.005555555555555556d0) & + + in(i+2,j+3) * (0.005555555555555556d0) & + + in(i+3,j+3) * (0.027777777777777776d0) & +0.0 end do end do @@ -416,58 +416,58 @@ subroutine grid4(n, in, out) do i=4,n-4-1 do j=4,n-4-1 out(i,j) = out(i,j) & - + in(i-4,j-4) * (-0.015625) & - + in(i+1,j-4) * (-0.002232142857142857) & - + in(i+2,j-4) * (-0.002232142857142857) & - + in(i+3,j-4) * (-0.002232142857142857) & - + in(i+4,j-4) * (-0.002232142857142857) & - + in(i-3,j-3) * (-0.020833333333333332) & - + in(i+1,j-3) * (-0.004166666666666667) & - + in(i+2,j-3) * (-0.004166666666666667) & - + in(i+3,j-3) * (-0.004166666666666667) & - + in(i+4,j-3) * (-0.004166666666666667) & - + in(i-2,j-2) * 
(-0.03125) & - + in(i+1,j-2) * (-0.010416666666666666) & - + in(i+2,j-2) * (-0.010416666666666666) & - + in(i+3,j-2) * (-0.010416666666666666) & - + in(i+4,j-2) * (-0.010416666666666666) & - + in(i-1,j-1) * (-0.0625) & - + in(i+1,j-1) * (-0.0625) & - + in(i+2,j-1) * (-0.0625) & - + in(i+3,j-1) * (-0.0625) & - + in(i+4,j-1) * (-0.0625) & - + in(i-4,j+1) * (-0.002232142857142857) & - + in(i-3,j+1) * (-0.004166666666666667) & - + in(i-2,j+1) * (-0.010416666666666666) & - + in(i-1,j+1) * (-0.0625) & - + in(i+1,j+1) * (0.0625) & - + in(i+2,j+1) * (0.010416666666666666) & - + in(i+3,j+1) * (0.004166666666666667) & - + in(i+4,j+1) * (0.002232142857142857) & - + in(i-4,j+2) * (-0.002232142857142857) & - + in(i-3,j+2) * (-0.004166666666666667) & - + in(i-2,j+2) * (-0.010416666666666666) & - + in(i-1,j+2) * (-0.0625) & - + in(i+1,j+2) * (0.010416666666666666) & - + in(i+2,j+2) * (0.03125) & - + in(i+3,j+2) * (0.004166666666666667) & - + in(i+4,j+2) * (0.002232142857142857) & - + in(i-4,j+3) * (-0.002232142857142857) & - + in(i-3,j+3) * (-0.004166666666666667) & - + in(i-2,j+3) * (-0.010416666666666666) & - + in(i-1,j+3) * (-0.0625) & - + in(i+1,j+3) * (0.004166666666666667) & - + in(i+2,j+3) * (0.004166666666666667) & - + in(i+3,j+3) * (0.020833333333333332) & - + in(i+4,j+3) * (0.002232142857142857) & - + in(i-4,j+4) * (-0.002232142857142857) & - + in(i-3,j+4) * (-0.004166666666666667) & - + in(i-2,j+4) * (-0.010416666666666666) & - + in(i-1,j+4) * (-0.0625) & - + in(i+1,j+4) * (0.002232142857142857) & - + in(i+2,j+4) * (0.002232142857142857) & - + in(i+3,j+4) * (0.002232142857142857) & - + in(i+4,j+4) * (0.015625) & + + in(i-4,j-4) * (-0.015625d0) & + + in(i+1,j-4) * (-0.002232142857142857d0) & + + in(i+2,j-4) * (-0.002232142857142857d0) & + + in(i+3,j-4) * (-0.002232142857142857d0) & + + in(i+4,j-4) * (-0.002232142857142857d0) & + + in(i-3,j-3) * (-0.020833333333333332d0) & + + in(i+1,j-3) * (-0.004166666666666667d0) & + + in(i+2,j-3) * (-0.004166666666666667d0) & + + 
in(i+3,j-3) * (-0.004166666666666667d0) & + + in(i+4,j-3) * (-0.004166666666666667d0) & + + in(i-2,j-2) * (-0.03125d0) & + + in(i+1,j-2) * (-0.010416666666666666d0) & + + in(i+2,j-2) * (-0.010416666666666666d0) & + + in(i+3,j-2) * (-0.010416666666666666d0) & + + in(i+4,j-2) * (-0.010416666666666666d0) & + + in(i-1,j-1) * (-0.0625d0) & + + in(i+1,j-1) * (-0.0625d0) & + + in(i+2,j-1) * (-0.0625d0) & + + in(i+3,j-1) * (-0.0625d0) & + + in(i+4,j-1) * (-0.0625d0) & + + in(i-4,j+1) * (-0.002232142857142857d0) & + + in(i-3,j+1) * (-0.004166666666666667d0) & + + in(i-2,j+1) * (-0.010416666666666666d0) & + + in(i-1,j+1) * (-0.0625d0) & + + in(i+1,j+1) * (0.0625d0) & + + in(i+2,j+1) * (0.010416666666666666d0) & + + in(i+3,j+1) * (0.004166666666666667d0) & + + in(i+4,j+1) * (0.002232142857142857d0) & + + in(i-4,j+2) * (-0.002232142857142857d0) & + + in(i-3,j+2) * (-0.004166666666666667d0) & + + in(i-2,j+2) * (-0.010416666666666666d0) & + + in(i-1,j+2) * (-0.0625d0) & + + in(i+1,j+2) * (0.010416666666666666d0) & + + in(i+2,j+2) * (0.03125d0) & + + in(i+3,j+2) * (0.004166666666666667d0) & + + in(i+4,j+2) * (0.002232142857142857d0) & + + in(i-4,j+3) * (-0.002232142857142857d0) & + + in(i-3,j+3) * (-0.004166666666666667d0) & + + in(i-2,j+3) * (-0.010416666666666666d0) & + + in(i-1,j+3) * (-0.0625d0) & + + in(i+1,j+3) * (0.004166666666666667d0) & + + in(i+2,j+3) * (0.004166666666666667d0) & + + in(i+3,j+3) * (0.020833333333333332d0) & + + in(i+4,j+3) * (0.002232142857142857d0) & + + in(i-4,j+4) * (-0.002232142857142857d0) & + + in(i-3,j+4) * (-0.004166666666666667d0) & + + in(i-2,j+4) * (-0.010416666666666666d0) & + + in(i-1,j+4) * (-0.0625d0) & + + in(i+1,j+4) * (0.002232142857142857d0) & + + in(i+2,j+4) * (0.002232142857142857d0) & + + in(i+3,j+4) * (0.002232142857142857d0) & + + in(i+4,j+4) * (0.015625d0) & +0.0 end do end do @@ -483,86 +483,86 @@ subroutine grid5(n, in, out) do i=5,n-5-1 do j=5,n-5-1 out(i,j) = out(i,j) & - + in(i-5,j-5) * (-0.01) & - + in(i+1,j-5) * 
(-0.0011111111111111111) & - + in(i+2,j-5) * (-0.0011111111111111111) & - + in(i+3,j-5) * (-0.0011111111111111111) & - + in(i+4,j-5) * (-0.0011111111111111111) & - + in(i+5,j-5) * (-0.0011111111111111111) & - + in(i-4,j-4) * (-0.0125) & - + in(i+1,j-4) * (-0.0017857142857142857) & - + in(i+2,j-4) * (-0.0017857142857142857) & - + in(i+3,j-4) * (-0.0017857142857142857) & - + in(i+4,j-4) * (-0.0017857142857142857) & - + in(i+5,j-4) * (-0.0017857142857142857) & - + in(i-3,j-3) * (-0.016666666666666666) & - + in(i+1,j-3) * (-0.0033333333333333335) & - + in(i+2,j-3) * (-0.0033333333333333335) & - + in(i+3,j-3) * (-0.0033333333333333335) & - + in(i+4,j-3) * (-0.0033333333333333335) & - + in(i+5,j-3) * (-0.0033333333333333335) & - + in(i-2,j-2) * (-0.025) & - + in(i+1,j-2) * (-0.008333333333333333) & - + in(i+2,j-2) * (-0.008333333333333333) & - + in(i+3,j-2) * (-0.008333333333333333) & - + in(i+4,j-2) * (-0.008333333333333333) & - + in(i+5,j-2) * (-0.008333333333333333) & - + in(i-1,j-1) * (-0.05) & - + in(i+1,j-1) * (-0.05) & - + in(i+2,j-1) * (-0.05) & - + in(i+3,j-1) * (-0.05) & - + in(i+4,j-1) * (-0.05) & - + in(i+5,j-1) * (-0.05) & - + in(i-5,j+1) * (-0.0011111111111111111) & - + in(i-4,j+1) * (-0.0017857142857142857) & - + in(i-3,j+1) * (-0.0033333333333333335) & - + in(i-2,j+1) * (-0.008333333333333333) & - + in(i-1,j+1) * (-0.05) & - + in(i+1,j+1) * (0.05) & - + in(i+2,j+1) * (0.008333333333333333) & - + in(i+3,j+1) * (0.0033333333333333335) & - + in(i+4,j+1) * (0.0017857142857142857) & - + in(i+5,j+1) * (0.0011111111111111111) & - + in(i-5,j+2) * (-0.0011111111111111111) & - + in(i-4,j+2) * (-0.0017857142857142857) & - + in(i-3,j+2) * (-0.0033333333333333335) & - + in(i-2,j+2) * (-0.008333333333333333) & - + in(i-1,j+2) * (-0.05) & - + in(i+1,j+2) * (0.008333333333333333) & - + in(i+2,j+2) * (0.025) & - + in(i+3,j+2) * (0.0033333333333333335) & - + in(i+4,j+2) * (0.0017857142857142857) & - + in(i+5,j+2) * (0.0011111111111111111) & - + in(i-5,j+3) * 
(-0.0011111111111111111) & - + in(i-4,j+3) * (-0.0017857142857142857) & - + in(i-3,j+3) * (-0.0033333333333333335) & - + in(i-2,j+3) * (-0.008333333333333333) & - + in(i-1,j+3) * (-0.05) & - + in(i+1,j+3) * (0.0033333333333333335) & - + in(i+2,j+3) * (0.0033333333333333335) & - + in(i+3,j+3) * (0.016666666666666666) & - + in(i+4,j+3) * (0.0017857142857142857) & - + in(i+5,j+3) * (0.0011111111111111111) & - + in(i-5,j+4) * (-0.0011111111111111111) & - + in(i-4,j+4) * (-0.0017857142857142857) & - + in(i-3,j+4) * (-0.0033333333333333335) & - + in(i-2,j+4) * (-0.008333333333333333) & - + in(i-1,j+4) * (-0.05) & - + in(i+1,j+4) * (0.0017857142857142857) & - + in(i+2,j+4) * (0.0017857142857142857) & - + in(i+3,j+4) * (0.0017857142857142857) & - + in(i+4,j+4) * (0.0125) & - + in(i+5,j+4) * (0.0011111111111111111) & - + in(i-5,j+5) * (-0.0011111111111111111) & - + in(i-4,j+5) * (-0.0017857142857142857) & - + in(i-3,j+5) * (-0.0033333333333333335) & - + in(i-2,j+5) * (-0.008333333333333333) & - + in(i-1,j+5) * (-0.05) & - + in(i+1,j+5) * (0.0011111111111111111) & - + in(i+2,j+5) * (0.0011111111111111111) & - + in(i+3,j+5) * (0.0011111111111111111) & - + in(i+4,j+5) * (0.0011111111111111111) & - + in(i+5,j+5) * (0.01) & + + in(i-5,j-5) * (-0.01d0) & + + in(i+1,j-5) * (-0.0011111111111111111d0) & + + in(i+2,j-5) * (-0.0011111111111111111d0) & + + in(i+3,j-5) * (-0.0011111111111111111d0) & + + in(i+4,j-5) * (-0.0011111111111111111d0) & + + in(i+5,j-5) * (-0.0011111111111111111d0) & + + in(i-4,j-4) * (-0.0125d0) & + + in(i+1,j-4) * (-0.0017857142857142857d0) & + + in(i+2,j-4) * (-0.0017857142857142857d0) & + + in(i+3,j-4) * (-0.0017857142857142857d0) & + + in(i+4,j-4) * (-0.0017857142857142857d0) & + + in(i+5,j-4) * (-0.0017857142857142857d0) & + + in(i-3,j-3) * (-0.016666666666666666d0) & + + in(i+1,j-3) * (-0.0033333333333333335d0) & + + in(i+2,j-3) * (-0.0033333333333333335d0) & + + in(i+3,j-3) * (-0.0033333333333333335d0) & + + in(i+4,j-3) * (-0.0033333333333333335d0) & + + 
in(i+5,j-3) * (-0.0033333333333333335d0) & + + in(i-2,j-2) * (-0.025d0) & + + in(i+1,j-2) * (-0.008333333333333333d0) & + + in(i+2,j-2) * (-0.008333333333333333d0) & + + in(i+3,j-2) * (-0.008333333333333333d0) & + + in(i+4,j-2) * (-0.008333333333333333d0) & + + in(i+5,j-2) * (-0.008333333333333333d0) & + + in(i-1,j-1) * (-0.05d0) & + + in(i+1,j-1) * (-0.05d0) & + + in(i+2,j-1) * (-0.05d0) & + + in(i+3,j-1) * (-0.05d0) & + + in(i+4,j-1) * (-0.05d0) & + + in(i+5,j-1) * (-0.05d0) & + + in(i-5,j+1) * (-0.0011111111111111111d0) & + + in(i-4,j+1) * (-0.0017857142857142857d0) & + + in(i-3,j+1) * (-0.0033333333333333335d0) & + + in(i-2,j+1) * (-0.008333333333333333d0) & + + in(i-1,j+1) * (-0.05d0) & + + in(i+1,j+1) * (0.05d0) & + + in(i+2,j+1) * (0.008333333333333333d0) & + + in(i+3,j+1) * (0.0033333333333333335d0) & + + in(i+4,j+1) * (0.0017857142857142857d0) & + + in(i+5,j+1) * (0.0011111111111111111d0) & + + in(i-5,j+2) * (-0.0011111111111111111d0) & + + in(i-4,j+2) * (-0.0017857142857142857d0) & + + in(i-3,j+2) * (-0.0033333333333333335d0) & + + in(i-2,j+2) * (-0.008333333333333333d0) & + + in(i-1,j+2) * (-0.05d0) & + + in(i+1,j+2) * (0.008333333333333333d0) & + + in(i+2,j+2) * (0.025d0) & + + in(i+3,j+2) * (0.0033333333333333335d0) & + + in(i+4,j+2) * (0.0017857142857142857d0) & + + in(i+5,j+2) * (0.0011111111111111111d0) & + + in(i-5,j+3) * (-0.0011111111111111111d0) & + + in(i-4,j+3) * (-0.0017857142857142857d0) & + + in(i-3,j+3) * (-0.0033333333333333335d0) & + + in(i-2,j+3) * (-0.008333333333333333d0) & + + in(i-1,j+3) * (-0.05d0) & + + in(i+1,j+3) * (0.0033333333333333335d0) & + + in(i+2,j+3) * (0.0033333333333333335d0) & + + in(i+3,j+3) * (0.016666666666666666d0) & + + in(i+4,j+3) * (0.0017857142857142857d0) & + + in(i+5,j+3) * (0.0011111111111111111d0) & + + in(i-5,j+4) * (-0.0011111111111111111d0) & + + in(i-4,j+4) * (-0.0017857142857142857d0) & + + in(i-3,j+4) * (-0.0033333333333333335d0) & + + in(i-2,j+4) * (-0.008333333333333333d0) & + + in(i-1,j+4) * 
(-0.05d0) & + + in(i+1,j+4) * (0.0017857142857142857d0) & + + in(i+2,j+4) * (0.0017857142857142857d0) & + + in(i+3,j+4) * (0.0017857142857142857d0) & + + in(i+4,j+4) * (0.0125d0) & + + in(i+5,j+4) * (0.0011111111111111111d0) & + + in(i-5,j+5) * (-0.0011111111111111111d0) & + + in(i-4,j+5) * (-0.0017857142857142857d0) & + + in(i-3,j+5) * (-0.0033333333333333335d0) & + + in(i-2,j+5) * (-0.008333333333333333d0) & + + in(i-1,j+5) * (-0.05d0) & + + in(i+1,j+5) * (0.0011111111111111111d0) & + + in(i+2,j+5) * (0.0011111111111111111d0) & + + in(i+3,j+5) * (0.0011111111111111111d0) & + + in(i+4,j+5) * (0.0011111111111111111d0) & + + in(i+5,j+5) * (0.01d0) & +0.0 end do end do @@ -578,120 +578,120 @@ subroutine grid6(n, in, out) do i=6,n-6-1 do j=6,n-6-1 out(i,j) = out(i,j) & - + in(i-6,j-6) * (-0.006944444444444444) & - + in(i+1,j-6) * (-0.0006313131313131314) & - + in(i+2,j-6) * (-0.0006313131313131314) & - + in(i+3,j-6) * (-0.0006313131313131314) & - + in(i+4,j-6) * (-0.0006313131313131314) & - + in(i+5,j-6) * (-0.0006313131313131314) & - + in(i+6,j-6) * (-0.0006313131313131314) & - + in(i-5,j-5) * (-0.008333333333333333) & - + in(i+1,j-5) * (-0.000925925925925926) & - + in(i+2,j-5) * (-0.000925925925925926) & - + in(i+3,j-5) * (-0.000925925925925926) & - + in(i+4,j-5) * (-0.000925925925925926) & - + in(i+5,j-5) * (-0.000925925925925926) & - + in(i+6,j-5) * (-0.000925925925925926) & - + in(i-4,j-4) * (-0.010416666666666666) & - + in(i+1,j-4) * (-0.001488095238095238) & - + in(i+2,j-4) * (-0.001488095238095238) & - + in(i+3,j-4) * (-0.001488095238095238) & - + in(i+4,j-4) * (-0.001488095238095238) & - + in(i+5,j-4) * (-0.001488095238095238) & - + in(i+6,j-4) * (-0.001488095238095238) & - + in(i-3,j-3) * (-0.013888888888888888) & - + in(i+1,j-3) * (-0.002777777777777778) & - + in(i+2,j-3) * (-0.002777777777777778) & - + in(i+3,j-3) * (-0.002777777777777778) & - + in(i+4,j-3) * (-0.002777777777777778) & - + in(i+5,j-3) * (-0.002777777777777778) & - + in(i+6,j-3) * 
(-0.002777777777777778) & - + in(i-2,j-2) * (-0.020833333333333332) & - + in(i+1,j-2) * (-0.006944444444444444) & - + in(i+2,j-2) * (-0.006944444444444444) & - + in(i+3,j-2) * (-0.006944444444444444) & - + in(i+4,j-2) * (-0.006944444444444444) & - + in(i+5,j-2) * (-0.006944444444444444) & - + in(i+6,j-2) * (-0.006944444444444444) & - + in(i-1,j-1) * (-0.041666666666666664) & - + in(i+1,j-1) * (-0.041666666666666664) & - + in(i+2,j-1) * (-0.041666666666666664) & - + in(i+3,j-1) * (-0.041666666666666664) & - + in(i+4,j-1) * (-0.041666666666666664) & - + in(i+5,j-1) * (-0.041666666666666664) & - + in(i+6,j-1) * (-0.041666666666666664) & - + in(i-6,j+1) * (-0.0006313131313131314) & - + in(i-5,j+1) * (-0.000925925925925926) & - + in(i-4,j+1) * (-0.001488095238095238) & - + in(i-3,j+1) * (-0.002777777777777778) & - + in(i-2,j+1) * (-0.006944444444444444) & - + in(i-1,j+1) * (-0.041666666666666664) & - + in(i+1,j+1) * (0.041666666666666664) & - + in(i+2,j+1) * (0.006944444444444444) & - + in(i+3,j+1) * (0.002777777777777778) & - + in(i+4,j+1) * (0.001488095238095238) & - + in(i+5,j+1) * (0.000925925925925926) & - + in(i+6,j+1) * (0.0006313131313131314) & - + in(i-6,j+2) * (-0.0006313131313131314) & - + in(i-5,j+2) * (-0.000925925925925926) & - + in(i-4,j+2) * (-0.001488095238095238) & - + in(i-3,j+2) * (-0.002777777777777778) & - + in(i-2,j+2) * (-0.006944444444444444) & - + in(i-1,j+2) * (-0.041666666666666664) & - + in(i+1,j+2) * (0.006944444444444444) & - + in(i+2,j+2) * (0.020833333333333332) & - + in(i+3,j+2) * (0.002777777777777778) & - + in(i+4,j+2) * (0.001488095238095238) & - + in(i+5,j+2) * (0.000925925925925926) & - + in(i+6,j+2) * (0.0006313131313131314) & - + in(i-6,j+3) * (-0.0006313131313131314) & - + in(i-5,j+3) * (-0.000925925925925926) & - + in(i-4,j+3) * (-0.001488095238095238) & - + in(i-3,j+3) * (-0.002777777777777778) & - + in(i-2,j+3) * (-0.006944444444444444) & - + in(i-1,j+3) * (-0.041666666666666664) & - + in(i+1,j+3) * (0.002777777777777778) & - 
+ in(i+2,j+3) * (0.002777777777777778) & - + in(i+3,j+3) * (0.013888888888888888) & - + in(i+4,j+3) * (0.001488095238095238) & - + in(i+5,j+3) * (0.000925925925925926) & - + in(i+6,j+3) * (0.0006313131313131314) & - + in(i-6,j+4) * (-0.0006313131313131314) & - + in(i-5,j+4) * (-0.000925925925925926) & - + in(i-4,j+4) * (-0.001488095238095238) & - + in(i-3,j+4) * (-0.002777777777777778) & - + in(i-2,j+4) * (-0.006944444444444444) & - + in(i-1,j+4) * (-0.041666666666666664) & - + in(i+1,j+4) * (0.001488095238095238) & - + in(i+2,j+4) * (0.001488095238095238) & - + in(i+3,j+4) * (0.001488095238095238) & - + in(i+4,j+4) * (0.010416666666666666) & - + in(i+5,j+4) * (0.000925925925925926) & - + in(i+6,j+4) * (0.0006313131313131314) & - + in(i-6,j+5) * (-0.0006313131313131314) & - + in(i-5,j+5) * (-0.000925925925925926) & - + in(i-4,j+5) * (-0.001488095238095238) & - + in(i-3,j+5) * (-0.002777777777777778) & - + in(i-2,j+5) * (-0.006944444444444444) & - + in(i-1,j+5) * (-0.041666666666666664) & - + in(i+1,j+5) * (0.000925925925925926) & - + in(i+2,j+5) * (0.000925925925925926) & - + in(i+3,j+5) * (0.000925925925925926) & - + in(i+4,j+5) * (0.000925925925925926) & - + in(i+5,j+5) * (0.008333333333333333) & - + in(i+6,j+5) * (0.0006313131313131314) & - + in(i-6,j+6) * (-0.0006313131313131314) & - + in(i-5,j+6) * (-0.000925925925925926) & - + in(i-4,j+6) * (-0.001488095238095238) & - + in(i-3,j+6) * (-0.002777777777777778) & - + in(i-2,j+6) * (-0.006944444444444444) & - + in(i-1,j+6) * (-0.041666666666666664) & - + in(i+1,j+6) * (0.0006313131313131314) & - + in(i+2,j+6) * (0.0006313131313131314) & - + in(i+3,j+6) * (0.0006313131313131314) & - + in(i+4,j+6) * (0.0006313131313131314) & - + in(i+5,j+6) * (0.0006313131313131314) & - + in(i+6,j+6) * (0.006944444444444444) & + + in(i-6,j-6) * (-0.006944444444444444d0) & + + in(i+1,j-6) * (-0.0006313131313131314d0) & + + in(i+2,j-6) * (-0.0006313131313131314d0) & + + in(i+3,j-6) * (-0.0006313131313131314d0) & + + in(i+4,j-6) * 
(-0.0006313131313131314d0) & + + in(i+5,j-6) * (-0.0006313131313131314d0) & + + in(i+6,j-6) * (-0.0006313131313131314d0) & + + in(i-5,j-5) * (-0.008333333333333333d0) & + + in(i+1,j-5) * (-0.000925925925925926d0) & + + in(i+2,j-5) * (-0.000925925925925926d0) & + + in(i+3,j-5) * (-0.000925925925925926d0) & + + in(i+4,j-5) * (-0.000925925925925926d0) & + + in(i+5,j-5) * (-0.000925925925925926d0) & + + in(i+6,j-5) * (-0.000925925925925926d0) & + + in(i-4,j-4) * (-0.010416666666666666d0) & + + in(i+1,j-4) * (-0.001488095238095238d0) & + + in(i+2,j-4) * (-0.001488095238095238d0) & + + in(i+3,j-4) * (-0.001488095238095238d0) & + + in(i+4,j-4) * (-0.001488095238095238d0) & + + in(i+5,j-4) * (-0.001488095238095238d0) & + + in(i+6,j-4) * (-0.001488095238095238d0) & + + in(i-3,j-3) * (-0.013888888888888888d0) & + + in(i+1,j-3) * (-0.002777777777777778d0) & + + in(i+2,j-3) * (-0.002777777777777778d0) & + + in(i+3,j-3) * (-0.002777777777777778d0) & + + in(i+4,j-3) * (-0.002777777777777778d0) & + + in(i+5,j-3) * (-0.002777777777777778d0) & + + in(i+6,j-3) * (-0.002777777777777778d0) & + + in(i-2,j-2) * (-0.020833333333333332d0) & + + in(i+1,j-2) * (-0.006944444444444444d0) & + + in(i+2,j-2) * (-0.006944444444444444d0) & + + in(i+3,j-2) * (-0.006944444444444444d0) & + + in(i+4,j-2) * (-0.006944444444444444d0) & + + in(i+5,j-2) * (-0.006944444444444444d0) & + + in(i+6,j-2) * (-0.006944444444444444d0) & + + in(i-1,j-1) * (-0.041666666666666664d0) & + + in(i+1,j-1) * (-0.041666666666666664d0) & + + in(i+2,j-1) * (-0.041666666666666664d0) & + + in(i+3,j-1) * (-0.041666666666666664d0) & + + in(i+4,j-1) * (-0.041666666666666664d0) & + + in(i+5,j-1) * (-0.041666666666666664d0) & + + in(i+6,j-1) * (-0.041666666666666664d0) & + + in(i-6,j+1) * (-0.0006313131313131314d0) & + + in(i-5,j+1) * (-0.000925925925925926d0) & + + in(i-4,j+1) * (-0.001488095238095238d0) & + + in(i-3,j+1) * (-0.002777777777777778d0) & + + in(i-2,j+1) * (-0.006944444444444444d0) & + + in(i-1,j+1) * 
(-0.041666666666666664d0) & + + in(i+1,j+1) * (0.041666666666666664d0) & + + in(i+2,j+1) * (0.006944444444444444d0) & + + in(i+3,j+1) * (0.002777777777777778d0) & + + in(i+4,j+1) * (0.001488095238095238d0) & + + in(i+5,j+1) * (0.000925925925925926d0) & + + in(i+6,j+1) * (0.0006313131313131314d0) & + + in(i-6,j+2) * (-0.0006313131313131314d0) & + + in(i-5,j+2) * (-0.000925925925925926d0) & + + in(i-4,j+2) * (-0.001488095238095238d0) & + + in(i-3,j+2) * (-0.002777777777777778d0) & + + in(i-2,j+2) * (-0.006944444444444444d0) & + + in(i-1,j+2) * (-0.041666666666666664d0) & + + in(i+1,j+2) * (0.006944444444444444d0) & + + in(i+2,j+2) * (0.020833333333333332d0) & + + in(i+3,j+2) * (0.002777777777777778d0) & + + in(i+4,j+2) * (0.001488095238095238d0) & + + in(i+5,j+2) * (0.000925925925925926d0) & + + in(i+6,j+2) * (0.0006313131313131314d0) & + + in(i-6,j+3) * (-0.0006313131313131314d0) & + + in(i-5,j+3) * (-0.000925925925925926d0) & + + in(i-4,j+3) * (-0.001488095238095238d0) & + + in(i-3,j+3) * (-0.002777777777777778d0) & + + in(i-2,j+3) * (-0.006944444444444444d0) & + + in(i-1,j+3) * (-0.041666666666666664d0) & + + in(i+1,j+3) * (0.002777777777777778d0) & + + in(i+2,j+3) * (0.002777777777777778d0) & + + in(i+3,j+3) * (0.013888888888888888d0) & + + in(i+4,j+3) * (0.001488095238095238d0) & + + in(i+5,j+3) * (0.000925925925925926d0) & + + in(i+6,j+3) * (0.0006313131313131314d0) & + + in(i-6,j+4) * (-0.0006313131313131314d0) & + + in(i-5,j+4) * (-0.000925925925925926d0) & + + in(i-4,j+4) * (-0.001488095238095238d0) & + + in(i-3,j+4) * (-0.002777777777777778d0) & + + in(i-2,j+4) * (-0.006944444444444444d0) & + + in(i-1,j+4) * (-0.041666666666666664d0) & + + in(i+1,j+4) * (0.001488095238095238d0) & + + in(i+2,j+4) * (0.001488095238095238d0) & + + in(i+3,j+4) * (0.001488095238095238d0) & + + in(i+4,j+4) * (0.010416666666666666d0) & + + in(i+5,j+4) * (0.000925925925925926d0) & + + in(i+6,j+4) * (0.0006313131313131314d0) & + + in(i-6,j+5) * (-0.0006313131313131314d0) & + + 
in(i-5,j+5) * (-0.000925925925925926d0) & + + in(i-4,j+5) * (-0.001488095238095238d0) & + + in(i-3,j+5) * (-0.002777777777777778d0) & + + in(i-2,j+5) * (-0.006944444444444444d0) & + + in(i-1,j+5) * (-0.041666666666666664d0) & + + in(i+1,j+5) * (0.000925925925925926d0) & + + in(i+2,j+5) * (0.000925925925925926d0) & + + in(i+3,j+5) * (0.000925925925925926d0) & + + in(i+4,j+5) * (0.000925925925925926d0) & + + in(i+5,j+5) * (0.008333333333333333d0) & + + in(i+6,j+5) * (0.0006313131313131314d0) & + + in(i-6,j+6) * (-0.0006313131313131314d0) & + + in(i-5,j+6) * (-0.000925925925925926d0) & + + in(i-4,j+6) * (-0.001488095238095238d0) & + + in(i-3,j+6) * (-0.002777777777777778d0) & + + in(i-2,j+6) * (-0.006944444444444444d0) & + + in(i-1,j+6) * (-0.041666666666666664d0) & + + in(i+1,j+6) * (0.0006313131313131314d0) & + + in(i+2,j+6) * (0.0006313131313131314d0) & + + in(i+3,j+6) * (0.0006313131313131314d0) & + + in(i+4,j+6) * (0.0006313131313131314d0) & + + in(i+5,j+6) * (0.0006313131313131314d0) & + + in(i+6,j+6) * (0.006944444444444444d0) & +0.0 end do end do @@ -707,160 +707,160 @@ subroutine grid7(n, in, out) do i=7,n-7-1 do j=7,n-7-1 out(i,j) = out(i,j) & - + in(i-7,j-7) * (-0.00510204081632653) & - + in(i+1,j-7) * (-0.0003924646781789639) & - + in(i+2,j-7) * (-0.0003924646781789639) & - + in(i+3,j-7) * (-0.0003924646781789639) & - + in(i+4,j-7) * (-0.0003924646781789639) & - + in(i+5,j-7) * (-0.0003924646781789639) & - + in(i+6,j-7) * (-0.0003924646781789639) & - + in(i+7,j-7) * (-0.0003924646781789639) & - + in(i-6,j-6) * (-0.005952380952380952) & - + in(i+1,j-6) * (-0.0005411255411255411) & - + in(i+2,j-6) * (-0.0005411255411255411) & - + in(i+3,j-6) * (-0.0005411255411255411) & - + in(i+4,j-6) * (-0.0005411255411255411) & - + in(i+5,j-6) * (-0.0005411255411255411) & - + in(i+6,j-6) * (-0.0005411255411255411) & - + in(i+7,j-6) * (-0.0005411255411255411) & - + in(i-5,j-5) * (-0.007142857142857143) & - + in(i+1,j-5) * (-0.0007936507936507937) & - + in(i+2,j-5) * 
(-0.0007936507936507937) & - + in(i+3,j-5) * (-0.0007936507936507937) & - + in(i+4,j-5) * (-0.0007936507936507937) & - + in(i+5,j-5) * (-0.0007936507936507937) & - + in(i+6,j-5) * (-0.0007936507936507937) & - + in(i+7,j-5) * (-0.0007936507936507937) & - + in(i-4,j-4) * (-0.008928571428571428) & - + in(i+1,j-4) * (-0.0012755102040816326) & - + in(i+2,j-4) * (-0.0012755102040816326) & - + in(i+3,j-4) * (-0.0012755102040816326) & - + in(i+4,j-4) * (-0.0012755102040816326) & - + in(i+5,j-4) * (-0.0012755102040816326) & - + in(i+6,j-4) * (-0.0012755102040816326) & - + in(i+7,j-4) * (-0.0012755102040816326) & - + in(i-3,j-3) * (-0.011904761904761904) & - + in(i+1,j-3) * (-0.002380952380952381) & - + in(i+2,j-3) * (-0.002380952380952381) & - + in(i+3,j-3) * (-0.002380952380952381) & - + in(i+4,j-3) * (-0.002380952380952381) & - + in(i+5,j-3) * (-0.002380952380952381) & - + in(i+6,j-3) * (-0.002380952380952381) & - + in(i+7,j-3) * (-0.002380952380952381) & - + in(i-2,j-2) * (-0.017857142857142856) & - + in(i+1,j-2) * (-0.005952380952380952) & - + in(i+2,j-2) * (-0.005952380952380952) & - + in(i+3,j-2) * (-0.005952380952380952) & - + in(i+4,j-2) * (-0.005952380952380952) & - + in(i+5,j-2) * (-0.005952380952380952) & - + in(i+6,j-2) * (-0.005952380952380952) & - + in(i+7,j-2) * (-0.005952380952380952) & - + in(i-1,j-1) * (-0.03571428571428571) & - + in(i+1,j-1) * (-0.03571428571428571) & - + in(i+2,j-1) * (-0.03571428571428571) & - + in(i+3,j-1) * (-0.03571428571428571) & - + in(i+4,j-1) * (-0.03571428571428571) & - + in(i+5,j-1) * (-0.03571428571428571) & - + in(i+6,j-1) * (-0.03571428571428571) & - + in(i+7,j-1) * (-0.03571428571428571) & - + in(i-7,j+1) * (-0.0003924646781789639) & - + in(i-6,j+1) * (-0.0005411255411255411) & - + in(i-5,j+1) * (-0.0007936507936507937) & - + in(i-4,j+1) * (-0.0012755102040816326) & - + in(i-3,j+1) * (-0.002380952380952381) & - + in(i-2,j+1) * (-0.005952380952380952) & - + in(i-1,j+1) * (-0.03571428571428571) & - + in(i+1,j+1) * 
(0.03571428571428571) & - + in(i+2,j+1) * (0.005952380952380952) & - + in(i+3,j+1) * (0.002380952380952381) & - + in(i+4,j+1) * (0.0012755102040816326) & - + in(i+5,j+1) * (0.0007936507936507937) & - + in(i+6,j+1) * (0.0005411255411255411) & - + in(i+7,j+1) * (0.0003924646781789639) & - + in(i-7,j+2) * (-0.0003924646781789639) & - + in(i-6,j+2) * (-0.0005411255411255411) & - + in(i-5,j+2) * (-0.0007936507936507937) & - + in(i-4,j+2) * (-0.0012755102040816326) & - + in(i-3,j+2) * (-0.002380952380952381) & - + in(i-2,j+2) * (-0.005952380952380952) & - + in(i-1,j+2) * (-0.03571428571428571) & - + in(i+1,j+2) * (0.005952380952380952) & - + in(i+2,j+2) * (0.017857142857142856) & - + in(i+3,j+2) * (0.002380952380952381) & - + in(i+4,j+2) * (0.0012755102040816326) & - + in(i+5,j+2) * (0.0007936507936507937) & - + in(i+6,j+2) * (0.0005411255411255411) & - + in(i+7,j+2) * (0.0003924646781789639) & - + in(i-7,j+3) * (-0.0003924646781789639) & - + in(i-6,j+3) * (-0.0005411255411255411) & - + in(i-5,j+3) * (-0.0007936507936507937) & - + in(i-4,j+3) * (-0.0012755102040816326) & - + in(i-3,j+3) * (-0.002380952380952381) & - + in(i-2,j+3) * (-0.005952380952380952) & - + in(i-1,j+3) * (-0.03571428571428571) & - + in(i+1,j+3) * (0.002380952380952381) & - + in(i+2,j+3) * (0.002380952380952381) & - + in(i+3,j+3) * (0.011904761904761904) & - + in(i+4,j+3) * (0.0012755102040816326) & - + in(i+5,j+3) * (0.0007936507936507937) & - + in(i+6,j+3) * (0.0005411255411255411) & - + in(i+7,j+3) * (0.0003924646781789639) & - + in(i-7,j+4) * (-0.0003924646781789639) & - + in(i-6,j+4) * (-0.0005411255411255411) & - + in(i-5,j+4) * (-0.0007936507936507937) & - + in(i-4,j+4) * (-0.0012755102040816326) & - + in(i-3,j+4) * (-0.002380952380952381) & - + in(i-2,j+4) * (-0.005952380952380952) & - + in(i-1,j+4) * (-0.03571428571428571) & - + in(i+1,j+4) * (0.0012755102040816326) & - + in(i+2,j+4) * (0.0012755102040816326) & - + in(i+3,j+4) * (0.0012755102040816326) & - + in(i+4,j+4) * 
(0.008928571428571428) & - + in(i+5,j+4) * (0.0007936507936507937) & - + in(i+6,j+4) * (0.0005411255411255411) & - + in(i+7,j+4) * (0.0003924646781789639) & - + in(i-7,j+5) * (-0.0003924646781789639) & - + in(i-6,j+5) * (-0.0005411255411255411) & - + in(i-5,j+5) * (-0.0007936507936507937) & - + in(i-4,j+5) * (-0.0012755102040816326) & - + in(i-3,j+5) * (-0.002380952380952381) & - + in(i-2,j+5) * (-0.005952380952380952) & - + in(i-1,j+5) * (-0.03571428571428571) & - + in(i+1,j+5) * (0.0007936507936507937) & - + in(i+2,j+5) * (0.0007936507936507937) & - + in(i+3,j+5) * (0.0007936507936507937) & - + in(i+4,j+5) * (0.0007936507936507937) & - + in(i+5,j+5) * (0.007142857142857143) & - + in(i+6,j+5) * (0.0005411255411255411) & - + in(i+7,j+5) * (0.0003924646781789639) & - + in(i-7,j+6) * (-0.0003924646781789639) & - + in(i-6,j+6) * (-0.0005411255411255411) & - + in(i-5,j+6) * (-0.0007936507936507937) & - + in(i-4,j+6) * (-0.0012755102040816326) & - + in(i-3,j+6) * (-0.002380952380952381) & - + in(i-2,j+6) * (-0.005952380952380952) & - + in(i-1,j+6) * (-0.03571428571428571) & - + in(i+1,j+6) * (0.0005411255411255411) & - + in(i+2,j+6) * (0.0005411255411255411) & - + in(i+3,j+6) * (0.0005411255411255411) & - + in(i+4,j+6) * (0.0005411255411255411) & - + in(i+5,j+6) * (0.0005411255411255411) & - + in(i+6,j+6) * (0.005952380952380952) & - + in(i+7,j+6) * (0.0003924646781789639) & - + in(i-7,j+7) * (-0.0003924646781789639) & - + in(i-6,j+7) * (-0.0005411255411255411) & - + in(i-5,j+7) * (-0.0007936507936507937) & - + in(i-4,j+7) * (-0.0012755102040816326) & - + in(i-3,j+7) * (-0.002380952380952381) & - + in(i-2,j+7) * (-0.005952380952380952) & - + in(i-1,j+7) * (-0.03571428571428571) & - + in(i+1,j+7) * (0.0003924646781789639) & - + in(i+2,j+7) * (0.0003924646781789639) & - + in(i+3,j+7) * (0.0003924646781789639) & - + in(i+4,j+7) * (0.0003924646781789639) & - + in(i+5,j+7) * (0.0003924646781789639) & - + in(i+6,j+7) * (0.0003924646781789639) & - + in(i+7,j+7) * 
(0.00510204081632653) & + + in(i-7,j-7) * (-0.00510204081632653d0) & + + in(i+1,j-7) * (-0.0003924646781789639d0) & + + in(i+2,j-7) * (-0.0003924646781789639d0) & + + in(i+3,j-7) * (-0.0003924646781789639d0) & + + in(i+4,j-7) * (-0.0003924646781789639d0) & + + in(i+5,j-7) * (-0.0003924646781789639d0) & + + in(i+6,j-7) * (-0.0003924646781789639d0) & + + in(i+7,j-7) * (-0.0003924646781789639d0) & + + in(i-6,j-6) * (-0.005952380952380952d0) & + + in(i+1,j-6) * (-0.0005411255411255411d0) & + + in(i+2,j-6) * (-0.0005411255411255411d0) & + + in(i+3,j-6) * (-0.0005411255411255411d0) & + + in(i+4,j-6) * (-0.0005411255411255411d0) & + + in(i+5,j-6) * (-0.0005411255411255411d0) & + + in(i+6,j-6) * (-0.0005411255411255411d0) & + + in(i+7,j-6) * (-0.0005411255411255411d0) & + + in(i-5,j-5) * (-0.007142857142857143d0) & + + in(i+1,j-5) * (-0.0007936507936507937d0) & + + in(i+2,j-5) * (-0.0007936507936507937d0) & + + in(i+3,j-5) * (-0.0007936507936507937d0) & + + in(i+4,j-5) * (-0.0007936507936507937d0) & + + in(i+5,j-5) * (-0.0007936507936507937d0) & + + in(i+6,j-5) * (-0.0007936507936507937d0) & + + in(i+7,j-5) * (-0.0007936507936507937d0) & + + in(i-4,j-4) * (-0.008928571428571428d0) & + + in(i+1,j-4) * (-0.0012755102040816326d0) & + + in(i+2,j-4) * (-0.0012755102040816326d0) & + + in(i+3,j-4) * (-0.0012755102040816326d0) & + + in(i+4,j-4) * (-0.0012755102040816326d0) & + + in(i+5,j-4) * (-0.0012755102040816326d0) & + + in(i+6,j-4) * (-0.0012755102040816326d0) & + + in(i+7,j-4) * (-0.0012755102040816326d0) & + + in(i-3,j-3) * (-0.011904761904761904d0) & + + in(i+1,j-3) * (-0.002380952380952381d0) & + + in(i+2,j-3) * (-0.002380952380952381d0) & + + in(i+3,j-3) * (-0.002380952380952381d0) & + + in(i+4,j-3) * (-0.002380952380952381d0) & + + in(i+5,j-3) * (-0.002380952380952381d0) & + + in(i+6,j-3) * (-0.002380952380952381d0) & + + in(i+7,j-3) * (-0.002380952380952381d0) & + + in(i-2,j-2) * (-0.017857142857142856d0) & + + in(i+1,j-2) * (-0.005952380952380952d0) & + + in(i+2,j-2) 
* (-0.005952380952380952d0) & + + in(i+3,j-2) * (-0.005952380952380952d0) & + + in(i+4,j-2) * (-0.005952380952380952d0) & + + in(i+5,j-2) * (-0.005952380952380952d0) & + + in(i+6,j-2) * (-0.005952380952380952d0) & + + in(i+7,j-2) * (-0.005952380952380952d0) & + + in(i-1,j-1) * (-0.03571428571428571d0) & + + in(i+1,j-1) * (-0.03571428571428571d0) & + + in(i+2,j-1) * (-0.03571428571428571d0) & + + in(i+3,j-1) * (-0.03571428571428571d0) & + + in(i+4,j-1) * (-0.03571428571428571d0) & + + in(i+5,j-1) * (-0.03571428571428571d0) & + + in(i+6,j-1) * (-0.03571428571428571d0) & + + in(i+7,j-1) * (-0.03571428571428571d0) & + + in(i-7,j+1) * (-0.0003924646781789639d0) & + + in(i-6,j+1) * (-0.0005411255411255411d0) & + + in(i-5,j+1) * (-0.0007936507936507937d0) & + + in(i-4,j+1) * (-0.0012755102040816326d0) & + + in(i-3,j+1) * (-0.002380952380952381d0) & + + in(i-2,j+1) * (-0.005952380952380952d0) & + + in(i-1,j+1) * (-0.03571428571428571d0) & + + in(i+1,j+1) * (0.03571428571428571d0) & + + in(i+2,j+1) * (0.005952380952380952d0) & + + in(i+3,j+1) * (0.002380952380952381d0) & + + in(i+4,j+1) * (0.0012755102040816326d0) & + + in(i+5,j+1) * (0.0007936507936507937d0) & + + in(i+6,j+1) * (0.0005411255411255411d0) & + + in(i+7,j+1) * (0.0003924646781789639d0) & + + in(i-7,j+2) * (-0.0003924646781789639d0) & + + in(i-6,j+2) * (-0.0005411255411255411d0) & + + in(i-5,j+2) * (-0.0007936507936507937d0) & + + in(i-4,j+2) * (-0.0012755102040816326d0) & + + in(i-3,j+2) * (-0.002380952380952381d0) & + + in(i-2,j+2) * (-0.005952380952380952d0) & + + in(i-1,j+2) * (-0.03571428571428571d0) & + + in(i+1,j+2) * (0.005952380952380952d0) & + + in(i+2,j+2) * (0.017857142857142856d0) & + + in(i+3,j+2) * (0.002380952380952381d0) & + + in(i+4,j+2) * (0.0012755102040816326d0) & + + in(i+5,j+2) * (0.0007936507936507937d0) & + + in(i+6,j+2) * (0.0005411255411255411d0) & + + in(i+7,j+2) * (0.0003924646781789639d0) & + + in(i-7,j+3) * (-0.0003924646781789639d0) & + + in(i-6,j+3) * (-0.0005411255411255411d0) 
& + + in(i-5,j+3) * (-0.0007936507936507937d0) & + + in(i-4,j+3) * (-0.0012755102040816326d0) & + + in(i-3,j+3) * (-0.002380952380952381d0) & + + in(i-2,j+3) * (-0.005952380952380952d0) & + + in(i-1,j+3) * (-0.03571428571428571d0) & + + in(i+1,j+3) * (0.002380952380952381d0) & + + in(i+2,j+3) * (0.002380952380952381d0) & + + in(i+3,j+3) * (0.011904761904761904d0) & + + in(i+4,j+3) * (0.0012755102040816326d0) & + + in(i+5,j+3) * (0.0007936507936507937d0) & + + in(i+6,j+3) * (0.0005411255411255411d0) & + + in(i+7,j+3) * (0.0003924646781789639d0) & + + in(i-7,j+4) * (-0.0003924646781789639d0) & + + in(i-6,j+4) * (-0.0005411255411255411d0) & + + in(i-5,j+4) * (-0.0007936507936507937d0) & + + in(i-4,j+4) * (-0.0012755102040816326d0) & + + in(i-3,j+4) * (-0.002380952380952381d0) & + + in(i-2,j+4) * (-0.005952380952380952d0) & + + in(i-1,j+4) * (-0.03571428571428571d0) & + + in(i+1,j+4) * (0.0012755102040816326d0) & + + in(i+2,j+4) * (0.0012755102040816326d0) & + + in(i+3,j+4) * (0.0012755102040816326d0) & + + in(i+4,j+4) * (0.008928571428571428d0) & + + in(i+5,j+4) * (0.0007936507936507937d0) & + + in(i+6,j+4) * (0.0005411255411255411d0) & + + in(i+7,j+4) * (0.0003924646781789639d0) & + + in(i-7,j+5) * (-0.0003924646781789639d0) & + + in(i-6,j+5) * (-0.0005411255411255411d0) & + + in(i-5,j+5) * (-0.0007936507936507937d0) & + + in(i-4,j+5) * (-0.0012755102040816326d0) & + + in(i-3,j+5) * (-0.002380952380952381d0) & + + in(i-2,j+5) * (-0.005952380952380952d0) & + + in(i-1,j+5) * (-0.03571428571428571d0) & + + in(i+1,j+5) * (0.0007936507936507937d0) & + + in(i+2,j+5) * (0.0007936507936507937d0) & + + in(i+3,j+5) * (0.0007936507936507937d0) & + + in(i+4,j+5) * (0.0007936507936507937d0) & + + in(i+5,j+5) * (0.007142857142857143d0) & + + in(i+6,j+5) * (0.0005411255411255411d0) & + + in(i+7,j+5) * (0.0003924646781789639d0) & + + in(i-7,j+6) * (-0.0003924646781789639d0) & + + in(i-6,j+6) * (-0.0005411255411255411d0) & + + in(i-5,j+6) * (-0.0007936507936507937d0) & + + 
in(i-4,j+6) * (-0.0012755102040816326d0) & + + in(i-3,j+6) * (-0.002380952380952381d0) & + + in(i-2,j+6) * (-0.005952380952380952d0) & + + in(i-1,j+6) * (-0.03571428571428571d0) & + + in(i+1,j+6) * (0.0005411255411255411d0) & + + in(i+2,j+6) * (0.0005411255411255411d0) & + + in(i+3,j+6) * (0.0005411255411255411d0) & + + in(i+4,j+6) * (0.0005411255411255411d0) & + + in(i+5,j+6) * (0.0005411255411255411d0) & + + in(i+6,j+6) * (0.005952380952380952d0) & + + in(i+7,j+6) * (0.0003924646781789639d0) & + + in(i-7,j+7) * (-0.0003924646781789639d0) & + + in(i-6,j+7) * (-0.0005411255411255411d0) & + + in(i-5,j+7) * (-0.0007936507936507937d0) & + + in(i-4,j+7) * (-0.0012755102040816326d0) & + + in(i-3,j+7) * (-0.002380952380952381d0) & + + in(i-2,j+7) * (-0.005952380952380952d0) & + + in(i-1,j+7) * (-0.03571428571428571d0) & + + in(i+1,j+7) * (0.0003924646781789639d0) & + + in(i+2,j+7) * (0.0003924646781789639d0) & + + in(i+3,j+7) * (0.0003924646781789639d0) & + + in(i+4,j+7) * (0.0003924646781789639d0) & + + in(i+5,j+7) * (0.0003924646781789639d0) & + + in(i+6,j+7) * (0.0003924646781789639d0) & + + in(i+7,j+7) * (0.00510204081632653d0) & +0.0 end do end do @@ -876,206 +876,206 @@ subroutine grid8(n, in, out) do i=8,n-8-1 do j=8,n-8-1 out(i,j) = out(i,j) & - + in(i-8,j-8) * (-0.00390625) & - + in(i+1,j-8) * (-0.00026041666666666666) & - + in(i+2,j-8) * (-0.00026041666666666666) & - + in(i+3,j-8) * (-0.00026041666666666666) & - + in(i+4,j-8) * (-0.00026041666666666666) & - + in(i+5,j-8) * (-0.00026041666666666666) & - + in(i+6,j-8) * (-0.00026041666666666666) & - + in(i+7,j-8) * (-0.00026041666666666666) & - + in(i+8,j-8) * (-0.00026041666666666666) & - + in(i-7,j-7) * (-0.004464285714285714) & - + in(i+1,j-7) * (-0.00034340659340659343) & - + in(i+2,j-7) * (-0.00034340659340659343) & - + in(i+3,j-7) * (-0.00034340659340659343) & - + in(i+4,j-7) * (-0.00034340659340659343) & - + in(i+5,j-7) * (-0.00034340659340659343) & - + in(i+6,j-7) * (-0.00034340659340659343) & - + 
in(i+7,j-7) * (-0.00034340659340659343) & - + in(i+8,j-7) * (-0.00034340659340659343) & - + in(i-6,j-6) * (-0.005208333333333333) & - + in(i+1,j-6) * (-0.0004734848484848485) & - + in(i+2,j-6) * (-0.0004734848484848485) & - + in(i+3,j-6) * (-0.0004734848484848485) & - + in(i+4,j-6) * (-0.0004734848484848485) & - + in(i+5,j-6) * (-0.0004734848484848485) & - + in(i+6,j-6) * (-0.0004734848484848485) & - + in(i+7,j-6) * (-0.0004734848484848485) & - + in(i+8,j-6) * (-0.0004734848484848485) & - + in(i-5,j-5) * (-0.00625) & - + in(i+1,j-5) * (-0.0006944444444444445) & - + in(i+2,j-5) * (-0.0006944444444444445) & - + in(i+3,j-5) * (-0.0006944444444444445) & - + in(i+4,j-5) * (-0.0006944444444444445) & - + in(i+5,j-5) * (-0.0006944444444444445) & - + in(i+6,j-5) * (-0.0006944444444444445) & - + in(i+7,j-5) * (-0.0006944444444444445) & - + in(i+8,j-5) * (-0.0006944444444444445) & - + in(i-4,j-4) * (-0.0078125) & - + in(i+1,j-4) * (-0.0011160714285714285) & - + in(i+2,j-4) * (-0.0011160714285714285) & - + in(i+3,j-4) * (-0.0011160714285714285) & - + in(i+4,j-4) * (-0.0011160714285714285) & - + in(i+5,j-4) * (-0.0011160714285714285) & - + in(i+6,j-4) * (-0.0011160714285714285) & - + in(i+7,j-4) * (-0.0011160714285714285) & - + in(i+8,j-4) * (-0.0011160714285714285) & - + in(i-3,j-3) * (-0.010416666666666666) & - + in(i+1,j-3) * (-0.0020833333333333333) & - + in(i+2,j-3) * (-0.0020833333333333333) & - + in(i+3,j-3) * (-0.0020833333333333333) & - + in(i+4,j-3) * (-0.0020833333333333333) & - + in(i+5,j-3) * (-0.0020833333333333333) & - + in(i+6,j-3) * (-0.0020833333333333333) & - + in(i+7,j-3) * (-0.0020833333333333333) & - + in(i+8,j-3) * (-0.0020833333333333333) & - + in(i-2,j-2) * (-0.015625) & - + in(i+1,j-2) * (-0.005208333333333333) & - + in(i+2,j-2) * (-0.005208333333333333) & - + in(i+3,j-2) * (-0.005208333333333333) & - + in(i+4,j-2) * (-0.005208333333333333) & - + in(i+5,j-2) * (-0.005208333333333333) & - + in(i+6,j-2) * (-0.005208333333333333) & - + in(i+7,j-2) * 
(-0.005208333333333333) & - + in(i+8,j-2) * (-0.005208333333333333) & - + in(i-1,j-1) * (-0.03125) & - + in(i+1,j-1) * (-0.03125) & - + in(i+2,j-1) * (-0.03125) & - + in(i+3,j-1) * (-0.03125) & - + in(i+4,j-1) * (-0.03125) & - + in(i+5,j-1) * (-0.03125) & - + in(i+6,j-1) * (-0.03125) & - + in(i+7,j-1) * (-0.03125) & - + in(i+8,j-1) * (-0.03125) & - + in(i-8,j+1) * (-0.00026041666666666666) & - + in(i-7,j+1) * (-0.00034340659340659343) & - + in(i-6,j+1) * (-0.0004734848484848485) & - + in(i-5,j+1) * (-0.0006944444444444445) & - + in(i-4,j+1) * (-0.0011160714285714285) & - + in(i-3,j+1) * (-0.0020833333333333333) & - + in(i-2,j+1) * (-0.005208333333333333) & - + in(i-1,j+1) * (-0.03125) & - + in(i+1,j+1) * (0.03125) & - + in(i+2,j+1) * (0.005208333333333333) & - + in(i+3,j+1) * (0.0020833333333333333) & - + in(i+4,j+1) * (0.0011160714285714285) & - + in(i+5,j+1) * (0.0006944444444444445) & - + in(i+6,j+1) * (0.0004734848484848485) & - + in(i+7,j+1) * (0.00034340659340659343) & - + in(i+8,j+1) * (0.00026041666666666666) & - + in(i-8,j+2) * (-0.00026041666666666666) & - + in(i-7,j+2) * (-0.00034340659340659343) & - + in(i-6,j+2) * (-0.0004734848484848485) & - + in(i-5,j+2) * (-0.0006944444444444445) & - + in(i-4,j+2) * (-0.0011160714285714285) & - + in(i-3,j+2) * (-0.0020833333333333333) & - + in(i-2,j+2) * (-0.005208333333333333) & - + in(i-1,j+2) * (-0.03125) & - + in(i+1,j+2) * (0.005208333333333333) & - + in(i+2,j+2) * (0.015625) & - + in(i+3,j+2) * (0.0020833333333333333) & - + in(i+4,j+2) * (0.0011160714285714285) & - + in(i+5,j+2) * (0.0006944444444444445) & - + in(i+6,j+2) * (0.0004734848484848485) & - + in(i+7,j+2) * (0.00034340659340659343) & - + in(i+8,j+2) * (0.00026041666666666666) & - + in(i-8,j+3) * (-0.00026041666666666666) & - + in(i-7,j+3) * (-0.00034340659340659343) & - + in(i-6,j+3) * (-0.0004734848484848485) & - + in(i-5,j+3) * (-0.0006944444444444445) & - + in(i-4,j+3) * (-0.0011160714285714285) & - + in(i-3,j+3) * (-0.0020833333333333333) & - + 
in(i-2,j+3) * (-0.005208333333333333) & - + in(i-1,j+3) * (-0.03125) & - + in(i+1,j+3) * (0.0020833333333333333) & - + in(i+2,j+3) * (0.0020833333333333333) & - + in(i+3,j+3) * (0.010416666666666666) & - + in(i+4,j+3) * (0.0011160714285714285) & - + in(i+5,j+3) * (0.0006944444444444445) & - + in(i+6,j+3) * (0.0004734848484848485) & - + in(i+7,j+3) * (0.00034340659340659343) & - + in(i+8,j+3) * (0.00026041666666666666) & - + in(i-8,j+4) * (-0.00026041666666666666) & - + in(i-7,j+4) * (-0.00034340659340659343) & - + in(i-6,j+4) * (-0.0004734848484848485) & - + in(i-5,j+4) * (-0.0006944444444444445) & - + in(i-4,j+4) * (-0.0011160714285714285) & - + in(i-3,j+4) * (-0.0020833333333333333) & - + in(i-2,j+4) * (-0.005208333333333333) & - + in(i-1,j+4) * (-0.03125) & - + in(i+1,j+4) * (0.0011160714285714285) & - + in(i+2,j+4) * (0.0011160714285714285) & - + in(i+3,j+4) * (0.0011160714285714285) & - + in(i+4,j+4) * (0.0078125) & - + in(i+5,j+4) * (0.0006944444444444445) & - + in(i+6,j+4) * (0.0004734848484848485) & - + in(i+7,j+4) * (0.00034340659340659343) & - + in(i+8,j+4) * (0.00026041666666666666) & - + in(i-8,j+5) * (-0.00026041666666666666) & - + in(i-7,j+5) * (-0.00034340659340659343) & - + in(i-6,j+5) * (-0.0004734848484848485) & - + in(i-5,j+5) * (-0.0006944444444444445) & - + in(i-4,j+5) * (-0.0011160714285714285) & - + in(i-3,j+5) * (-0.0020833333333333333) & - + in(i-2,j+5) * (-0.005208333333333333) & - + in(i-1,j+5) * (-0.03125) & - + in(i+1,j+5) * (0.0006944444444444445) & - + in(i+2,j+5) * (0.0006944444444444445) & - + in(i+3,j+5) * (0.0006944444444444445) & - + in(i+4,j+5) * (0.0006944444444444445) & - + in(i+5,j+5) * (0.00625) & - + in(i+6,j+5) * (0.0004734848484848485) & - + in(i+7,j+5) * (0.00034340659340659343) & - + in(i+8,j+5) * (0.00026041666666666666) & - + in(i-8,j+6) * (-0.00026041666666666666) & - + in(i-7,j+6) * (-0.00034340659340659343) & - + in(i-6,j+6) * (-0.0004734848484848485) & - + in(i-5,j+6) * (-0.0006944444444444445) & - + in(i-4,j+6) * 
(-0.0011160714285714285) & - + in(i-3,j+6) * (-0.0020833333333333333) & - + in(i-2,j+6) * (-0.005208333333333333) & - + in(i-1,j+6) * (-0.03125) & - + in(i+1,j+6) * (0.0004734848484848485) & - + in(i+2,j+6) * (0.0004734848484848485) & - + in(i+3,j+6) * (0.0004734848484848485) & - + in(i+4,j+6) * (0.0004734848484848485) & - + in(i+5,j+6) * (0.0004734848484848485) & - + in(i+6,j+6) * (0.005208333333333333) & - + in(i+7,j+6) * (0.00034340659340659343) & - + in(i+8,j+6) * (0.00026041666666666666) & - + in(i-8,j+7) * (-0.00026041666666666666) & - + in(i-7,j+7) * (-0.00034340659340659343) & - + in(i-6,j+7) * (-0.0004734848484848485) & - + in(i-5,j+7) * (-0.0006944444444444445) & - + in(i-4,j+7) * (-0.0011160714285714285) & - + in(i-3,j+7) * (-0.0020833333333333333) & - + in(i-2,j+7) * (-0.005208333333333333) & - + in(i-1,j+7) * (-0.03125) & - + in(i+1,j+7) * (0.00034340659340659343) & - + in(i+2,j+7) * (0.00034340659340659343) & - + in(i+3,j+7) * (0.00034340659340659343) & - + in(i+4,j+7) * (0.00034340659340659343) & - + in(i+5,j+7) * (0.00034340659340659343) & - + in(i+6,j+7) * (0.00034340659340659343) & - + in(i+7,j+7) * (0.004464285714285714) & - + in(i+8,j+7) * (0.00026041666666666666) & - + in(i-8,j+8) * (-0.00026041666666666666) & - + in(i-7,j+8) * (-0.00034340659340659343) & - + in(i-6,j+8) * (-0.0004734848484848485) & - + in(i-5,j+8) * (-0.0006944444444444445) & - + in(i-4,j+8) * (-0.0011160714285714285) & - + in(i-3,j+8) * (-0.0020833333333333333) & - + in(i-2,j+8) * (-0.005208333333333333) & - + in(i-1,j+8) * (-0.03125) & - + in(i+1,j+8) * (0.00026041666666666666) & - + in(i+2,j+8) * (0.00026041666666666666) & - + in(i+3,j+8) * (0.00026041666666666666) & - + in(i+4,j+8) * (0.00026041666666666666) & - + in(i+5,j+8) * (0.00026041666666666666) & - + in(i+6,j+8) * (0.00026041666666666666) & - + in(i+7,j+8) * (0.00026041666666666666) & - + in(i+8,j+8) * (0.00390625) & + + in(i-8,j-8) * (-0.00390625d0) & + + in(i+1,j-8) * (-0.00026041666666666666d0) & + + in(i+2,j-8) 
* (-0.00026041666666666666d0) & + + in(i+3,j-8) * (-0.00026041666666666666d0) & + + in(i+4,j-8) * (-0.00026041666666666666d0) & + + in(i+5,j-8) * (-0.00026041666666666666d0) & + + in(i+6,j-8) * (-0.00026041666666666666d0) & + + in(i+7,j-8) * (-0.00026041666666666666d0) & + + in(i+8,j-8) * (-0.00026041666666666666d0) & + + in(i-7,j-7) * (-0.004464285714285714d0) & + + in(i+1,j-7) * (-0.00034340659340659343d0) & + + in(i+2,j-7) * (-0.00034340659340659343d0) & + + in(i+3,j-7) * (-0.00034340659340659343d0) & + + in(i+4,j-7) * (-0.00034340659340659343d0) & + + in(i+5,j-7) * (-0.00034340659340659343d0) & + + in(i+6,j-7) * (-0.00034340659340659343d0) & + + in(i+7,j-7) * (-0.00034340659340659343d0) & + + in(i+8,j-7) * (-0.00034340659340659343d0) & + + in(i-6,j-6) * (-0.005208333333333333d0) & + + in(i+1,j-6) * (-0.0004734848484848485d0) & + + in(i+2,j-6) * (-0.0004734848484848485d0) & + + in(i+3,j-6) * (-0.0004734848484848485d0) & + + in(i+4,j-6) * (-0.0004734848484848485d0) & + + in(i+5,j-6) * (-0.0004734848484848485d0) & + + in(i+6,j-6) * (-0.0004734848484848485d0) & + + in(i+7,j-6) * (-0.0004734848484848485d0) & + + in(i+8,j-6) * (-0.0004734848484848485d0) & + + in(i-5,j-5) * (-0.00625d0) & + + in(i+1,j-5) * (-0.0006944444444444445d0) & + + in(i+2,j-5) * (-0.0006944444444444445d0) & + + in(i+3,j-5) * (-0.0006944444444444445d0) & + + in(i+4,j-5) * (-0.0006944444444444445d0) & + + in(i+5,j-5) * (-0.0006944444444444445d0) & + + in(i+6,j-5) * (-0.0006944444444444445d0) & + + in(i+7,j-5) * (-0.0006944444444444445d0) & + + in(i+8,j-5) * (-0.0006944444444444445d0) & + + in(i-4,j-4) * (-0.0078125d0) & + + in(i+1,j-4) * (-0.0011160714285714285d0) & + + in(i+2,j-4) * (-0.0011160714285714285d0) & + + in(i+3,j-4) * (-0.0011160714285714285d0) & + + in(i+4,j-4) * (-0.0011160714285714285d0) & + + in(i+5,j-4) * (-0.0011160714285714285d0) & + + in(i+6,j-4) * (-0.0011160714285714285d0) & + + in(i+7,j-4) * (-0.0011160714285714285d0) & + + in(i+8,j-4) * (-0.0011160714285714285d0) & + + 
in(i-3,j-3) * (-0.010416666666666666d0) & + + in(i+1,j-3) * (-0.0020833333333333333d0) & + + in(i+2,j-3) * (-0.0020833333333333333d0) & + + in(i+3,j-3) * (-0.0020833333333333333d0) & + + in(i+4,j-3) * (-0.0020833333333333333d0) & + + in(i+5,j-3) * (-0.0020833333333333333d0) & + + in(i+6,j-3) * (-0.0020833333333333333d0) & + + in(i+7,j-3) * (-0.0020833333333333333d0) & + + in(i+8,j-3) * (-0.0020833333333333333d0) & + + in(i-2,j-2) * (-0.015625d0) & + + in(i+1,j-2) * (-0.005208333333333333d0) & + + in(i+2,j-2) * (-0.005208333333333333d0) & + + in(i+3,j-2) * (-0.005208333333333333d0) & + + in(i+4,j-2) * (-0.005208333333333333d0) & + + in(i+5,j-2) * (-0.005208333333333333d0) & + + in(i+6,j-2) * (-0.005208333333333333d0) & + + in(i+7,j-2) * (-0.005208333333333333d0) & + + in(i+8,j-2) * (-0.005208333333333333d0) & + + in(i-1,j-1) * (-0.03125d0) & + + in(i+1,j-1) * (-0.03125d0) & + + in(i+2,j-1) * (-0.03125d0) & + + in(i+3,j-1) * (-0.03125d0) & + + in(i+4,j-1) * (-0.03125d0) & + + in(i+5,j-1) * (-0.03125d0) & + + in(i+6,j-1) * (-0.03125d0) & + + in(i+7,j-1) * (-0.03125d0) & + + in(i+8,j-1) * (-0.03125d0) & + + in(i-8,j+1) * (-0.00026041666666666666d0) & + + in(i-7,j+1) * (-0.00034340659340659343d0) & + + in(i-6,j+1) * (-0.0004734848484848485d0) & + + in(i-5,j+1) * (-0.0006944444444444445d0) & + + in(i-4,j+1) * (-0.0011160714285714285d0) & + + in(i-3,j+1) * (-0.0020833333333333333d0) & + + in(i-2,j+1) * (-0.005208333333333333d0) & + + in(i-1,j+1) * (-0.03125d0) & + + in(i+1,j+1) * (0.03125d0) & + + in(i+2,j+1) * (0.005208333333333333d0) & + + in(i+3,j+1) * (0.0020833333333333333d0) & + + in(i+4,j+1) * (0.0011160714285714285d0) & + + in(i+5,j+1) * (0.0006944444444444445d0) & + + in(i+6,j+1) * (0.0004734848484848485d0) & + + in(i+7,j+1) * (0.00034340659340659343d0) & + + in(i+8,j+1) * (0.00026041666666666666d0) & + + in(i-8,j+2) * (-0.00026041666666666666d0) & + + in(i-7,j+2) * (-0.00034340659340659343d0) & + + in(i-6,j+2) * (-0.0004734848484848485d0) & + + in(i-5,j+2) * 
(-0.0006944444444444445d0) & + + in(i-4,j+2) * (-0.0011160714285714285d0) & + + in(i-3,j+2) * (-0.0020833333333333333d0) & + + in(i-2,j+2) * (-0.005208333333333333d0) & + + in(i-1,j+2) * (-0.03125d0) & + + in(i+1,j+2) * (0.005208333333333333d0) & + + in(i+2,j+2) * (0.015625d0) & + + in(i+3,j+2) * (0.0020833333333333333d0) & + + in(i+4,j+2) * (0.0011160714285714285d0) & + + in(i+5,j+2) * (0.0006944444444444445d0) & + + in(i+6,j+2) * (0.0004734848484848485d0) & + + in(i+7,j+2) * (0.00034340659340659343d0) & + + in(i+8,j+2) * (0.00026041666666666666d0) & + + in(i-8,j+3) * (-0.00026041666666666666d0) & + + in(i-7,j+3) * (-0.00034340659340659343d0) & + + in(i-6,j+3) * (-0.0004734848484848485d0) & + + in(i-5,j+3) * (-0.0006944444444444445d0) & + + in(i-4,j+3) * (-0.0011160714285714285d0) & + + in(i-3,j+3) * (-0.0020833333333333333d0) & + + in(i-2,j+3) * (-0.005208333333333333d0) & + + in(i-1,j+3) * (-0.03125d0) & + + in(i+1,j+3) * (0.0020833333333333333d0) & + + in(i+2,j+3) * (0.0020833333333333333d0) & + + in(i+3,j+3) * (0.010416666666666666d0) & + + in(i+4,j+3) * (0.0011160714285714285d0) & + + in(i+5,j+3) * (0.0006944444444444445d0) & + + in(i+6,j+3) * (0.0004734848484848485d0) & + + in(i+7,j+3) * (0.00034340659340659343d0) & + + in(i+8,j+3) * (0.00026041666666666666d0) & + + in(i-8,j+4) * (-0.00026041666666666666d0) & + + in(i-7,j+4) * (-0.00034340659340659343d0) & + + in(i-6,j+4) * (-0.0004734848484848485d0) & + + in(i-5,j+4) * (-0.0006944444444444445d0) & + + in(i-4,j+4) * (-0.0011160714285714285d0) & + + in(i-3,j+4) * (-0.0020833333333333333d0) & + + in(i-2,j+4) * (-0.005208333333333333d0) & + + in(i-1,j+4) * (-0.03125d0) & + + in(i+1,j+4) * (0.0011160714285714285d0) & + + in(i+2,j+4) * (0.0011160714285714285d0) & + + in(i+3,j+4) * (0.0011160714285714285d0) & + + in(i+4,j+4) * (0.0078125d0) & + + in(i+5,j+4) * (0.0006944444444444445d0) & + + in(i+6,j+4) * (0.0004734848484848485d0) & + + in(i+7,j+4) * (0.00034340659340659343d0) & + + in(i+8,j+4) * 
(0.00026041666666666666d0) & + + in(i-8,j+5) * (-0.00026041666666666666d0) & + + in(i-7,j+5) * (-0.00034340659340659343d0) & + + in(i-6,j+5) * (-0.0004734848484848485d0) & + + in(i-5,j+5) * (-0.0006944444444444445d0) & + + in(i-4,j+5) * (-0.0011160714285714285d0) & + + in(i-3,j+5) * (-0.0020833333333333333d0) & + + in(i-2,j+5) * (-0.005208333333333333d0) & + + in(i-1,j+5) * (-0.03125d0) & + + in(i+1,j+5) * (0.0006944444444444445d0) & + + in(i+2,j+5) * (0.0006944444444444445d0) & + + in(i+3,j+5) * (0.0006944444444444445d0) & + + in(i+4,j+5) * (0.0006944444444444445d0) & + + in(i+5,j+5) * (0.00625d0) & + + in(i+6,j+5) * (0.0004734848484848485d0) & + + in(i+7,j+5) * (0.00034340659340659343d0) & + + in(i+8,j+5) * (0.00026041666666666666d0) & + + in(i-8,j+6) * (-0.00026041666666666666d0) & + + in(i-7,j+6) * (-0.00034340659340659343d0) & + + in(i-6,j+6) * (-0.0004734848484848485d0) & + + in(i-5,j+6) * (-0.0006944444444444445d0) & + + in(i-4,j+6) * (-0.0011160714285714285d0) & + + in(i-3,j+6) * (-0.0020833333333333333d0) & + + in(i-2,j+6) * (-0.005208333333333333d0) & + + in(i-1,j+6) * (-0.03125d0) & + + in(i+1,j+6) * (0.0004734848484848485d0) & + + in(i+2,j+6) * (0.0004734848484848485d0) & + + in(i+3,j+6) * (0.0004734848484848485d0) & + + in(i+4,j+6) * (0.0004734848484848485d0) & + + in(i+5,j+6) * (0.0004734848484848485d0) & + + in(i+6,j+6) * (0.005208333333333333d0) & + + in(i+7,j+6) * (0.00034340659340659343d0) & + + in(i+8,j+6) * (0.00026041666666666666d0) & + + in(i-8,j+7) * (-0.00026041666666666666d0) & + + in(i-7,j+7) * (-0.00034340659340659343d0) & + + in(i-6,j+7) * (-0.0004734848484848485d0) & + + in(i-5,j+7) * (-0.0006944444444444445d0) & + + in(i-4,j+7) * (-0.0011160714285714285d0) & + + in(i-3,j+7) * (-0.0020833333333333333d0) & + + in(i-2,j+7) * (-0.005208333333333333d0) & + + in(i-1,j+7) * (-0.03125d0) & + + in(i+1,j+7) * (0.00034340659340659343d0) & + + in(i+2,j+7) * (0.00034340659340659343d0) & + + in(i+3,j+7) * (0.00034340659340659343d0) & + + in(i+4,j+7) 
* (0.00034340659340659343d0) & + + in(i+5,j+7) * (0.00034340659340659343d0) & + + in(i+6,j+7) * (0.00034340659340659343d0) & + + in(i+7,j+7) * (0.004464285714285714d0) & + + in(i+8,j+7) * (0.00026041666666666666d0) & + + in(i-8,j+8) * (-0.00026041666666666666d0) & + + in(i-7,j+8) * (-0.00034340659340659343d0) & + + in(i-6,j+8) * (-0.0004734848484848485d0) & + + in(i-5,j+8) * (-0.0006944444444444445d0) & + + in(i-4,j+8) * (-0.0011160714285714285d0) & + + in(i-3,j+8) * (-0.0020833333333333333d0) & + + in(i-2,j+8) * (-0.005208333333333333d0) & + + in(i-1,j+8) * (-0.03125d0) & + + in(i+1,j+8) * (0.00026041666666666666d0) & + + in(i+2,j+8) * (0.00026041666666666666d0) & + + in(i+3,j+8) * (0.00026041666666666666d0) & + + in(i+4,j+8) * (0.00026041666666666666d0) & + + in(i+5,j+8) * (0.00026041666666666666d0) & + + in(i+6,j+8) * (0.00026041666666666666d0) & + + in(i+7,j+8) * (0.00026041666666666666d0) & + + in(i+8,j+8) * (0.00390625d0) & +0.0 end do end do @@ -1091,258 +1091,258 @@ subroutine grid9(n, in, out) do i=9,n-9-1 do j=9,n-9-1 out(i,j) = out(i,j) & - + in(i-9,j-9) * (-0.0030864197530864196) & - + in(i+1,j-9) * (-0.00018155410312273057) & - + in(i+2,j-9) * (-0.00018155410312273057) & - + in(i+3,j-9) * (-0.00018155410312273057) & - + in(i+4,j-9) * (-0.00018155410312273057) & - + in(i+5,j-9) * (-0.00018155410312273057) & - + in(i+6,j-9) * (-0.00018155410312273057) & - + in(i+7,j-9) * (-0.00018155410312273057) & - + in(i+8,j-9) * (-0.00018155410312273057) & - + in(i+9,j-9) * (-0.00018155410312273057) & - + in(i-8,j-8) * (-0.003472222222222222) & - + in(i+1,j-8) * (-0.0002314814814814815) & - + in(i+2,j-8) * (-0.0002314814814814815) & - + in(i+3,j-8) * (-0.0002314814814814815) & - + in(i+4,j-8) * (-0.0002314814814814815) & - + in(i+5,j-8) * (-0.0002314814814814815) & - + in(i+6,j-8) * (-0.0002314814814814815) & - + in(i+7,j-8) * (-0.0002314814814814815) & - + in(i+8,j-8) * (-0.0002314814814814815) & - + in(i+9,j-8) * (-0.0002314814814814815) & - + in(i-7,j-7) * 
(-0.003968253968253968) & - + in(i+1,j-7) * (-0.00030525030525030525) & - + in(i+2,j-7) * (-0.00030525030525030525) & - + in(i+3,j-7) * (-0.00030525030525030525) & - + in(i+4,j-7) * (-0.00030525030525030525) & - + in(i+5,j-7) * (-0.00030525030525030525) & - + in(i+6,j-7) * (-0.00030525030525030525) & - + in(i+7,j-7) * (-0.00030525030525030525) & - + in(i+8,j-7) * (-0.00030525030525030525) & - + in(i+9,j-7) * (-0.00030525030525030525) & - + in(i-6,j-6) * (-0.004629629629629629) & - + in(i+1,j-6) * (-0.00042087542087542086) & - + in(i+2,j-6) * (-0.00042087542087542086) & - + in(i+3,j-6) * (-0.00042087542087542086) & - + in(i+4,j-6) * (-0.00042087542087542086) & - + in(i+5,j-6) * (-0.00042087542087542086) & - + in(i+6,j-6) * (-0.00042087542087542086) & - + in(i+7,j-6) * (-0.00042087542087542086) & - + in(i+8,j-6) * (-0.00042087542087542086) & - + in(i+9,j-6) * (-0.00042087542087542086) & - + in(i-5,j-5) * (-0.005555555555555556) & - + in(i+1,j-5) * (-0.0006172839506172839) & - + in(i+2,j-5) * (-0.0006172839506172839) & - + in(i+3,j-5) * (-0.0006172839506172839) & - + in(i+4,j-5) * (-0.0006172839506172839) & - + in(i+5,j-5) * (-0.0006172839506172839) & - + in(i+6,j-5) * (-0.0006172839506172839) & - + in(i+7,j-5) * (-0.0006172839506172839) & - + in(i+8,j-5) * (-0.0006172839506172839) & - + in(i+9,j-5) * (-0.0006172839506172839) & - + in(i-4,j-4) * (-0.006944444444444444) & - + in(i+1,j-4) * (-0.000992063492063492) & - + in(i+2,j-4) * (-0.000992063492063492) & - + in(i+3,j-4) * (-0.000992063492063492) & - + in(i+4,j-4) * (-0.000992063492063492) & - + in(i+5,j-4) * (-0.000992063492063492) & - + in(i+6,j-4) * (-0.000992063492063492) & - + in(i+7,j-4) * (-0.000992063492063492) & - + in(i+8,j-4) * (-0.000992063492063492) & - + in(i+9,j-4) * (-0.000992063492063492) & - + in(i-3,j-3) * (-0.009259259259259259) & - + in(i+1,j-3) * (-0.001851851851851852) & - + in(i+2,j-3) * (-0.001851851851851852) & - + in(i+3,j-3) * (-0.001851851851851852) & - + in(i+4,j-3) * 
(-0.001851851851851852) & - + in(i+5,j-3) * (-0.001851851851851852) & - + in(i+6,j-3) * (-0.001851851851851852) & - + in(i+7,j-3) * (-0.001851851851851852) & - + in(i+8,j-3) * (-0.001851851851851852) & - + in(i+9,j-3) * (-0.001851851851851852) & - + in(i-2,j-2) * (-0.013888888888888888) & - + in(i+1,j-2) * (-0.004629629629629629) & - + in(i+2,j-2) * (-0.004629629629629629) & - + in(i+3,j-2) * (-0.004629629629629629) & - + in(i+4,j-2) * (-0.004629629629629629) & - + in(i+5,j-2) * (-0.004629629629629629) & - + in(i+6,j-2) * (-0.004629629629629629) & - + in(i+7,j-2) * (-0.004629629629629629) & - + in(i+8,j-2) * (-0.004629629629629629) & - + in(i+9,j-2) * (-0.004629629629629629) & - + in(i-1,j-1) * (-0.027777777777777776) & - + in(i+1,j-1) * (-0.027777777777777776) & - + in(i+2,j-1) * (-0.027777777777777776) & - + in(i+3,j-1) * (-0.027777777777777776) & - + in(i+4,j-1) * (-0.027777777777777776) & - + in(i+5,j-1) * (-0.027777777777777776) & - + in(i+6,j-1) * (-0.027777777777777776) & - + in(i+7,j-1) * (-0.027777777777777776) & - + in(i+8,j-1) * (-0.027777777777777776) & - + in(i+9,j-1) * (-0.027777777777777776) & - + in(i-9,j+1) * (-0.00018155410312273057) & - + in(i-8,j+1) * (-0.0002314814814814815) & - + in(i-7,j+1) * (-0.00030525030525030525) & - + in(i-6,j+1) * (-0.00042087542087542086) & - + in(i-5,j+1) * (-0.0006172839506172839) & - + in(i-4,j+1) * (-0.000992063492063492) & - + in(i-3,j+1) * (-0.001851851851851852) & - + in(i-2,j+1) * (-0.004629629629629629) & - + in(i-1,j+1) * (-0.027777777777777776) & - + in(i+1,j+1) * (0.027777777777777776) & - + in(i+2,j+1) * (0.004629629629629629) & - + in(i+3,j+1) * (0.001851851851851852) & - + in(i+4,j+1) * (0.000992063492063492) & - + in(i+5,j+1) * (0.0006172839506172839) & - + in(i+6,j+1) * (0.00042087542087542086) & - + in(i+7,j+1) * (0.00030525030525030525) & - + in(i+8,j+1) * (0.0002314814814814815) & - + in(i+9,j+1) * (0.00018155410312273057) & - + in(i-9,j+2) * (-0.00018155410312273057) & - + in(i-8,j+2) * 
(-0.0002314814814814815) & - + in(i-7,j+2) * (-0.00030525030525030525) & - + in(i-6,j+2) * (-0.00042087542087542086) & - + in(i-5,j+2) * (-0.0006172839506172839) & - + in(i-4,j+2) * (-0.000992063492063492) & - + in(i-3,j+2) * (-0.001851851851851852) & - + in(i-2,j+2) * (-0.004629629629629629) & - + in(i-1,j+2) * (-0.027777777777777776) & - + in(i+1,j+2) * (0.004629629629629629) & - + in(i+2,j+2) * (0.013888888888888888) & - + in(i+3,j+2) * (0.001851851851851852) & - + in(i+4,j+2) * (0.000992063492063492) & - + in(i+5,j+2) * (0.0006172839506172839) & - + in(i+6,j+2) * (0.00042087542087542086) & - + in(i+7,j+2) * (0.00030525030525030525) & - + in(i+8,j+2) * (0.0002314814814814815) & - + in(i+9,j+2) * (0.00018155410312273057) & - + in(i-9,j+3) * (-0.00018155410312273057) & - + in(i-8,j+3) * (-0.0002314814814814815) & - + in(i-7,j+3) * (-0.00030525030525030525) & - + in(i-6,j+3) * (-0.00042087542087542086) & - + in(i-5,j+3) * (-0.0006172839506172839) & - + in(i-4,j+3) * (-0.000992063492063492) & - + in(i-3,j+3) * (-0.001851851851851852) & - + in(i-2,j+3) * (-0.004629629629629629) & - + in(i-1,j+3) * (-0.027777777777777776) & - + in(i+1,j+3) * (0.001851851851851852) & - + in(i+2,j+3) * (0.001851851851851852) & - + in(i+3,j+3) * (0.009259259259259259) & - + in(i+4,j+3) * (0.000992063492063492) & - + in(i+5,j+3) * (0.0006172839506172839) & - + in(i+6,j+3) * (0.00042087542087542086) & - + in(i+7,j+3) * (0.00030525030525030525) & - + in(i+8,j+3) * (0.0002314814814814815) & - + in(i+9,j+3) * (0.00018155410312273057) & - + in(i-9,j+4) * (-0.00018155410312273057) & - + in(i-8,j+4) * (-0.0002314814814814815) & - + in(i-7,j+4) * (-0.00030525030525030525) & - + in(i-6,j+4) * (-0.00042087542087542086) & - + in(i-5,j+4) * (-0.0006172839506172839) & - + in(i-4,j+4) * (-0.000992063492063492) & - + in(i-3,j+4) * (-0.001851851851851852) & - + in(i-2,j+4) * (-0.004629629629629629) & - + in(i-1,j+4) * (-0.027777777777777776) & - + in(i+1,j+4) * (0.000992063492063492) & - + in(i+2,j+4) * 
(0.000992063492063492) & - + in(i+3,j+4) * (0.000992063492063492) & - + in(i+4,j+4) * (0.006944444444444444) & - + in(i+5,j+4) * (0.0006172839506172839) & - + in(i+6,j+4) * (0.00042087542087542086) & - + in(i+7,j+4) * (0.00030525030525030525) & - + in(i+8,j+4) * (0.0002314814814814815) & - + in(i+9,j+4) * (0.00018155410312273057) & - + in(i-9,j+5) * (-0.00018155410312273057) & - + in(i-8,j+5) * (-0.0002314814814814815) & - + in(i-7,j+5) * (-0.00030525030525030525) & - + in(i-6,j+5) * (-0.00042087542087542086) & - + in(i-5,j+5) * (-0.0006172839506172839) & - + in(i-4,j+5) * (-0.000992063492063492) & - + in(i-3,j+5) * (-0.001851851851851852) & - + in(i-2,j+5) * (-0.004629629629629629) & - + in(i-1,j+5) * (-0.027777777777777776) & - + in(i+1,j+5) * (0.0006172839506172839) & - + in(i+2,j+5) * (0.0006172839506172839) & - + in(i+3,j+5) * (0.0006172839506172839) & - + in(i+4,j+5) * (0.0006172839506172839) & - + in(i+5,j+5) * (0.005555555555555556) & - + in(i+6,j+5) * (0.00042087542087542086) & - + in(i+7,j+5) * (0.00030525030525030525) & - + in(i+8,j+5) * (0.0002314814814814815) & - + in(i+9,j+5) * (0.00018155410312273057) & - + in(i-9,j+6) * (-0.00018155410312273057) & - + in(i-8,j+6) * (-0.0002314814814814815) & - + in(i-7,j+6) * (-0.00030525030525030525) & - + in(i-6,j+6) * (-0.00042087542087542086) & - + in(i-5,j+6) * (-0.0006172839506172839) & - + in(i-4,j+6) * (-0.000992063492063492) & - + in(i-3,j+6) * (-0.001851851851851852) & - + in(i-2,j+6) * (-0.004629629629629629) & - + in(i-1,j+6) * (-0.027777777777777776) & - + in(i+1,j+6) * (0.00042087542087542086) & - + in(i+2,j+6) * (0.00042087542087542086) & - + in(i+3,j+6) * (0.00042087542087542086) & - + in(i+4,j+6) * (0.00042087542087542086) & - + in(i+5,j+6) * (0.00042087542087542086) & - + in(i+6,j+6) * (0.004629629629629629) & - + in(i+7,j+6) * (0.00030525030525030525) & - + in(i+8,j+6) * (0.0002314814814814815) & - + in(i+9,j+6) * (0.00018155410312273057) & - + in(i-9,j+7) * (-0.00018155410312273057) & - + 
in(i-8,j+7) * (-0.0002314814814814815) & - + in(i-7,j+7) * (-0.00030525030525030525) & - + in(i-6,j+7) * (-0.00042087542087542086) & - + in(i-5,j+7) * (-0.0006172839506172839) & - + in(i-4,j+7) * (-0.000992063492063492) & - + in(i-3,j+7) * (-0.001851851851851852) & - + in(i-2,j+7) * (-0.004629629629629629) & - + in(i-1,j+7) * (-0.027777777777777776) & - + in(i+1,j+7) * (0.00030525030525030525) & - + in(i+2,j+7) * (0.00030525030525030525) & - + in(i+3,j+7) * (0.00030525030525030525) & - + in(i+4,j+7) * (0.00030525030525030525) & - + in(i+5,j+7) * (0.00030525030525030525) & - + in(i+6,j+7) * (0.00030525030525030525) & - + in(i+7,j+7) * (0.003968253968253968) & - + in(i+8,j+7) * (0.0002314814814814815) & - + in(i+9,j+7) * (0.00018155410312273057) & - + in(i-9,j+8) * (-0.00018155410312273057) & - + in(i-8,j+8) * (-0.0002314814814814815) & - + in(i-7,j+8) * (-0.00030525030525030525) & - + in(i-6,j+8) * (-0.00042087542087542086) & - + in(i-5,j+8) * (-0.0006172839506172839) & - + in(i-4,j+8) * (-0.000992063492063492) & - + in(i-3,j+8) * (-0.001851851851851852) & - + in(i-2,j+8) * (-0.004629629629629629) & - + in(i-1,j+8) * (-0.027777777777777776) & - + in(i+1,j+8) * (0.0002314814814814815) & - + in(i+2,j+8) * (0.0002314814814814815) & - + in(i+3,j+8) * (0.0002314814814814815) & - + in(i+4,j+8) * (0.0002314814814814815) & - + in(i+5,j+8) * (0.0002314814814814815) & - + in(i+6,j+8) * (0.0002314814814814815) & - + in(i+7,j+8) * (0.0002314814814814815) & - + in(i+8,j+8) * (0.003472222222222222) & - + in(i+9,j+8) * (0.00018155410312273057) & - + in(i-9,j+9) * (-0.00018155410312273057) & - + in(i-8,j+9) * (-0.0002314814814814815) & - + in(i-7,j+9) * (-0.00030525030525030525) & - + in(i-6,j+9) * (-0.00042087542087542086) & - + in(i-5,j+9) * (-0.0006172839506172839) & - + in(i-4,j+9) * (-0.000992063492063492) & - + in(i-3,j+9) * (-0.001851851851851852) & - + in(i-2,j+9) * (-0.004629629629629629) & - + in(i-1,j+9) * (-0.027777777777777776) & - + in(i+1,j+9) * 
(0.00018155410312273057) & - + in(i+2,j+9) * (0.00018155410312273057) & - + in(i+3,j+9) * (0.00018155410312273057) & - + in(i+4,j+9) * (0.00018155410312273057) & - + in(i+5,j+9) * (0.00018155410312273057) & - + in(i+6,j+9) * (0.00018155410312273057) & - + in(i+7,j+9) * (0.00018155410312273057) & - + in(i+8,j+9) * (0.00018155410312273057) & - + in(i+9,j+9) * (0.0030864197530864196) & + + in(i-9,j-9) * (-0.0030864197530864196d0) & + + in(i+1,j-9) * (-0.00018155410312273057d0) & + + in(i+2,j-9) * (-0.00018155410312273057d0) & + + in(i+3,j-9) * (-0.00018155410312273057d0) & + + in(i+4,j-9) * (-0.00018155410312273057d0) & + + in(i+5,j-9) * (-0.00018155410312273057d0) & + + in(i+6,j-9) * (-0.00018155410312273057d0) & + + in(i+7,j-9) * (-0.00018155410312273057d0) & + + in(i+8,j-9) * (-0.00018155410312273057d0) & + + in(i+9,j-9) * (-0.00018155410312273057d0) & + + in(i-8,j-8) * (-0.003472222222222222d0) & + + in(i+1,j-8) * (-0.0002314814814814815d0) & + + in(i+2,j-8) * (-0.0002314814814814815d0) & + + in(i+3,j-8) * (-0.0002314814814814815d0) & + + in(i+4,j-8) * (-0.0002314814814814815d0) & + + in(i+5,j-8) * (-0.0002314814814814815d0) & + + in(i+6,j-8) * (-0.0002314814814814815d0) & + + in(i+7,j-8) * (-0.0002314814814814815d0) & + + in(i+8,j-8) * (-0.0002314814814814815d0) & + + in(i+9,j-8) * (-0.0002314814814814815d0) & + + in(i-7,j-7) * (-0.003968253968253968d0) & + + in(i+1,j-7) * (-0.00030525030525030525d0) & + + in(i+2,j-7) * (-0.00030525030525030525d0) & + + in(i+3,j-7) * (-0.00030525030525030525d0) & + + in(i+4,j-7) * (-0.00030525030525030525d0) & + + in(i+5,j-7) * (-0.00030525030525030525d0) & + + in(i+6,j-7) * (-0.00030525030525030525d0) & + + in(i+7,j-7) * (-0.00030525030525030525d0) & + + in(i+8,j-7) * (-0.00030525030525030525d0) & + + in(i+9,j-7) * (-0.00030525030525030525d0) & + + in(i-6,j-6) * (-0.004629629629629629d0) & + + in(i+1,j-6) * (-0.00042087542087542086d0) & + + in(i+2,j-6) * (-0.00042087542087542086d0) & + + in(i+3,j-6) * (-0.00042087542087542086d0) 
& + + in(i+4,j-6) * (-0.00042087542087542086d0) & + + in(i+5,j-6) * (-0.00042087542087542086d0) & + + in(i+6,j-6) * (-0.00042087542087542086d0) & + + in(i+7,j-6) * (-0.00042087542087542086d0) & + + in(i+8,j-6) * (-0.00042087542087542086d0) & + + in(i+9,j-6) * (-0.00042087542087542086d0) & + + in(i-5,j-5) * (-0.005555555555555556d0) & + + in(i+1,j-5) * (-0.0006172839506172839d0) & + + in(i+2,j-5) * (-0.0006172839506172839d0) & + + in(i+3,j-5) * (-0.0006172839506172839d0) & + + in(i+4,j-5) * (-0.0006172839506172839d0) & + + in(i+5,j-5) * (-0.0006172839506172839d0) & + + in(i+6,j-5) * (-0.0006172839506172839d0) & + + in(i+7,j-5) * (-0.0006172839506172839d0) & + + in(i+8,j-5) * (-0.0006172839506172839d0) & + + in(i+9,j-5) * (-0.0006172839506172839d0) & + + in(i-4,j-4) * (-0.006944444444444444d0) & + + in(i+1,j-4) * (-0.000992063492063492d0) & + + in(i+2,j-4) * (-0.000992063492063492d0) & + + in(i+3,j-4) * (-0.000992063492063492d0) & + + in(i+4,j-4) * (-0.000992063492063492d0) & + + in(i+5,j-4) * (-0.000992063492063492d0) & + + in(i+6,j-4) * (-0.000992063492063492d0) & + + in(i+7,j-4) * (-0.000992063492063492d0) & + + in(i+8,j-4) * (-0.000992063492063492d0) & + + in(i+9,j-4) * (-0.000992063492063492d0) & + + in(i-3,j-3) * (-0.009259259259259259d0) & + + in(i+1,j-3) * (-0.001851851851851852d0) & + + in(i+2,j-3) * (-0.001851851851851852d0) & + + in(i+3,j-3) * (-0.001851851851851852d0) & + + in(i+4,j-3) * (-0.001851851851851852d0) & + + in(i+5,j-3) * (-0.001851851851851852d0) & + + in(i+6,j-3) * (-0.001851851851851852d0) & + + in(i+7,j-3) * (-0.001851851851851852d0) & + + in(i+8,j-3) * (-0.001851851851851852d0) & + + in(i+9,j-3) * (-0.001851851851851852d0) & + + in(i-2,j-2) * (-0.013888888888888888d0) & + + in(i+1,j-2) * (-0.004629629629629629d0) & + + in(i+2,j-2) * (-0.004629629629629629d0) & + + in(i+3,j-2) * (-0.004629629629629629d0) & + + in(i+4,j-2) * (-0.004629629629629629d0) & + + in(i+5,j-2) * (-0.004629629629629629d0) & + + in(i+6,j-2) * (-0.004629629629629629d0) 
& + + in(i+7,j-2) * (-0.004629629629629629d0) & + + in(i+8,j-2) * (-0.004629629629629629d0) & + + in(i+9,j-2) * (-0.004629629629629629d0) & + + in(i-1,j-1) * (-0.027777777777777776d0) & + + in(i+1,j-1) * (-0.027777777777777776d0) & + + in(i+2,j-1) * (-0.027777777777777776d0) & + + in(i+3,j-1) * (-0.027777777777777776d0) & + + in(i+4,j-1) * (-0.027777777777777776d0) & + + in(i+5,j-1) * (-0.027777777777777776d0) & + + in(i+6,j-1) * (-0.027777777777777776d0) & + + in(i+7,j-1) * (-0.027777777777777776d0) & + + in(i+8,j-1) * (-0.027777777777777776d0) & + + in(i+9,j-1) * (-0.027777777777777776d0) & + + in(i-9,j+1) * (-0.00018155410312273057d0) & + + in(i-8,j+1) * (-0.0002314814814814815d0) & + + in(i-7,j+1) * (-0.00030525030525030525d0) & + + in(i-6,j+1) * (-0.00042087542087542086d0) & + + in(i-5,j+1) * (-0.0006172839506172839d0) & + + in(i-4,j+1) * (-0.000992063492063492d0) & + + in(i-3,j+1) * (-0.001851851851851852d0) & + + in(i-2,j+1) * (-0.004629629629629629d0) & + + in(i-1,j+1) * (-0.027777777777777776d0) & + + in(i+1,j+1) * (0.027777777777777776d0) & + + in(i+2,j+1) * (0.004629629629629629d0) & + + in(i+3,j+1) * (0.001851851851851852d0) & + + in(i+4,j+1) * (0.000992063492063492d0) & + + in(i+5,j+1) * (0.0006172839506172839d0) & + + in(i+6,j+1) * (0.00042087542087542086d0) & + + in(i+7,j+1) * (0.00030525030525030525d0) & + + in(i+8,j+1) * (0.0002314814814814815d0) & + + in(i+9,j+1) * (0.00018155410312273057d0) & + + in(i-9,j+2) * (-0.00018155410312273057d0) & + + in(i-8,j+2) * (-0.0002314814814814815d0) & + + in(i-7,j+2) * (-0.00030525030525030525d0) & + + in(i-6,j+2) * (-0.00042087542087542086d0) & + + in(i-5,j+2) * (-0.0006172839506172839d0) & + + in(i-4,j+2) * (-0.000992063492063492d0) & + + in(i-3,j+2) * (-0.001851851851851852d0) & + + in(i-2,j+2) * (-0.004629629629629629d0) & + + in(i-1,j+2) * (-0.027777777777777776d0) & + + in(i+1,j+2) * (0.004629629629629629d0) & + + in(i+2,j+2) * (0.013888888888888888d0) & + + in(i+3,j+2) * (0.001851851851851852d0) & + + 
in(i+4,j+2) * (0.000992063492063492d0) & + + in(i+5,j+2) * (0.0006172839506172839d0) & + + in(i+6,j+2) * (0.00042087542087542086d0) & + + in(i+7,j+2) * (0.00030525030525030525d0) & + + in(i+8,j+2) * (0.0002314814814814815d0) & + + in(i+9,j+2) * (0.00018155410312273057d0) & + + in(i-9,j+3) * (-0.00018155410312273057d0) & + + in(i-8,j+3) * (-0.0002314814814814815d0) & + + in(i-7,j+3) * (-0.00030525030525030525d0) & + + in(i-6,j+3) * (-0.00042087542087542086d0) & + + in(i-5,j+3) * (-0.0006172839506172839d0) & + + in(i-4,j+3) * (-0.000992063492063492d0) & + + in(i-3,j+3) * (-0.001851851851851852d0) & + + in(i-2,j+3) * (-0.004629629629629629d0) & + + in(i-1,j+3) * (-0.027777777777777776d0) & + + in(i+1,j+3) * (0.001851851851851852d0) & + + in(i+2,j+3) * (0.001851851851851852d0) & + + in(i+3,j+3) * (0.009259259259259259d0) & + + in(i+4,j+3) * (0.000992063492063492d0) & + + in(i+5,j+3) * (0.0006172839506172839d0) & + + in(i+6,j+3) * (0.00042087542087542086d0) & + + in(i+7,j+3) * (0.00030525030525030525d0) & + + in(i+8,j+3) * (0.0002314814814814815d0) & + + in(i+9,j+3) * (0.00018155410312273057d0) & + + in(i-9,j+4) * (-0.00018155410312273057d0) & + + in(i-8,j+4) * (-0.0002314814814814815d0) & + + in(i-7,j+4) * (-0.00030525030525030525d0) & + + in(i-6,j+4) * (-0.00042087542087542086d0) & + + in(i-5,j+4) * (-0.0006172839506172839d0) & + + in(i-4,j+4) * (-0.000992063492063492d0) & + + in(i-3,j+4) * (-0.001851851851851852d0) & + + in(i-2,j+4) * (-0.004629629629629629d0) & + + in(i-1,j+4) * (-0.027777777777777776d0) & + + in(i+1,j+4) * (0.000992063492063492d0) & + + in(i+2,j+4) * (0.000992063492063492d0) & + + in(i+3,j+4) * (0.000992063492063492d0) & + + in(i+4,j+4) * (0.006944444444444444d0) & + + in(i+5,j+4) * (0.0006172839506172839d0) & + + in(i+6,j+4) * (0.00042087542087542086d0) & + + in(i+7,j+4) * (0.00030525030525030525d0) & + + in(i+8,j+4) * (0.0002314814814814815d0) & + + in(i+9,j+4) * (0.00018155410312273057d0) & + + in(i-9,j+5) * (-0.00018155410312273057d0) & + + 
in(i-8,j+5) * (-0.0002314814814814815d0) & + + in(i-7,j+5) * (-0.00030525030525030525d0) & + + in(i-6,j+5) * (-0.00042087542087542086d0) & + + in(i-5,j+5) * (-0.0006172839506172839d0) & + + in(i-4,j+5) * (-0.000992063492063492d0) & + + in(i-3,j+5) * (-0.001851851851851852d0) & + + in(i-2,j+5) * (-0.004629629629629629d0) & + + in(i-1,j+5) * (-0.027777777777777776d0) & + + in(i+1,j+5) * (0.0006172839506172839d0) & + + in(i+2,j+5) * (0.0006172839506172839d0) & + + in(i+3,j+5) * (0.0006172839506172839d0) & + + in(i+4,j+5) * (0.0006172839506172839d0) & + + in(i+5,j+5) * (0.005555555555555556d0) & + + in(i+6,j+5) * (0.00042087542087542086d0) & + + in(i+7,j+5) * (0.00030525030525030525d0) & + + in(i+8,j+5) * (0.0002314814814814815d0) & + + in(i+9,j+5) * (0.00018155410312273057d0) & + + in(i-9,j+6) * (-0.00018155410312273057d0) & + + in(i-8,j+6) * (-0.0002314814814814815d0) & + + in(i-7,j+6) * (-0.00030525030525030525d0) & + + in(i-6,j+6) * (-0.00042087542087542086d0) & + + in(i-5,j+6) * (-0.0006172839506172839d0) & + + in(i-4,j+6) * (-0.000992063492063492d0) & + + in(i-3,j+6) * (-0.001851851851851852d0) & + + in(i-2,j+6) * (-0.004629629629629629d0) & + + in(i-1,j+6) * (-0.027777777777777776d0) & + + in(i+1,j+6) * (0.00042087542087542086d0) & + + in(i+2,j+6) * (0.00042087542087542086d0) & + + in(i+3,j+6) * (0.00042087542087542086d0) & + + in(i+4,j+6) * (0.00042087542087542086d0) & + + in(i+5,j+6) * (0.00042087542087542086d0) & + + in(i+6,j+6) * (0.004629629629629629d0) & + + in(i+7,j+6) * (0.00030525030525030525d0) & + + in(i+8,j+6) * (0.0002314814814814815d0) & + + in(i+9,j+6) * (0.00018155410312273057d0) & + + in(i-9,j+7) * (-0.00018155410312273057d0) & + + in(i-8,j+7) * (-0.0002314814814814815d0) & + + in(i-7,j+7) * (-0.00030525030525030525d0) & + + in(i-6,j+7) * (-0.00042087542087542086d0) & + + in(i-5,j+7) * (-0.0006172839506172839d0) & + + in(i-4,j+7) * (-0.000992063492063492d0) & + + in(i-3,j+7) * (-0.001851851851851852d0) & + + in(i-2,j+7) * 
(-0.004629629629629629d0) & + + in(i-1,j+7) * (-0.027777777777777776d0) & + + in(i+1,j+7) * (0.00030525030525030525d0) & + + in(i+2,j+7) * (0.00030525030525030525d0) & + + in(i+3,j+7) * (0.00030525030525030525d0) & + + in(i+4,j+7) * (0.00030525030525030525d0) & + + in(i+5,j+7) * (0.00030525030525030525d0) & + + in(i+6,j+7) * (0.00030525030525030525d0) & + + in(i+7,j+7) * (0.003968253968253968d0) & + + in(i+8,j+7) * (0.0002314814814814815d0) & + + in(i+9,j+7) * (0.00018155410312273057d0) & + + in(i-9,j+8) * (-0.00018155410312273057d0) & + + in(i-8,j+8) * (-0.0002314814814814815d0) & + + in(i-7,j+8) * (-0.00030525030525030525d0) & + + in(i-6,j+8) * (-0.00042087542087542086d0) & + + in(i-5,j+8) * (-0.0006172839506172839d0) & + + in(i-4,j+8) * (-0.000992063492063492d0) & + + in(i-3,j+8) * (-0.001851851851851852d0) & + + in(i-2,j+8) * (-0.004629629629629629d0) & + + in(i-1,j+8) * (-0.027777777777777776d0) & + + in(i+1,j+8) * (0.0002314814814814815d0) & + + in(i+2,j+8) * (0.0002314814814814815d0) & + + in(i+3,j+8) * (0.0002314814814814815d0) & + + in(i+4,j+8) * (0.0002314814814814815d0) & + + in(i+5,j+8) * (0.0002314814814814815d0) & + + in(i+6,j+8) * (0.0002314814814814815d0) & + + in(i+7,j+8) * (0.0002314814814814815d0) & + + in(i+8,j+8) * (0.003472222222222222d0) & + + in(i+9,j+8) * (0.00018155410312273057d0) & + + in(i-9,j+9) * (-0.00018155410312273057d0) & + + in(i-8,j+9) * (-0.0002314814814814815d0) & + + in(i-7,j+9) * (-0.00030525030525030525d0) & + + in(i-6,j+9) * (-0.00042087542087542086d0) & + + in(i-5,j+9) * (-0.0006172839506172839d0) & + + in(i-4,j+9) * (-0.000992063492063492d0) & + + in(i-3,j+9) * (-0.001851851851851852d0) & + + in(i-2,j+9) * (-0.004629629629629629d0) & + + in(i-1,j+9) * (-0.027777777777777776d0) & + + in(i+1,j+9) * (0.00018155410312273057d0) & + + in(i+2,j+9) * (0.00018155410312273057d0) & + + in(i+3,j+9) * (0.00018155410312273057d0) & + + in(i+4,j+9) * (0.00018155410312273057d0) & + + in(i+5,j+9) * (0.00018155410312273057d0) & + + 
in(i+6,j+9) * (0.00018155410312273057d0) & + + in(i+7,j+9) * (0.00018155410312273057d0) & + + in(i+8,j+9) * (0.00018155410312273057d0) & + + in(i+9,j+9) * (0.0030864197530864196d0) & +0.0 end do end do diff --git a/FORTRAN/stencil_serial.f90 b/FORTRAN/stencil_serial.f90 index 5e2b50d4e..cb4bf8052 100644 --- a/FORTRAN/stencil_serial.f90 +++ b/FORTRAN/stencil_serial.f90 @@ -8,10 +8,10 @@ subroutine star1(n, in, out) do i=1,n-1-1 do j=1,n-1-1 out(i,j) = out(i,j) & - + in(i+0,j-1) * (-0.5) & - + in(i-1,j+0) * (-0.5) & - + in(i+1,j+0) * (0.5) & - + in(i+0,j+1) * (0.5) & + + in(i+0,j-1) * (-0.5d0) & + + in(i-1,j+0) * (-0.5d0) & + + in(i+1,j+0) * (0.5d0) & + + in(i+0,j+1) * (0.5d0) & +0.0 end do end do @@ -27,14 +27,14 @@ subroutine star2(n, in, out) do i=2,n-2-1 do j=2,n-2-1 out(i,j) = out(i,j) & - + in(i+0,j-2) * (-0.125) & - + in(i+0,j-1) * (-0.25) & - + in(i-2,j+0) * (-0.125) & - + in(i-1,j+0) * (-0.25) & - + in(i+1,j+0) * (0.25) & - + in(i+2,j+0) * (0.125) & - + in(i+0,j+1) * (0.25) & - + in(i+0,j+2) * (0.125) & + + in(i+0,j-2) * (-0.125d0) & + + in(i+0,j-1) * (-0.25d0) & + + in(i-2,j+0) * (-0.125d0) & + + in(i-1,j+0) * (-0.25d0) & + + in(i+1,j+0) * (0.25d0) & + + in(i+2,j+0) * (0.125d0) & + + in(i+0,j+1) * (0.25d0) & + + in(i+0,j+2) * (0.125d0) & +0.0 end do end do @@ -50,18 +50,18 @@ subroutine star3(n, in, out) do i=3,n-3-1 do j=3,n-3-1 out(i,j) = out(i,j) & - + in(i+0,j-3) * (-0.05555555555555555) & - + in(i+0,j-2) * (-0.08333333333333333) & - + in(i+0,j-1) * (-0.16666666666666666) & - + in(i-3,j+0) * (-0.05555555555555555) & - + in(i-2,j+0) * (-0.08333333333333333) & - + in(i-1,j+0) * (-0.16666666666666666) & - + in(i+1,j+0) * (0.16666666666666666) & - + in(i+2,j+0) * (0.08333333333333333) & - + in(i+3,j+0) * (0.05555555555555555) & - + in(i+0,j+1) * (0.16666666666666666) & - + in(i+0,j+2) * (0.08333333333333333) & - + in(i+0,j+3) * (0.05555555555555555) & + + in(i+0,j-3) * (-0.05555555555555555d0) & + + in(i+0,j-2) * (-0.08333333333333333d0) & + + in(i+0,j-1) * 
(-0.16666666666666666d0) & + + in(i-3,j+0) * (-0.05555555555555555d0) & + + in(i-2,j+0) * (-0.08333333333333333d0) & + + in(i-1,j+0) * (-0.16666666666666666d0) & + + in(i+1,j+0) * (0.16666666666666666d0) & + + in(i+2,j+0) * (0.08333333333333333d0) & + + in(i+3,j+0) * (0.05555555555555555d0) & + + in(i+0,j+1) * (0.16666666666666666d0) & + + in(i+0,j+2) * (0.08333333333333333d0) & + + in(i+0,j+3) * (0.05555555555555555d0) & +0.0 end do end do @@ -77,22 +77,22 @@ subroutine star4(n, in, out) do i=4,n-4-1 do j=4,n-4-1 out(i,j) = out(i,j) & - + in(i+0,j-4) * (-0.03125) & - + in(i+0,j-3) * (-0.041666666666666664) & - + in(i+0,j-2) * (-0.0625) & - + in(i+0,j-1) * (-0.125) & - + in(i-4,j+0) * (-0.03125) & - + in(i-3,j+0) * (-0.041666666666666664) & - + in(i-2,j+0) * (-0.0625) & - + in(i-1,j+0) * (-0.125) & - + in(i+1,j+0) * (0.125) & - + in(i+2,j+0) * (0.0625) & - + in(i+3,j+0) * (0.041666666666666664) & - + in(i+4,j+0) * (0.03125) & - + in(i+0,j+1) * (0.125) & - + in(i+0,j+2) * (0.0625) & - + in(i+0,j+3) * (0.041666666666666664) & - + in(i+0,j+4) * (0.03125) & + + in(i+0,j-4) * (-0.03125d0) & + + in(i+0,j-3) * (-0.041666666666666664d0) & + + in(i+0,j-2) * (-0.0625d0) & + + in(i+0,j-1) * (-0.125d0) & + + in(i-4,j+0) * (-0.03125d0) & + + in(i-3,j+0) * (-0.041666666666666664d0) & + + in(i-2,j+0) * (-0.0625d0) & + + in(i-1,j+0) * (-0.125d0) & + + in(i+1,j+0) * (0.125d0) & + + in(i+2,j+0) * (0.0625d0) & + + in(i+3,j+0) * (0.041666666666666664d0) & + + in(i+4,j+0) * (0.03125d0) & + + in(i+0,j+1) * (0.125d0) & + + in(i+0,j+2) * (0.0625d0) & + + in(i+0,j+3) * (0.041666666666666664d0) & + + in(i+0,j+4) * (0.03125d0) & +0.0 end do end do @@ -108,26 +108,26 @@ subroutine star5(n, in, out) do i=5,n-5-1 do j=5,n-5-1 out(i,j) = out(i,j) & - + in(i+0,j-5) * (-0.02) & - + in(i+0,j-4) * (-0.025) & - + in(i+0,j-3) * (-0.03333333333333333) & - + in(i+0,j-2) * (-0.05) & - + in(i+0,j-1) * (-0.1) & - + in(i-5,j+0) * (-0.02) & - + in(i-4,j+0) * (-0.025) & - + in(i-3,j+0) * 
(-0.03333333333333333) & - + in(i-2,j+0) * (-0.05) & - + in(i-1,j+0) * (-0.1) & - + in(i+1,j+0) * (0.1) & - + in(i+2,j+0) * (0.05) & - + in(i+3,j+0) * (0.03333333333333333) & - + in(i+4,j+0) * (0.025) & - + in(i+5,j+0) * (0.02) & - + in(i+0,j+1) * (0.1) & - + in(i+0,j+2) * (0.05) & - + in(i+0,j+3) * (0.03333333333333333) & - + in(i+0,j+4) * (0.025) & - + in(i+0,j+5) * (0.02) & + + in(i+0,j-5) * (-0.02d0) & + + in(i+0,j-4) * (-0.025d0) & + + in(i+0,j-3) * (-0.03333333333333333d0) & + + in(i+0,j-2) * (-0.05d0) & + + in(i+0,j-1) * (-0.1d0) & + + in(i-5,j+0) * (-0.02d0) & + + in(i-4,j+0) * (-0.025d0) & + + in(i-3,j+0) * (-0.03333333333333333d0) & + + in(i-2,j+0) * (-0.05d0) & + + in(i-1,j+0) * (-0.1d0) & + + in(i+1,j+0) * (0.1d0) & + + in(i+2,j+0) * (0.05d0) & + + in(i+3,j+0) * (0.03333333333333333d0) & + + in(i+4,j+0) * (0.025d0) & + + in(i+5,j+0) * (0.02d0) & + + in(i+0,j+1) * (0.1d0) & + + in(i+0,j+2) * (0.05d0) & + + in(i+0,j+3) * (0.03333333333333333d0) & + + in(i+0,j+4) * (0.025d0) & + + in(i+0,j+5) * (0.02d0) & +0.0 end do end do @@ -143,30 +143,30 @@ subroutine star6(n, in, out) do i=6,n-6-1 do j=6,n-6-1 out(i,j) = out(i,j) & - + in(i+0,j-6) * (-0.013888888888888888) & - + in(i+0,j-5) * (-0.016666666666666666) & - + in(i+0,j-4) * (-0.020833333333333332) & - + in(i+0,j-3) * (-0.027777777777777776) & - + in(i+0,j-2) * (-0.041666666666666664) & - + in(i+0,j-1) * (-0.08333333333333333) & - + in(i-6,j+0) * (-0.013888888888888888) & - + in(i-5,j+0) * (-0.016666666666666666) & - + in(i-4,j+0) * (-0.020833333333333332) & - + in(i-3,j+0) * (-0.027777777777777776) & - + in(i-2,j+0) * (-0.041666666666666664) & - + in(i-1,j+0) * (-0.08333333333333333) & - + in(i+1,j+0) * (0.08333333333333333) & - + in(i+2,j+0) * (0.041666666666666664) & - + in(i+3,j+0) * (0.027777777777777776) & - + in(i+4,j+0) * (0.020833333333333332) & - + in(i+5,j+0) * (0.016666666666666666) & - + in(i+6,j+0) * (0.013888888888888888) & - + in(i+0,j+1) * (0.08333333333333333) & - + in(i+0,j+2) * 
(0.041666666666666664) & - + in(i+0,j+3) * (0.027777777777777776) & - + in(i+0,j+4) * (0.020833333333333332) & - + in(i+0,j+5) * (0.016666666666666666) & - + in(i+0,j+6) * (0.013888888888888888) & + + in(i+0,j-6) * (-0.013888888888888888d0) & + + in(i+0,j-5) * (-0.016666666666666666d0) & + + in(i+0,j-4) * (-0.020833333333333332d0) & + + in(i+0,j-3) * (-0.027777777777777776d0) & + + in(i+0,j-2) * (-0.041666666666666664d0) & + + in(i+0,j-1) * (-0.08333333333333333d0) & + + in(i-6,j+0) * (-0.013888888888888888d0) & + + in(i-5,j+0) * (-0.016666666666666666d0) & + + in(i-4,j+0) * (-0.020833333333333332d0) & + + in(i-3,j+0) * (-0.027777777777777776d0) & + + in(i-2,j+0) * (-0.041666666666666664d0) & + + in(i-1,j+0) * (-0.08333333333333333d0) & + + in(i+1,j+0) * (0.08333333333333333d0) & + + in(i+2,j+0) * (0.041666666666666664d0) & + + in(i+3,j+0) * (0.027777777777777776d0) & + + in(i+4,j+0) * (0.020833333333333332d0) & + + in(i+5,j+0) * (0.016666666666666666d0) & + + in(i+6,j+0) * (0.013888888888888888d0) & + + in(i+0,j+1) * (0.08333333333333333d0) & + + in(i+0,j+2) * (0.041666666666666664d0) & + + in(i+0,j+3) * (0.027777777777777776d0) & + + in(i+0,j+4) * (0.020833333333333332d0) & + + in(i+0,j+5) * (0.016666666666666666d0) & + + in(i+0,j+6) * (0.013888888888888888d0) & +0.0 end do end do @@ -182,34 +182,34 @@ subroutine star7(n, in, out) do i=7,n-7-1 do j=7,n-7-1 out(i,j) = out(i,j) & - + in(i+0,j-7) * (-0.01020408163265306) & - + in(i+0,j-6) * (-0.011904761904761904) & - + in(i+0,j-5) * (-0.014285714285714285) & - + in(i+0,j-4) * (-0.017857142857142856) & - + in(i+0,j-3) * (-0.023809523809523808) & - + in(i+0,j-2) * (-0.03571428571428571) & - + in(i+0,j-1) * (-0.07142857142857142) & - + in(i-7,j+0) * (-0.01020408163265306) & - + in(i-6,j+0) * (-0.011904761904761904) & - + in(i-5,j+0) * (-0.014285714285714285) & - + in(i-4,j+0) * (-0.017857142857142856) & - + in(i-3,j+0) * (-0.023809523809523808) & - + in(i-2,j+0) * (-0.03571428571428571) & - + in(i-1,j+0) * 
(-0.07142857142857142) & - + in(i+1,j+0) * (0.07142857142857142) & - + in(i+2,j+0) * (0.03571428571428571) & - + in(i+3,j+0) * (0.023809523809523808) & - + in(i+4,j+0) * (0.017857142857142856) & - + in(i+5,j+0) * (0.014285714285714285) & - + in(i+6,j+0) * (0.011904761904761904) & - + in(i+7,j+0) * (0.01020408163265306) & - + in(i+0,j+1) * (0.07142857142857142) & - + in(i+0,j+2) * (0.03571428571428571) & - + in(i+0,j+3) * (0.023809523809523808) & - + in(i+0,j+4) * (0.017857142857142856) & - + in(i+0,j+5) * (0.014285714285714285) & - + in(i+0,j+6) * (0.011904761904761904) & - + in(i+0,j+7) * (0.01020408163265306) & + + in(i+0,j-7) * (-0.01020408163265306d0) & + + in(i+0,j-6) * (-0.011904761904761904d0) & + + in(i+0,j-5) * (-0.014285714285714285d0) & + + in(i+0,j-4) * (-0.017857142857142856d0) & + + in(i+0,j-3) * (-0.023809523809523808d0) & + + in(i+0,j-2) * (-0.03571428571428571d0) & + + in(i+0,j-1) * (-0.07142857142857142d0) & + + in(i-7,j+0) * (-0.01020408163265306d0) & + + in(i-6,j+0) * (-0.011904761904761904d0) & + + in(i-5,j+0) * (-0.014285714285714285d0) & + + in(i-4,j+0) * (-0.017857142857142856d0) & + + in(i-3,j+0) * (-0.023809523809523808d0) & + + in(i-2,j+0) * (-0.03571428571428571d0) & + + in(i-1,j+0) * (-0.07142857142857142d0) & + + in(i+1,j+0) * (0.07142857142857142d0) & + + in(i+2,j+0) * (0.03571428571428571d0) & + + in(i+3,j+0) * (0.023809523809523808d0) & + + in(i+4,j+0) * (0.017857142857142856d0) & + + in(i+5,j+0) * (0.014285714285714285d0) & + + in(i+6,j+0) * (0.011904761904761904d0) & + + in(i+7,j+0) * (0.01020408163265306d0) & + + in(i+0,j+1) * (0.07142857142857142d0) & + + in(i+0,j+2) * (0.03571428571428571d0) & + + in(i+0,j+3) * (0.023809523809523808d0) & + + in(i+0,j+4) * (0.017857142857142856d0) & + + in(i+0,j+5) * (0.014285714285714285d0) & + + in(i+0,j+6) * (0.011904761904761904d0) & + + in(i+0,j+7) * (0.01020408163265306d0) & +0.0 end do end do @@ -225,38 +225,38 @@ subroutine star8(n, in, out) do i=8,n-8-1 do j=8,n-8-1 out(i,j) = out(i,j) 
& - + in(i+0,j-8) * (-0.0078125) & - + in(i+0,j-7) * (-0.008928571428571428) & - + in(i+0,j-6) * (-0.010416666666666666) & - + in(i+0,j-5) * (-0.0125) & - + in(i+0,j-4) * (-0.015625) & - + in(i+0,j-3) * (-0.020833333333333332) & - + in(i+0,j-2) * (-0.03125) & - + in(i+0,j-1) * (-0.0625) & - + in(i-8,j+0) * (-0.0078125) & - + in(i-7,j+0) * (-0.008928571428571428) & - + in(i-6,j+0) * (-0.010416666666666666) & - + in(i-5,j+0) * (-0.0125) & - + in(i-4,j+0) * (-0.015625) & - + in(i-3,j+0) * (-0.020833333333333332) & - + in(i-2,j+0) * (-0.03125) & - + in(i-1,j+0) * (-0.0625) & - + in(i+1,j+0) * (0.0625) & - + in(i+2,j+0) * (0.03125) & - + in(i+3,j+0) * (0.020833333333333332) & - + in(i+4,j+0) * (0.015625) & - + in(i+5,j+0) * (0.0125) & - + in(i+6,j+0) * (0.010416666666666666) & - + in(i+7,j+0) * (0.008928571428571428) & - + in(i+8,j+0) * (0.0078125) & - + in(i+0,j+1) * (0.0625) & - + in(i+0,j+2) * (0.03125) & - + in(i+0,j+3) * (0.020833333333333332) & - + in(i+0,j+4) * (0.015625) & - + in(i+0,j+5) * (0.0125) & - + in(i+0,j+6) * (0.010416666666666666) & - + in(i+0,j+7) * (0.008928571428571428) & - + in(i+0,j+8) * (0.0078125) & + + in(i+0,j-8) * (-0.0078125d0) & + + in(i+0,j-7) * (-0.008928571428571428d0) & + + in(i+0,j-6) * (-0.010416666666666666d0) & + + in(i+0,j-5) * (-0.0125d0) & + + in(i+0,j-4) * (-0.015625d0) & + + in(i+0,j-3) * (-0.020833333333333332d0) & + + in(i+0,j-2) * (-0.03125d0) & + + in(i+0,j-1) * (-0.0625d0) & + + in(i-8,j+0) * (-0.0078125d0) & + + in(i-7,j+0) * (-0.008928571428571428d0) & + + in(i-6,j+0) * (-0.010416666666666666d0) & + + in(i-5,j+0) * (-0.0125d0) & + + in(i-4,j+0) * (-0.015625d0) & + + in(i-3,j+0) * (-0.020833333333333332d0) & + + in(i-2,j+0) * (-0.03125d0) & + + in(i-1,j+0) * (-0.0625d0) & + + in(i+1,j+0) * (0.0625d0) & + + in(i+2,j+0) * (0.03125d0) & + + in(i+3,j+0) * (0.020833333333333332d0) & + + in(i+4,j+0) * (0.015625d0) & + + in(i+5,j+0) * (0.0125d0) & + + in(i+6,j+0) * (0.010416666666666666d0) & + + in(i+7,j+0) * 
(0.008928571428571428d0) & + + in(i+8,j+0) * (0.0078125d0) & + + in(i+0,j+1) * (0.0625d0) & + + in(i+0,j+2) * (0.03125d0) & + + in(i+0,j+3) * (0.020833333333333332d0) & + + in(i+0,j+4) * (0.015625d0) & + + in(i+0,j+5) * (0.0125d0) & + + in(i+0,j+6) * (0.010416666666666666d0) & + + in(i+0,j+7) * (0.008928571428571428d0) & + + in(i+0,j+8) * (0.0078125d0) & +0.0 end do end do @@ -272,42 +272,42 @@ subroutine star9(n, in, out) do i=9,n-9-1 do j=9,n-9-1 out(i,j) = out(i,j) & - + in(i+0,j-9) * (-0.006172839506172839) & - + in(i+0,j-8) * (-0.006944444444444444) & - + in(i+0,j-7) * (-0.007936507936507936) & - + in(i+0,j-6) * (-0.009259259259259259) & - + in(i+0,j-5) * (-0.011111111111111112) & - + in(i+0,j-4) * (-0.013888888888888888) & - + in(i+0,j-3) * (-0.018518518518518517) & - + in(i+0,j-2) * (-0.027777777777777776) & - + in(i+0,j-1) * (-0.05555555555555555) & - + in(i-9,j+0) * (-0.006172839506172839) & - + in(i-8,j+0) * (-0.006944444444444444) & - + in(i-7,j+0) * (-0.007936507936507936) & - + in(i-6,j+0) * (-0.009259259259259259) & - + in(i-5,j+0) * (-0.011111111111111112) & - + in(i-4,j+0) * (-0.013888888888888888) & - + in(i-3,j+0) * (-0.018518518518518517) & - + in(i-2,j+0) * (-0.027777777777777776) & - + in(i-1,j+0) * (-0.05555555555555555) & - + in(i+1,j+0) * (0.05555555555555555) & - + in(i+2,j+0) * (0.027777777777777776) & - + in(i+3,j+0) * (0.018518518518518517) & - + in(i+4,j+0) * (0.013888888888888888) & - + in(i+5,j+0) * (0.011111111111111112) & - + in(i+6,j+0) * (0.009259259259259259) & - + in(i+7,j+0) * (0.007936507936507936) & - + in(i+8,j+0) * (0.006944444444444444) & - + in(i+9,j+0) * (0.006172839506172839) & - + in(i+0,j+1) * (0.05555555555555555) & - + in(i+0,j+2) * (0.027777777777777776) & - + in(i+0,j+3) * (0.018518518518518517) & - + in(i+0,j+4) * (0.013888888888888888) & - + in(i+0,j+5) * (0.011111111111111112) & - + in(i+0,j+6) * (0.009259259259259259) & - + in(i+0,j+7) * (0.007936507936507936) & - + in(i+0,j+8) * (0.006944444444444444) & - + 
in(i+0,j+9) * (0.006172839506172839) & + + in(i+0,j-9) * (-0.006172839506172839d0) & + + in(i+0,j-8) * (-0.006944444444444444d0) & + + in(i+0,j-7) * (-0.007936507936507936d0) & + + in(i+0,j-6) * (-0.009259259259259259d0) & + + in(i+0,j-5) * (-0.011111111111111112d0) & + + in(i+0,j-4) * (-0.013888888888888888d0) & + + in(i+0,j-3) * (-0.018518518518518517d0) & + + in(i+0,j-2) * (-0.027777777777777776d0) & + + in(i+0,j-1) * (-0.05555555555555555d0) & + + in(i-9,j+0) * (-0.006172839506172839d0) & + + in(i-8,j+0) * (-0.006944444444444444d0) & + + in(i-7,j+0) * (-0.007936507936507936d0) & + + in(i-6,j+0) * (-0.009259259259259259d0) & + + in(i-5,j+0) * (-0.011111111111111112d0) & + + in(i-4,j+0) * (-0.013888888888888888d0) & + + in(i-3,j+0) * (-0.018518518518518517d0) & + + in(i-2,j+0) * (-0.027777777777777776d0) & + + in(i-1,j+0) * (-0.05555555555555555d0) & + + in(i+1,j+0) * (0.05555555555555555d0) & + + in(i+2,j+0) * (0.027777777777777776d0) & + + in(i+3,j+0) * (0.018518518518518517d0) & + + in(i+4,j+0) * (0.013888888888888888d0) & + + in(i+5,j+0) * (0.011111111111111112d0) & + + in(i+6,j+0) * (0.009259259259259259d0) & + + in(i+7,j+0) * (0.007936507936507936d0) & + + in(i+8,j+0) * (0.006944444444444444d0) & + + in(i+9,j+0) * (0.006172839506172839d0) & + + in(i+0,j+1) * (0.05555555555555555d0) & + + in(i+0,j+2) * (0.027777777777777776d0) & + + in(i+0,j+3) * (0.018518518518518517d0) & + + in(i+0,j+4) * (0.013888888888888888d0) & + + in(i+0,j+5) * (0.011111111111111112d0) & + + in(i+0,j+6) * (0.009259259259259259d0) & + + in(i+0,j+7) * (0.007936507936507936d0) & + + in(i+0,j+8) * (0.006944444444444444d0) & + + in(i+0,j+9) * (0.006172839506172839d0) & +0.0 end do end do @@ -323,10 +323,10 @@ subroutine grid1(n, in, out) do i=1,n-1-1 do j=1,n-1-1 out(i,j) = out(i,j) & - + in(i-1,j-1) * (-0.25) & - + in(i+1,j-1) * (-0.25) & - + in(i-1,j+1) * (-0.25) & - + in(i+1,j+1) * (0.25) & + + in(i-1,j-1) * (-0.25d0) & + + in(i+1,j-1) * (-0.25d0) & + + in(i-1,j+1) * (-0.25d0) & + + 
in(i+1,j+1) * (0.25d0) & +0.0 end do end do @@ -342,20 +342,20 @@ subroutine grid2(n, in, out) do i=2,n-2-1 do j=2,n-2-1 out(i,j) = out(i,j) & - + in(i-2,j-2) * (-0.0625) & - + in(i+1,j-2) * (-0.020833333333333332) & - + in(i+2,j-2) * (-0.020833333333333332) & - + in(i-1,j-1) * (-0.125) & - + in(i+1,j-1) * (-0.125) & - + in(i+2,j-1) * (-0.125) & - + in(i-2,j+1) * (-0.020833333333333332) & - + in(i-1,j+1) * (-0.125) & - + in(i+1,j+1) * (0.125) & - + in(i+2,j+1) * (0.020833333333333332) & - + in(i-2,j+2) * (-0.020833333333333332) & - + in(i-1,j+2) * (-0.125) & - + in(i+1,j+2) * (0.020833333333333332) & - + in(i+2,j+2) * (0.0625) & + + in(i-2,j-2) * (-0.0625d0) & + + in(i+1,j-2) * (-0.020833333333333332d0) & + + in(i+2,j-2) * (-0.020833333333333332d0) & + + in(i-1,j-1) * (-0.125d0) & + + in(i+1,j-1) * (-0.125d0) & + + in(i+2,j-1) * (-0.125d0) & + + in(i-2,j+1) * (-0.020833333333333332d0) & + + in(i-1,j+1) * (-0.125d0) & + + in(i+1,j+1) * (0.125d0) & + + in(i+2,j+1) * (0.020833333333333332d0) & + + in(i-2,j+2) * (-0.020833333333333332d0) & + + in(i-1,j+2) * (-0.125d0) & + + in(i+1,j+2) * (0.020833333333333332d0) & + + in(i+2,j+2) * (0.0625d0) & +0.0 end do end do @@ -371,36 +371,36 @@ subroutine grid3(n, in, out) do i=3,n-3-1 do j=3,n-3-1 out(i,j) = out(i,j) & - + in(i-3,j-3) * (-0.027777777777777776) & - + in(i+1,j-3) * (-0.005555555555555556) & - + in(i+2,j-3) * (-0.005555555555555556) & - + in(i+3,j-3) * (-0.005555555555555556) & - + in(i-2,j-2) * (-0.041666666666666664) & - + in(i+1,j-2) * (-0.013888888888888888) & - + in(i+2,j-2) * (-0.013888888888888888) & - + in(i+3,j-2) * (-0.013888888888888888) & - + in(i-1,j-1) * (-0.08333333333333333) & - + in(i+1,j-1) * (-0.08333333333333333) & - + in(i+2,j-1) * (-0.08333333333333333) & - + in(i+3,j-1) * (-0.08333333333333333) & - + in(i-3,j+1) * (-0.005555555555555556) & - + in(i-2,j+1) * (-0.013888888888888888) & - + in(i-1,j+1) * (-0.08333333333333333) & - + in(i+1,j+1) * (0.08333333333333333) & - + in(i+2,j+1) * 
(0.013888888888888888) & - + in(i+3,j+1) * (0.005555555555555556) & - + in(i-3,j+2) * (-0.005555555555555556) & - + in(i-2,j+2) * (-0.013888888888888888) & - + in(i-1,j+2) * (-0.08333333333333333) & - + in(i+1,j+2) * (0.013888888888888888) & - + in(i+2,j+2) * (0.041666666666666664) & - + in(i+3,j+2) * (0.005555555555555556) & - + in(i-3,j+3) * (-0.005555555555555556) & - + in(i-2,j+3) * (-0.013888888888888888) & - + in(i-1,j+3) * (-0.08333333333333333) & - + in(i+1,j+3) * (0.005555555555555556) & - + in(i+2,j+3) * (0.005555555555555556) & - + in(i+3,j+3) * (0.027777777777777776) & + + in(i-3,j-3) * (-0.027777777777777776d0) & + + in(i+1,j-3) * (-0.005555555555555556d0) & + + in(i+2,j-3) * (-0.005555555555555556d0) & + + in(i+3,j-3) * (-0.005555555555555556d0) & + + in(i-2,j-2) * (-0.041666666666666664d0) & + + in(i+1,j-2) * (-0.013888888888888888d0) & + + in(i+2,j-2) * (-0.013888888888888888d0) & + + in(i+3,j-2) * (-0.013888888888888888d0) & + + in(i-1,j-1) * (-0.08333333333333333d0) & + + in(i+1,j-1) * (-0.08333333333333333d0) & + + in(i+2,j-1) * (-0.08333333333333333d0) & + + in(i+3,j-1) * (-0.08333333333333333d0) & + + in(i-3,j+1) * (-0.005555555555555556d0) & + + in(i-2,j+1) * (-0.013888888888888888d0) & + + in(i-1,j+1) * (-0.08333333333333333d0) & + + in(i+1,j+1) * (0.08333333333333333d0) & + + in(i+2,j+1) * (0.013888888888888888d0) & + + in(i+3,j+1) * (0.005555555555555556d0) & + + in(i-3,j+2) * (-0.005555555555555556d0) & + + in(i-2,j+2) * (-0.013888888888888888d0) & + + in(i-1,j+2) * (-0.08333333333333333d0) & + + in(i+1,j+2) * (0.013888888888888888d0) & + + in(i+2,j+2) * (0.041666666666666664d0) & + + in(i+3,j+2) * (0.005555555555555556d0) & + + in(i-3,j+3) * (-0.005555555555555556d0) & + + in(i-2,j+3) * (-0.013888888888888888d0) & + + in(i-1,j+3) * (-0.08333333333333333d0) & + + in(i+1,j+3) * (0.005555555555555556d0) & + + in(i+2,j+3) * (0.005555555555555556d0) & + + in(i+3,j+3) * (0.027777777777777776d0) & +0.0 end do end do @@ -416,58 +416,58 @@ 
subroutine grid4(n, in, out) do i=4,n-4-1 do j=4,n-4-1 out(i,j) = out(i,j) & - + in(i-4,j-4) * (-0.015625) & - + in(i+1,j-4) * (-0.002232142857142857) & - + in(i+2,j-4) * (-0.002232142857142857) & - + in(i+3,j-4) * (-0.002232142857142857) & - + in(i+4,j-4) * (-0.002232142857142857) & - + in(i-3,j-3) * (-0.020833333333333332) & - + in(i+1,j-3) * (-0.004166666666666667) & - + in(i+2,j-3) * (-0.004166666666666667) & - + in(i+3,j-3) * (-0.004166666666666667) & - + in(i+4,j-3) * (-0.004166666666666667) & - + in(i-2,j-2) * (-0.03125) & - + in(i+1,j-2) * (-0.010416666666666666) & - + in(i+2,j-2) * (-0.010416666666666666) & - + in(i+3,j-2) * (-0.010416666666666666) & - + in(i+4,j-2) * (-0.010416666666666666) & - + in(i-1,j-1) * (-0.0625) & - + in(i+1,j-1) * (-0.0625) & - + in(i+2,j-1) * (-0.0625) & - + in(i+3,j-1) * (-0.0625) & - + in(i+4,j-1) * (-0.0625) & - + in(i-4,j+1) * (-0.002232142857142857) & - + in(i-3,j+1) * (-0.004166666666666667) & - + in(i-2,j+1) * (-0.010416666666666666) & - + in(i-1,j+1) * (-0.0625) & - + in(i+1,j+1) * (0.0625) & - + in(i+2,j+1) * (0.010416666666666666) & - + in(i+3,j+1) * (0.004166666666666667) & - + in(i+4,j+1) * (0.002232142857142857) & - + in(i-4,j+2) * (-0.002232142857142857) & - + in(i-3,j+2) * (-0.004166666666666667) & - + in(i-2,j+2) * (-0.010416666666666666) & - + in(i-1,j+2) * (-0.0625) & - + in(i+1,j+2) * (0.010416666666666666) & - + in(i+2,j+2) * (0.03125) & - + in(i+3,j+2) * (0.004166666666666667) & - + in(i+4,j+2) * (0.002232142857142857) & - + in(i-4,j+3) * (-0.002232142857142857) & - + in(i-3,j+3) * (-0.004166666666666667) & - + in(i-2,j+3) * (-0.010416666666666666) & - + in(i-1,j+3) * (-0.0625) & - + in(i+1,j+3) * (0.004166666666666667) & - + in(i+2,j+3) * (0.004166666666666667) & - + in(i+3,j+3) * (0.020833333333333332) & - + in(i+4,j+3) * (0.002232142857142857) & - + in(i-4,j+4) * (-0.002232142857142857) & - + in(i-3,j+4) * (-0.004166666666666667) & - + in(i-2,j+4) * (-0.010416666666666666) & - + in(i-1,j+4) * (-0.0625) & 
- + in(i+1,j+4) * (0.002232142857142857) & - + in(i+2,j+4) * (0.002232142857142857) & - + in(i+3,j+4) * (0.002232142857142857) & - + in(i+4,j+4) * (0.015625) & + + in(i-4,j-4) * (-0.015625d0) & + + in(i+1,j-4) * (-0.002232142857142857d0) & + + in(i+2,j-4) * (-0.002232142857142857d0) & + + in(i+3,j-4) * (-0.002232142857142857d0) & + + in(i+4,j-4) * (-0.002232142857142857d0) & + + in(i-3,j-3) * (-0.020833333333333332d0) & + + in(i+1,j-3) * (-0.004166666666666667d0) & + + in(i+2,j-3) * (-0.004166666666666667d0) & + + in(i+3,j-3) * (-0.004166666666666667d0) & + + in(i+4,j-3) * (-0.004166666666666667d0) & + + in(i-2,j-2) * (-0.03125d0) & + + in(i+1,j-2) * (-0.010416666666666666d0) & + + in(i+2,j-2) * (-0.010416666666666666d0) & + + in(i+3,j-2) * (-0.010416666666666666d0) & + + in(i+4,j-2) * (-0.010416666666666666d0) & + + in(i-1,j-1) * (-0.0625d0) & + + in(i+1,j-1) * (-0.0625d0) & + + in(i+2,j-1) * (-0.0625d0) & + + in(i+3,j-1) * (-0.0625d0) & + + in(i+4,j-1) * (-0.0625d0) & + + in(i-4,j+1) * (-0.002232142857142857d0) & + + in(i-3,j+1) * (-0.004166666666666667d0) & + + in(i-2,j+1) * (-0.010416666666666666d0) & + + in(i-1,j+1) * (-0.0625d0) & + + in(i+1,j+1) * (0.0625d0) & + + in(i+2,j+1) * (0.010416666666666666d0) & + + in(i+3,j+1) * (0.004166666666666667d0) & + + in(i+4,j+1) * (0.002232142857142857d0) & + + in(i-4,j+2) * (-0.002232142857142857d0) & + + in(i-3,j+2) * (-0.004166666666666667d0) & + + in(i-2,j+2) * (-0.010416666666666666d0) & + + in(i-1,j+2) * (-0.0625d0) & + + in(i+1,j+2) * (0.010416666666666666d0) & + + in(i+2,j+2) * (0.03125d0) & + + in(i+3,j+2) * (0.004166666666666667d0) & + + in(i+4,j+2) * (0.002232142857142857d0) & + + in(i-4,j+3) * (-0.002232142857142857d0) & + + in(i-3,j+3) * (-0.004166666666666667d0) & + + in(i-2,j+3) * (-0.010416666666666666d0) & + + in(i-1,j+3) * (-0.0625d0) & + + in(i+1,j+3) * (0.004166666666666667d0) & + + in(i+2,j+3) * (0.004166666666666667d0) & + + in(i+3,j+3) * (0.020833333333333332d0) & + + in(i+4,j+3) * 
(0.002232142857142857d0) & + + in(i-4,j+4) * (-0.002232142857142857d0) & + + in(i-3,j+4) * (-0.004166666666666667d0) & + + in(i-2,j+4) * (-0.010416666666666666d0) & + + in(i-1,j+4) * (-0.0625d0) & + + in(i+1,j+4) * (0.002232142857142857d0) & + + in(i+2,j+4) * (0.002232142857142857d0) & + + in(i+3,j+4) * (0.002232142857142857d0) & + + in(i+4,j+4) * (0.015625d0) & +0.0 end do end do @@ -483,86 +483,86 @@ subroutine grid5(n, in, out) do i=5,n-5-1 do j=5,n-5-1 out(i,j) = out(i,j) & - + in(i-5,j-5) * (-0.01) & - + in(i+1,j-5) * (-0.0011111111111111111) & - + in(i+2,j-5) * (-0.0011111111111111111) & - + in(i+3,j-5) * (-0.0011111111111111111) & - + in(i+4,j-5) * (-0.0011111111111111111) & - + in(i+5,j-5) * (-0.0011111111111111111) & - + in(i-4,j-4) * (-0.0125) & - + in(i+1,j-4) * (-0.0017857142857142857) & - + in(i+2,j-4) * (-0.0017857142857142857) & - + in(i+3,j-4) * (-0.0017857142857142857) & - + in(i+4,j-4) * (-0.0017857142857142857) & - + in(i+5,j-4) * (-0.0017857142857142857) & - + in(i-3,j-3) * (-0.016666666666666666) & - + in(i+1,j-3) * (-0.0033333333333333335) & - + in(i+2,j-3) * (-0.0033333333333333335) & - + in(i+3,j-3) * (-0.0033333333333333335) & - + in(i+4,j-3) * (-0.0033333333333333335) & - + in(i+5,j-3) * (-0.0033333333333333335) & - + in(i-2,j-2) * (-0.025) & - + in(i+1,j-2) * (-0.008333333333333333) & - + in(i+2,j-2) * (-0.008333333333333333) & - + in(i+3,j-2) * (-0.008333333333333333) & - + in(i+4,j-2) * (-0.008333333333333333) & - + in(i+5,j-2) * (-0.008333333333333333) & - + in(i-1,j-1) * (-0.05) & - + in(i+1,j-1) * (-0.05) & - + in(i+2,j-1) * (-0.05) & - + in(i+3,j-1) * (-0.05) & - + in(i+4,j-1) * (-0.05) & - + in(i+5,j-1) * (-0.05) & - + in(i-5,j+1) * (-0.0011111111111111111) & - + in(i-4,j+1) * (-0.0017857142857142857) & - + in(i-3,j+1) * (-0.0033333333333333335) & - + in(i-2,j+1) * (-0.008333333333333333) & - + in(i-1,j+1) * (-0.05) & - + in(i+1,j+1) * (0.05) & - + in(i+2,j+1) * (0.008333333333333333) & - + in(i+3,j+1) * (0.0033333333333333335) & - 
+ in(i+4,j+1) * (0.0017857142857142857) & - + in(i+5,j+1) * (0.0011111111111111111) & - + in(i-5,j+2) * (-0.0011111111111111111) & - + in(i-4,j+2) * (-0.0017857142857142857) & - + in(i-3,j+2) * (-0.0033333333333333335) & - + in(i-2,j+2) * (-0.008333333333333333) & - + in(i-1,j+2) * (-0.05) & - + in(i+1,j+2) * (0.008333333333333333) & - + in(i+2,j+2) * (0.025) & - + in(i+3,j+2) * (0.0033333333333333335) & - + in(i+4,j+2) * (0.0017857142857142857) & - + in(i+5,j+2) * (0.0011111111111111111) & - + in(i-5,j+3) * (-0.0011111111111111111) & - + in(i-4,j+3) * (-0.0017857142857142857) & - + in(i-3,j+3) * (-0.0033333333333333335) & - + in(i-2,j+3) * (-0.008333333333333333) & - + in(i-1,j+3) * (-0.05) & - + in(i+1,j+3) * (0.0033333333333333335) & - + in(i+2,j+3) * (0.0033333333333333335) & - + in(i+3,j+3) * (0.016666666666666666) & - + in(i+4,j+3) * (0.0017857142857142857) & - + in(i+5,j+3) * (0.0011111111111111111) & - + in(i-5,j+4) * (-0.0011111111111111111) & - + in(i-4,j+4) * (-0.0017857142857142857) & - + in(i-3,j+4) * (-0.0033333333333333335) & - + in(i-2,j+4) * (-0.008333333333333333) & - + in(i-1,j+4) * (-0.05) & - + in(i+1,j+4) * (0.0017857142857142857) & - + in(i+2,j+4) * (0.0017857142857142857) & - + in(i+3,j+4) * (0.0017857142857142857) & - + in(i+4,j+4) * (0.0125) & - + in(i+5,j+4) * (0.0011111111111111111) & - + in(i-5,j+5) * (-0.0011111111111111111) & - + in(i-4,j+5) * (-0.0017857142857142857) & - + in(i-3,j+5) * (-0.0033333333333333335) & - + in(i-2,j+5) * (-0.008333333333333333) & - + in(i-1,j+5) * (-0.05) & - + in(i+1,j+5) * (0.0011111111111111111) & - + in(i+2,j+5) * (0.0011111111111111111) & - + in(i+3,j+5) * (0.0011111111111111111) & - + in(i+4,j+5) * (0.0011111111111111111) & - + in(i+5,j+5) * (0.01) & + + in(i-5,j-5) * (-0.01d0) & + + in(i+1,j-5) * (-0.0011111111111111111d0) & + + in(i+2,j-5) * (-0.0011111111111111111d0) & + + in(i+3,j-5) * (-0.0011111111111111111d0) & + + in(i+4,j-5) * (-0.0011111111111111111d0) & + + in(i+5,j-5) * 
(-0.0011111111111111111d0) & + + in(i-4,j-4) * (-0.0125d0) & + + in(i+1,j-4) * (-0.0017857142857142857d0) & + + in(i+2,j-4) * (-0.0017857142857142857d0) & + + in(i+3,j-4) * (-0.0017857142857142857d0) & + + in(i+4,j-4) * (-0.0017857142857142857d0) & + + in(i+5,j-4) * (-0.0017857142857142857d0) & + + in(i-3,j-3) * (-0.016666666666666666d0) & + + in(i+1,j-3) * (-0.0033333333333333335d0) & + + in(i+2,j-3) * (-0.0033333333333333335d0) & + + in(i+3,j-3) * (-0.0033333333333333335d0) & + + in(i+4,j-3) * (-0.0033333333333333335d0) & + + in(i+5,j-3) * (-0.0033333333333333335d0) & + + in(i-2,j-2) * (-0.025d0) & + + in(i+1,j-2) * (-0.008333333333333333d0) & + + in(i+2,j-2) * (-0.008333333333333333d0) & + + in(i+3,j-2) * (-0.008333333333333333d0) & + + in(i+4,j-2) * (-0.008333333333333333d0) & + + in(i+5,j-2) * (-0.008333333333333333d0) & + + in(i-1,j-1) * (-0.05d0) & + + in(i+1,j-1) * (-0.05d0) & + + in(i+2,j-1) * (-0.05d0) & + + in(i+3,j-1) * (-0.05d0) & + + in(i+4,j-1) * (-0.05d0) & + + in(i+5,j-1) * (-0.05d0) & + + in(i-5,j+1) * (-0.0011111111111111111d0) & + + in(i-4,j+1) * (-0.0017857142857142857d0) & + + in(i-3,j+1) * (-0.0033333333333333335d0) & + + in(i-2,j+1) * (-0.008333333333333333d0) & + + in(i-1,j+1) * (-0.05d0) & + + in(i+1,j+1) * (0.05d0) & + + in(i+2,j+1) * (0.008333333333333333d0) & + + in(i+3,j+1) * (0.0033333333333333335d0) & + + in(i+4,j+1) * (0.0017857142857142857d0) & + + in(i+5,j+1) * (0.0011111111111111111d0) & + + in(i-5,j+2) * (-0.0011111111111111111d0) & + + in(i-4,j+2) * (-0.0017857142857142857d0) & + + in(i-3,j+2) * (-0.0033333333333333335d0) & + + in(i-2,j+2) * (-0.008333333333333333d0) & + + in(i-1,j+2) * (-0.05d0) & + + in(i+1,j+2) * (0.008333333333333333d0) & + + in(i+2,j+2) * (0.025d0) & + + in(i+3,j+2) * (0.0033333333333333335d0) & + + in(i+4,j+2) * (0.0017857142857142857d0) & + + in(i+5,j+2) * (0.0011111111111111111d0) & + + in(i-5,j+3) * (-0.0011111111111111111d0) & + + in(i-4,j+3) * (-0.0017857142857142857d0) & + + in(i-3,j+3) * 
(-0.0033333333333333335d0) & + + in(i-2,j+3) * (-0.008333333333333333d0) & + + in(i-1,j+3) * (-0.05d0) & + + in(i+1,j+3) * (0.0033333333333333335d0) & + + in(i+2,j+3) * (0.0033333333333333335d0) & + + in(i+3,j+3) * (0.016666666666666666d0) & + + in(i+4,j+3) * (0.0017857142857142857d0) & + + in(i+5,j+3) * (0.0011111111111111111d0) & + + in(i-5,j+4) * (-0.0011111111111111111d0) & + + in(i-4,j+4) * (-0.0017857142857142857d0) & + + in(i-3,j+4) * (-0.0033333333333333335d0) & + + in(i-2,j+4) * (-0.008333333333333333d0) & + + in(i-1,j+4) * (-0.05d0) & + + in(i+1,j+4) * (0.0017857142857142857d0) & + + in(i+2,j+4) * (0.0017857142857142857d0) & + + in(i+3,j+4) * (0.0017857142857142857d0) & + + in(i+4,j+4) * (0.0125d0) & + + in(i+5,j+4) * (0.0011111111111111111d0) & + + in(i-5,j+5) * (-0.0011111111111111111d0) & + + in(i-4,j+5) * (-0.0017857142857142857d0) & + + in(i-3,j+5) * (-0.0033333333333333335d0) & + + in(i-2,j+5) * (-0.008333333333333333d0) & + + in(i-1,j+5) * (-0.05d0) & + + in(i+1,j+5) * (0.0011111111111111111d0) & + + in(i+2,j+5) * (0.0011111111111111111d0) & + + in(i+3,j+5) * (0.0011111111111111111d0) & + + in(i+4,j+5) * (0.0011111111111111111d0) & + + in(i+5,j+5) * (0.01d0) & +0.0 end do end do @@ -578,120 +578,120 @@ subroutine grid6(n, in, out) do i=6,n-6-1 do j=6,n-6-1 out(i,j) = out(i,j) & - + in(i-6,j-6) * (-0.006944444444444444) & - + in(i+1,j-6) * (-0.0006313131313131314) & - + in(i+2,j-6) * (-0.0006313131313131314) & - + in(i+3,j-6) * (-0.0006313131313131314) & - + in(i+4,j-6) * (-0.0006313131313131314) & - + in(i+5,j-6) * (-0.0006313131313131314) & - + in(i+6,j-6) * (-0.0006313131313131314) & - + in(i-5,j-5) * (-0.008333333333333333) & - + in(i+1,j-5) * (-0.000925925925925926) & - + in(i+2,j-5) * (-0.000925925925925926) & - + in(i+3,j-5) * (-0.000925925925925926) & - + in(i+4,j-5) * (-0.000925925925925926) & - + in(i+5,j-5) * (-0.000925925925925926) & - + in(i+6,j-5) * (-0.000925925925925926) & - + in(i-4,j-4) * (-0.010416666666666666) & - + in(i+1,j-4) * 
(-0.001488095238095238) & - + in(i+2,j-4) * (-0.001488095238095238) & - + in(i+3,j-4) * (-0.001488095238095238) & - + in(i+4,j-4) * (-0.001488095238095238) & - + in(i+5,j-4) * (-0.001488095238095238) & - + in(i+6,j-4) * (-0.001488095238095238) & - + in(i-3,j-3) * (-0.013888888888888888) & - + in(i+1,j-3) * (-0.002777777777777778) & - + in(i+2,j-3) * (-0.002777777777777778) & - + in(i+3,j-3) * (-0.002777777777777778) & - + in(i+4,j-3) * (-0.002777777777777778) & - + in(i+5,j-3) * (-0.002777777777777778) & - + in(i+6,j-3) * (-0.002777777777777778) & - + in(i-2,j-2) * (-0.020833333333333332) & - + in(i+1,j-2) * (-0.006944444444444444) & - + in(i+2,j-2) * (-0.006944444444444444) & - + in(i+3,j-2) * (-0.006944444444444444) & - + in(i+4,j-2) * (-0.006944444444444444) & - + in(i+5,j-2) * (-0.006944444444444444) & - + in(i+6,j-2) * (-0.006944444444444444) & - + in(i-1,j-1) * (-0.041666666666666664) & - + in(i+1,j-1) * (-0.041666666666666664) & - + in(i+2,j-1) * (-0.041666666666666664) & - + in(i+3,j-1) * (-0.041666666666666664) & - + in(i+4,j-1) * (-0.041666666666666664) & - + in(i+5,j-1) * (-0.041666666666666664) & - + in(i+6,j-1) * (-0.041666666666666664) & - + in(i-6,j+1) * (-0.0006313131313131314) & - + in(i-5,j+1) * (-0.000925925925925926) & - + in(i-4,j+1) * (-0.001488095238095238) & - + in(i-3,j+1) * (-0.002777777777777778) & - + in(i-2,j+1) * (-0.006944444444444444) & - + in(i-1,j+1) * (-0.041666666666666664) & - + in(i+1,j+1) * (0.041666666666666664) & - + in(i+2,j+1) * (0.006944444444444444) & - + in(i+3,j+1) * (0.002777777777777778) & - + in(i+4,j+1) * (0.001488095238095238) & - + in(i+5,j+1) * (0.000925925925925926) & - + in(i+6,j+1) * (0.0006313131313131314) & - + in(i-6,j+2) * (-0.0006313131313131314) & - + in(i-5,j+2) * (-0.000925925925925926) & - + in(i-4,j+2) * (-0.001488095238095238) & - + in(i-3,j+2) * (-0.002777777777777778) & - + in(i-2,j+2) * (-0.006944444444444444) & - + in(i-1,j+2) * (-0.041666666666666664) & - + in(i+1,j+2) * (0.006944444444444444) 
& - + in(i+2,j+2) * (0.020833333333333332) & - + in(i+3,j+2) * (0.002777777777777778) & - + in(i+4,j+2) * (0.001488095238095238) & - + in(i+5,j+2) * (0.000925925925925926) & - + in(i+6,j+2) * (0.0006313131313131314) & - + in(i-6,j+3) * (-0.0006313131313131314) & - + in(i-5,j+3) * (-0.000925925925925926) & - + in(i-4,j+3) * (-0.001488095238095238) & - + in(i-3,j+3) * (-0.002777777777777778) & - + in(i-2,j+3) * (-0.006944444444444444) & - + in(i-1,j+3) * (-0.041666666666666664) & - + in(i+1,j+3) * (0.002777777777777778) & - + in(i+2,j+3) * (0.002777777777777778) & - + in(i+3,j+3) * (0.013888888888888888) & - + in(i+4,j+3) * (0.001488095238095238) & - + in(i+5,j+3) * (0.000925925925925926) & - + in(i+6,j+3) * (0.0006313131313131314) & - + in(i-6,j+4) * (-0.0006313131313131314) & - + in(i-5,j+4) * (-0.000925925925925926) & - + in(i-4,j+4) * (-0.001488095238095238) & - + in(i-3,j+4) * (-0.002777777777777778) & - + in(i-2,j+4) * (-0.006944444444444444) & - + in(i-1,j+4) * (-0.041666666666666664) & - + in(i+1,j+4) * (0.001488095238095238) & - + in(i+2,j+4) * (0.001488095238095238) & - + in(i+3,j+4) * (0.001488095238095238) & - + in(i+4,j+4) * (0.010416666666666666) & - + in(i+5,j+4) * (0.000925925925925926) & - + in(i+6,j+4) * (0.0006313131313131314) & - + in(i-6,j+5) * (-0.0006313131313131314) & - + in(i-5,j+5) * (-0.000925925925925926) & - + in(i-4,j+5) * (-0.001488095238095238) & - + in(i-3,j+5) * (-0.002777777777777778) & - + in(i-2,j+5) * (-0.006944444444444444) & - + in(i-1,j+5) * (-0.041666666666666664) & - + in(i+1,j+5) * (0.000925925925925926) & - + in(i+2,j+5) * (0.000925925925925926) & - + in(i+3,j+5) * (0.000925925925925926) & - + in(i+4,j+5) * (0.000925925925925926) & - + in(i+5,j+5) * (0.008333333333333333) & - + in(i+6,j+5) * (0.0006313131313131314) & - + in(i-6,j+6) * (-0.0006313131313131314) & - + in(i-5,j+6) * (-0.000925925925925926) & - + in(i-4,j+6) * (-0.001488095238095238) & - + in(i-3,j+6) * (-0.002777777777777778) & - + in(i-2,j+6) * 
(-0.006944444444444444) & - + in(i-1,j+6) * (-0.041666666666666664) & - + in(i+1,j+6) * (0.0006313131313131314) & - + in(i+2,j+6) * (0.0006313131313131314) & - + in(i+3,j+6) * (0.0006313131313131314) & - + in(i+4,j+6) * (0.0006313131313131314) & - + in(i+5,j+6) * (0.0006313131313131314) & - + in(i+6,j+6) * (0.006944444444444444) & + + in(i-6,j-6) * (-0.006944444444444444d0) & + + in(i+1,j-6) * (-0.0006313131313131314d0) & + + in(i+2,j-6) * (-0.0006313131313131314d0) & + + in(i+3,j-6) * (-0.0006313131313131314d0) & + + in(i+4,j-6) * (-0.0006313131313131314d0) & + + in(i+5,j-6) * (-0.0006313131313131314d0) & + + in(i+6,j-6) * (-0.0006313131313131314d0) & + + in(i-5,j-5) * (-0.008333333333333333d0) & + + in(i+1,j-5) * (-0.000925925925925926d0) & + + in(i+2,j-5) * (-0.000925925925925926d0) & + + in(i+3,j-5) * (-0.000925925925925926d0) & + + in(i+4,j-5) * (-0.000925925925925926d0) & + + in(i+5,j-5) * (-0.000925925925925926d0) & + + in(i+6,j-5) * (-0.000925925925925926d0) & + + in(i-4,j-4) * (-0.010416666666666666d0) & + + in(i+1,j-4) * (-0.001488095238095238d0) & + + in(i+2,j-4) * (-0.001488095238095238d0) & + + in(i+3,j-4) * (-0.001488095238095238d0) & + + in(i+4,j-4) * (-0.001488095238095238d0) & + + in(i+5,j-4) * (-0.001488095238095238d0) & + + in(i+6,j-4) * (-0.001488095238095238d0) & + + in(i-3,j-3) * (-0.013888888888888888d0) & + + in(i+1,j-3) * (-0.002777777777777778d0) & + + in(i+2,j-3) * (-0.002777777777777778d0) & + + in(i+3,j-3) * (-0.002777777777777778d0) & + + in(i+4,j-3) * (-0.002777777777777778d0) & + + in(i+5,j-3) * (-0.002777777777777778d0) & + + in(i+6,j-3) * (-0.002777777777777778d0) & + + in(i-2,j-2) * (-0.020833333333333332d0) & + + in(i+1,j-2) * (-0.006944444444444444d0) & + + in(i+2,j-2) * (-0.006944444444444444d0) & + + in(i+3,j-2) * (-0.006944444444444444d0) & + + in(i+4,j-2) * (-0.006944444444444444d0) & + + in(i+5,j-2) * (-0.006944444444444444d0) & + + in(i+6,j-2) * (-0.006944444444444444d0) & + + in(i-1,j-1) * (-0.041666666666666664d0) & + + 
in(i+1,j-1) * (-0.041666666666666664d0) & + + in(i+2,j-1) * (-0.041666666666666664d0) & + + in(i+3,j-1) * (-0.041666666666666664d0) & + + in(i+4,j-1) * (-0.041666666666666664d0) & + + in(i+5,j-1) * (-0.041666666666666664d0) & + + in(i+6,j-1) * (-0.041666666666666664d0) & + + in(i-6,j+1) * (-0.0006313131313131314d0) & + + in(i-5,j+1) * (-0.000925925925925926d0) & + + in(i-4,j+1) * (-0.001488095238095238d0) & + + in(i-3,j+1) * (-0.002777777777777778d0) & + + in(i-2,j+1) * (-0.006944444444444444d0) & + + in(i-1,j+1) * (-0.041666666666666664d0) & + + in(i+1,j+1) * (0.041666666666666664d0) & + + in(i+2,j+1) * (0.006944444444444444d0) & + + in(i+3,j+1) * (0.002777777777777778d0) & + + in(i+4,j+1) * (0.001488095238095238d0) & + + in(i+5,j+1) * (0.000925925925925926d0) & + + in(i+6,j+1) * (0.0006313131313131314d0) & + + in(i-6,j+2) * (-0.0006313131313131314d0) & + + in(i-5,j+2) * (-0.000925925925925926d0) & + + in(i-4,j+2) * (-0.001488095238095238d0) & + + in(i-3,j+2) * (-0.002777777777777778d0) & + + in(i-2,j+2) * (-0.006944444444444444d0) & + + in(i-1,j+2) * (-0.041666666666666664d0) & + + in(i+1,j+2) * (0.006944444444444444d0) & + + in(i+2,j+2) * (0.020833333333333332d0) & + + in(i+3,j+2) * (0.002777777777777778d0) & + + in(i+4,j+2) * (0.001488095238095238d0) & + + in(i+5,j+2) * (0.000925925925925926d0) & + + in(i+6,j+2) * (0.0006313131313131314d0) & + + in(i-6,j+3) * (-0.0006313131313131314d0) & + + in(i-5,j+3) * (-0.000925925925925926d0) & + + in(i-4,j+3) * (-0.001488095238095238d0) & + + in(i-3,j+3) * (-0.002777777777777778d0) & + + in(i-2,j+3) * (-0.006944444444444444d0) & + + in(i-1,j+3) * (-0.041666666666666664d0) & + + in(i+1,j+3) * (0.002777777777777778d0) & + + in(i+2,j+3) * (0.002777777777777778d0) & + + in(i+3,j+3) * (0.013888888888888888d0) & + + in(i+4,j+3) * (0.001488095238095238d0) & + + in(i+5,j+3) * (0.000925925925925926d0) & + + in(i+6,j+3) * (0.0006313131313131314d0) & + + in(i-6,j+4) * (-0.0006313131313131314d0) & + + in(i-5,j+4) * 
(-0.000925925925925926d0) & + + in(i-4,j+4) * (-0.001488095238095238d0) & + + in(i-3,j+4) * (-0.002777777777777778d0) & + + in(i-2,j+4) * (-0.006944444444444444d0) & + + in(i-1,j+4) * (-0.041666666666666664d0) & + + in(i+1,j+4) * (0.001488095238095238d0) & + + in(i+2,j+4) * (0.001488095238095238d0) & + + in(i+3,j+4) * (0.001488095238095238d0) & + + in(i+4,j+4) * (0.010416666666666666d0) & + + in(i+5,j+4) * (0.000925925925925926d0) & + + in(i+6,j+4) * (0.0006313131313131314d0) & + + in(i-6,j+5) * (-0.0006313131313131314d0) & + + in(i-5,j+5) * (-0.000925925925925926d0) & + + in(i-4,j+5) * (-0.001488095238095238d0) & + + in(i-3,j+5) * (-0.002777777777777778d0) & + + in(i-2,j+5) * (-0.006944444444444444d0) & + + in(i-1,j+5) * (-0.041666666666666664d0) & + + in(i+1,j+5) * (0.000925925925925926d0) & + + in(i+2,j+5) * (0.000925925925925926d0) & + + in(i+3,j+5) * (0.000925925925925926d0) & + + in(i+4,j+5) * (0.000925925925925926d0) & + + in(i+5,j+5) * (0.008333333333333333d0) & + + in(i+6,j+5) * (0.0006313131313131314d0) & + + in(i-6,j+6) * (-0.0006313131313131314d0) & + + in(i-5,j+6) * (-0.000925925925925926d0) & + + in(i-4,j+6) * (-0.001488095238095238d0) & + + in(i-3,j+6) * (-0.002777777777777778d0) & + + in(i-2,j+6) * (-0.006944444444444444d0) & + + in(i-1,j+6) * (-0.041666666666666664d0) & + + in(i+1,j+6) * (0.0006313131313131314d0) & + + in(i+2,j+6) * (0.0006313131313131314d0) & + + in(i+3,j+6) * (0.0006313131313131314d0) & + + in(i+4,j+6) * (0.0006313131313131314d0) & + + in(i+5,j+6) * (0.0006313131313131314d0) & + + in(i+6,j+6) * (0.006944444444444444d0) & +0.0 end do end do @@ -707,160 +707,160 @@ subroutine grid7(n, in, out) do i=7,n-7-1 do j=7,n-7-1 out(i,j) = out(i,j) & - + in(i-7,j-7) * (-0.00510204081632653) & - + in(i+1,j-7) * (-0.0003924646781789639) & - + in(i+2,j-7) * (-0.0003924646781789639) & - + in(i+3,j-7) * (-0.0003924646781789639) & - + in(i+4,j-7) * (-0.0003924646781789639) & - + in(i+5,j-7) * (-0.0003924646781789639) & - + in(i+6,j-7) * 
(-0.0003924646781789639) & - + in(i+7,j-7) * (-0.0003924646781789639) & - + in(i-6,j-6) * (-0.005952380952380952) & - + in(i+1,j-6) * (-0.0005411255411255411) & - + in(i+2,j-6) * (-0.0005411255411255411) & - + in(i+3,j-6) * (-0.0005411255411255411) & - + in(i+4,j-6) * (-0.0005411255411255411) & - + in(i+5,j-6) * (-0.0005411255411255411) & - + in(i+6,j-6) * (-0.0005411255411255411) & - + in(i+7,j-6) * (-0.0005411255411255411) & - + in(i-5,j-5) * (-0.007142857142857143) & - + in(i+1,j-5) * (-0.0007936507936507937) & - + in(i+2,j-5) * (-0.0007936507936507937) & - + in(i+3,j-5) * (-0.0007936507936507937) & - + in(i+4,j-5) * (-0.0007936507936507937) & - + in(i+5,j-5) * (-0.0007936507936507937) & - + in(i+6,j-5) * (-0.0007936507936507937) & - + in(i+7,j-5) * (-0.0007936507936507937) & - + in(i-4,j-4) * (-0.008928571428571428) & - + in(i+1,j-4) * (-0.0012755102040816326) & - + in(i+2,j-4) * (-0.0012755102040816326) & - + in(i+3,j-4) * (-0.0012755102040816326) & - + in(i+4,j-4) * (-0.0012755102040816326) & - + in(i+5,j-4) * (-0.0012755102040816326) & - + in(i+6,j-4) * (-0.0012755102040816326) & - + in(i+7,j-4) * (-0.0012755102040816326) & - + in(i-3,j-3) * (-0.011904761904761904) & - + in(i+1,j-3) * (-0.002380952380952381) & - + in(i+2,j-3) * (-0.002380952380952381) & - + in(i+3,j-3) * (-0.002380952380952381) & - + in(i+4,j-3) * (-0.002380952380952381) & - + in(i+5,j-3) * (-0.002380952380952381) & - + in(i+6,j-3) * (-0.002380952380952381) & - + in(i+7,j-3) * (-0.002380952380952381) & - + in(i-2,j-2) * (-0.017857142857142856) & - + in(i+1,j-2) * (-0.005952380952380952) & - + in(i+2,j-2) * (-0.005952380952380952) & - + in(i+3,j-2) * (-0.005952380952380952) & - + in(i+4,j-2) * (-0.005952380952380952) & - + in(i+5,j-2) * (-0.005952380952380952) & - + in(i+6,j-2) * (-0.005952380952380952) & - + in(i+7,j-2) * (-0.005952380952380952) & - + in(i-1,j-1) * (-0.03571428571428571) & - + in(i+1,j-1) * (-0.03571428571428571) & - + in(i+2,j-1) * (-0.03571428571428571) & - + in(i+3,j-1) * 
(-0.03571428571428571) & - + in(i+4,j-1) * (-0.03571428571428571) & - + in(i+5,j-1) * (-0.03571428571428571) & - + in(i+6,j-1) * (-0.03571428571428571) & - + in(i+7,j-1) * (-0.03571428571428571) & - + in(i-7,j+1) * (-0.0003924646781789639) & - + in(i-6,j+1) * (-0.0005411255411255411) & - + in(i-5,j+1) * (-0.0007936507936507937) & - + in(i-4,j+1) * (-0.0012755102040816326) & - + in(i-3,j+1) * (-0.002380952380952381) & - + in(i-2,j+1) * (-0.005952380952380952) & - + in(i-1,j+1) * (-0.03571428571428571) & - + in(i+1,j+1) * (0.03571428571428571) & - + in(i+2,j+1) * (0.005952380952380952) & - + in(i+3,j+1) * (0.002380952380952381) & - + in(i+4,j+1) * (0.0012755102040816326) & - + in(i+5,j+1) * (0.0007936507936507937) & - + in(i+6,j+1) * (0.0005411255411255411) & - + in(i+7,j+1) * (0.0003924646781789639) & - + in(i-7,j+2) * (-0.0003924646781789639) & - + in(i-6,j+2) * (-0.0005411255411255411) & - + in(i-5,j+2) * (-0.0007936507936507937) & - + in(i-4,j+2) * (-0.0012755102040816326) & - + in(i-3,j+2) * (-0.002380952380952381) & - + in(i-2,j+2) * (-0.005952380952380952) & - + in(i-1,j+2) * (-0.03571428571428571) & - + in(i+1,j+2) * (0.005952380952380952) & - + in(i+2,j+2) * (0.017857142857142856) & - + in(i+3,j+2) * (0.002380952380952381) & - + in(i+4,j+2) * (0.0012755102040816326) & - + in(i+5,j+2) * (0.0007936507936507937) & - + in(i+6,j+2) * (0.0005411255411255411) & - + in(i+7,j+2) * (0.0003924646781789639) & - + in(i-7,j+3) * (-0.0003924646781789639) & - + in(i-6,j+3) * (-0.0005411255411255411) & - + in(i-5,j+3) * (-0.0007936507936507937) & - + in(i-4,j+3) * (-0.0012755102040816326) & - + in(i-3,j+3) * (-0.002380952380952381) & - + in(i-2,j+3) * (-0.005952380952380952) & - + in(i-1,j+3) * (-0.03571428571428571) & - + in(i+1,j+3) * (0.002380952380952381) & - + in(i+2,j+3) * (0.002380952380952381) & - + in(i+3,j+3) * (0.011904761904761904) & - + in(i+4,j+3) * (0.0012755102040816326) & - + in(i+5,j+3) * (0.0007936507936507937) & - + in(i+6,j+3) * (0.0005411255411255411) & 
- + in(i+7,j+3) * (0.0003924646781789639) & - + in(i-7,j+4) * (-0.0003924646781789639) & - + in(i-6,j+4) * (-0.0005411255411255411) & - + in(i-5,j+4) * (-0.0007936507936507937) & - + in(i-4,j+4) * (-0.0012755102040816326) & - + in(i-3,j+4) * (-0.002380952380952381) & - + in(i-2,j+4) * (-0.005952380952380952) & - + in(i-1,j+4) * (-0.03571428571428571) & - + in(i+1,j+4) * (0.0012755102040816326) & - + in(i+2,j+4) * (0.0012755102040816326) & - + in(i+3,j+4) * (0.0012755102040816326) & - + in(i+4,j+4) * (0.008928571428571428) & - + in(i+5,j+4) * (0.0007936507936507937) & - + in(i+6,j+4) * (0.0005411255411255411) & - + in(i+7,j+4) * (0.0003924646781789639) & - + in(i-7,j+5) * (-0.0003924646781789639) & - + in(i-6,j+5) * (-0.0005411255411255411) & - + in(i-5,j+5) * (-0.0007936507936507937) & - + in(i-4,j+5) * (-0.0012755102040816326) & - + in(i-3,j+5) * (-0.002380952380952381) & - + in(i-2,j+5) * (-0.005952380952380952) & - + in(i-1,j+5) * (-0.03571428571428571) & - + in(i+1,j+5) * (0.0007936507936507937) & - + in(i+2,j+5) * (0.0007936507936507937) & - + in(i+3,j+5) * (0.0007936507936507937) & - + in(i+4,j+5) * (0.0007936507936507937) & - + in(i+5,j+5) * (0.007142857142857143) & - + in(i+6,j+5) * (0.0005411255411255411) & - + in(i+7,j+5) * (0.0003924646781789639) & - + in(i-7,j+6) * (-0.0003924646781789639) & - + in(i-6,j+6) * (-0.0005411255411255411) & - + in(i-5,j+6) * (-0.0007936507936507937) & - + in(i-4,j+6) * (-0.0012755102040816326) & - + in(i-3,j+6) * (-0.002380952380952381) & - + in(i-2,j+6) * (-0.005952380952380952) & - + in(i-1,j+6) * (-0.03571428571428571) & - + in(i+1,j+6) * (0.0005411255411255411) & - + in(i+2,j+6) * (0.0005411255411255411) & - + in(i+3,j+6) * (0.0005411255411255411) & - + in(i+4,j+6) * (0.0005411255411255411) & - + in(i+5,j+6) * (0.0005411255411255411) & - + in(i+6,j+6) * (0.005952380952380952) & - + in(i+7,j+6) * (0.0003924646781789639) & - + in(i-7,j+7) * (-0.0003924646781789639) & - + in(i-6,j+7) * (-0.0005411255411255411) & - + 
in(i-5,j+7) * (-0.0007936507936507937) & - + in(i-4,j+7) * (-0.0012755102040816326) & - + in(i-3,j+7) * (-0.002380952380952381) & - + in(i-2,j+7) * (-0.005952380952380952) & - + in(i-1,j+7) * (-0.03571428571428571) & - + in(i+1,j+7) * (0.0003924646781789639) & - + in(i+2,j+7) * (0.0003924646781789639) & - + in(i+3,j+7) * (0.0003924646781789639) & - + in(i+4,j+7) * (0.0003924646781789639) & - + in(i+5,j+7) * (0.0003924646781789639) & - + in(i+6,j+7) * (0.0003924646781789639) & - + in(i+7,j+7) * (0.00510204081632653) & + + in(i-7,j-7) * (-0.00510204081632653d0) & + + in(i+1,j-7) * (-0.0003924646781789639d0) & + + in(i+2,j-7) * (-0.0003924646781789639d0) & + + in(i+3,j-7) * (-0.0003924646781789639d0) & + + in(i+4,j-7) * (-0.0003924646781789639d0) & + + in(i+5,j-7) * (-0.0003924646781789639d0) & + + in(i+6,j-7) * (-0.0003924646781789639d0) & + + in(i+7,j-7) * (-0.0003924646781789639d0) & + + in(i-6,j-6) * (-0.005952380952380952d0) & + + in(i+1,j-6) * (-0.0005411255411255411d0) & + + in(i+2,j-6) * (-0.0005411255411255411d0) & + + in(i+3,j-6) * (-0.0005411255411255411d0) & + + in(i+4,j-6) * (-0.0005411255411255411d0) & + + in(i+5,j-6) * (-0.0005411255411255411d0) & + + in(i+6,j-6) * (-0.0005411255411255411d0) & + + in(i+7,j-6) * (-0.0005411255411255411d0) & + + in(i-5,j-5) * (-0.007142857142857143d0) & + + in(i+1,j-5) * (-0.0007936507936507937d0) & + + in(i+2,j-5) * (-0.0007936507936507937d0) & + + in(i+3,j-5) * (-0.0007936507936507937d0) & + + in(i+4,j-5) * (-0.0007936507936507937d0) & + + in(i+5,j-5) * (-0.0007936507936507937d0) & + + in(i+6,j-5) * (-0.0007936507936507937d0) & + + in(i+7,j-5) * (-0.0007936507936507937d0) & + + in(i-4,j-4) * (-0.008928571428571428d0) & + + in(i+1,j-4) * (-0.0012755102040816326d0) & + + in(i+2,j-4) * (-0.0012755102040816326d0) & + + in(i+3,j-4) * (-0.0012755102040816326d0) & + + in(i+4,j-4) * (-0.0012755102040816326d0) & + + in(i+5,j-4) * (-0.0012755102040816326d0) & + + in(i+6,j-4) * (-0.0012755102040816326d0) & + + in(i+7,j-4) * 
(-0.0012755102040816326d0) & + + in(i-3,j-3) * (-0.011904761904761904d0) & + + in(i+1,j-3) * (-0.002380952380952381d0) & + + in(i+2,j-3) * (-0.002380952380952381d0) & + + in(i+3,j-3) * (-0.002380952380952381d0) & + + in(i+4,j-3) * (-0.002380952380952381d0) & + + in(i+5,j-3) * (-0.002380952380952381d0) & + + in(i+6,j-3) * (-0.002380952380952381d0) & + + in(i+7,j-3) * (-0.002380952380952381d0) & + + in(i-2,j-2) * (-0.017857142857142856d0) & + + in(i+1,j-2) * (-0.005952380952380952d0) & + + in(i+2,j-2) * (-0.005952380952380952d0) & + + in(i+3,j-2) * (-0.005952380952380952d0) & + + in(i+4,j-2) * (-0.005952380952380952d0) & + + in(i+5,j-2) * (-0.005952380952380952d0) & + + in(i+6,j-2) * (-0.005952380952380952d0) & + + in(i+7,j-2) * (-0.005952380952380952d0) & + + in(i-1,j-1) * (-0.03571428571428571d0) & + + in(i+1,j-1) * (-0.03571428571428571d0) & + + in(i+2,j-1) * (-0.03571428571428571d0) & + + in(i+3,j-1) * (-0.03571428571428571d0) & + + in(i+4,j-1) * (-0.03571428571428571d0) & + + in(i+5,j-1) * (-0.03571428571428571d0) & + + in(i+6,j-1) * (-0.03571428571428571d0) & + + in(i+7,j-1) * (-0.03571428571428571d0) & + + in(i-7,j+1) * (-0.0003924646781789639d0) & + + in(i-6,j+1) * (-0.0005411255411255411d0) & + + in(i-5,j+1) * (-0.0007936507936507937d0) & + + in(i-4,j+1) * (-0.0012755102040816326d0) & + + in(i-3,j+1) * (-0.002380952380952381d0) & + + in(i-2,j+1) * (-0.005952380952380952d0) & + + in(i-1,j+1) * (-0.03571428571428571d0) & + + in(i+1,j+1) * (0.03571428571428571d0) & + + in(i+2,j+1) * (0.005952380952380952d0) & + + in(i+3,j+1) * (0.002380952380952381d0) & + + in(i+4,j+1) * (0.0012755102040816326d0) & + + in(i+5,j+1) * (0.0007936507936507937d0) & + + in(i+6,j+1) * (0.0005411255411255411d0) & + + in(i+7,j+1) * (0.0003924646781789639d0) & + + in(i-7,j+2) * (-0.0003924646781789639d0) & + + in(i-6,j+2) * (-0.0005411255411255411d0) & + + in(i-5,j+2) * (-0.0007936507936507937d0) & + + in(i-4,j+2) * (-0.0012755102040816326d0) & + + in(i-3,j+2) * (-0.002380952380952381d0) 
& + + in(i-2,j+2) * (-0.005952380952380952d0) & + + in(i-1,j+2) * (-0.03571428571428571d0) & + + in(i+1,j+2) * (0.005952380952380952d0) & + + in(i+2,j+2) * (0.017857142857142856d0) & + + in(i+3,j+2) * (0.002380952380952381d0) & + + in(i+4,j+2) * (0.0012755102040816326d0) & + + in(i+5,j+2) * (0.0007936507936507937d0) & + + in(i+6,j+2) * (0.0005411255411255411d0) & + + in(i+7,j+2) * (0.0003924646781789639d0) & + + in(i-7,j+3) * (-0.0003924646781789639d0) & + + in(i-6,j+3) * (-0.0005411255411255411d0) & + + in(i-5,j+3) * (-0.0007936507936507937d0) & + + in(i-4,j+3) * (-0.0012755102040816326d0) & + + in(i-3,j+3) * (-0.002380952380952381d0) & + + in(i-2,j+3) * (-0.005952380952380952d0) & + + in(i-1,j+3) * (-0.03571428571428571d0) & + + in(i+1,j+3) * (0.002380952380952381d0) & + + in(i+2,j+3) * (0.002380952380952381d0) & + + in(i+3,j+3) * (0.011904761904761904d0) & + + in(i+4,j+3) * (0.0012755102040816326d0) & + + in(i+5,j+3) * (0.0007936507936507937d0) & + + in(i+6,j+3) * (0.0005411255411255411d0) & + + in(i+7,j+3) * (0.0003924646781789639d0) & + + in(i-7,j+4) * (-0.0003924646781789639d0) & + + in(i-6,j+4) * (-0.0005411255411255411d0) & + + in(i-5,j+4) * (-0.0007936507936507937d0) & + + in(i-4,j+4) * (-0.0012755102040816326d0) & + + in(i-3,j+4) * (-0.002380952380952381d0) & + + in(i-2,j+4) * (-0.005952380952380952d0) & + + in(i-1,j+4) * (-0.03571428571428571d0) & + + in(i+1,j+4) * (0.0012755102040816326d0) & + + in(i+2,j+4) * (0.0012755102040816326d0) & + + in(i+3,j+4) * (0.0012755102040816326d0) & + + in(i+4,j+4) * (0.008928571428571428d0) & + + in(i+5,j+4) * (0.0007936507936507937d0) & + + in(i+6,j+4) * (0.0005411255411255411d0) & + + in(i+7,j+4) * (0.0003924646781789639d0) & + + in(i-7,j+5) * (-0.0003924646781789639d0) & + + in(i-6,j+5) * (-0.0005411255411255411d0) & + + in(i-5,j+5) * (-0.0007936507936507937d0) & + + in(i-4,j+5) * (-0.0012755102040816326d0) & + + in(i-3,j+5) * (-0.002380952380952381d0) & + + in(i-2,j+5) * (-0.005952380952380952d0) & + + in(i-1,j+5) * 
(-0.03571428571428571d0) & + + in(i+1,j+5) * (0.0007936507936507937d0) & + + in(i+2,j+5) * (0.0007936507936507937d0) & + + in(i+3,j+5) * (0.0007936507936507937d0) & + + in(i+4,j+5) * (0.0007936507936507937d0) & + + in(i+5,j+5) * (0.007142857142857143d0) & + + in(i+6,j+5) * (0.0005411255411255411d0) & + + in(i+7,j+5) * (0.0003924646781789639d0) & + + in(i-7,j+6) * (-0.0003924646781789639d0) & + + in(i-6,j+6) * (-0.0005411255411255411d0) & + + in(i-5,j+6) * (-0.0007936507936507937d0) & + + in(i-4,j+6) * (-0.0012755102040816326d0) & + + in(i-3,j+6) * (-0.002380952380952381d0) & + + in(i-2,j+6) * (-0.005952380952380952d0) & + + in(i-1,j+6) * (-0.03571428571428571d0) & + + in(i+1,j+6) * (0.0005411255411255411d0) & + + in(i+2,j+6) * (0.0005411255411255411d0) & + + in(i+3,j+6) * (0.0005411255411255411d0) & + + in(i+4,j+6) * (0.0005411255411255411d0) & + + in(i+5,j+6) * (0.0005411255411255411d0) & + + in(i+6,j+6) * (0.005952380952380952d0) & + + in(i+7,j+6) * (0.0003924646781789639d0) & + + in(i-7,j+7) * (-0.0003924646781789639d0) & + + in(i-6,j+7) * (-0.0005411255411255411d0) & + + in(i-5,j+7) * (-0.0007936507936507937d0) & + + in(i-4,j+7) * (-0.0012755102040816326d0) & + + in(i-3,j+7) * (-0.002380952380952381d0) & + + in(i-2,j+7) * (-0.005952380952380952d0) & + + in(i-1,j+7) * (-0.03571428571428571d0) & + + in(i+1,j+7) * (0.0003924646781789639d0) & + + in(i+2,j+7) * (0.0003924646781789639d0) & + + in(i+3,j+7) * (0.0003924646781789639d0) & + + in(i+4,j+7) * (0.0003924646781789639d0) & + + in(i+5,j+7) * (0.0003924646781789639d0) & + + in(i+6,j+7) * (0.0003924646781789639d0) & + + in(i+7,j+7) * (0.00510204081632653d0) & +0.0 end do end do @@ -876,206 +876,206 @@ subroutine grid8(n, in, out) do i=8,n-8-1 do j=8,n-8-1 out(i,j) = out(i,j) & - + in(i-8,j-8) * (-0.00390625) & - + in(i+1,j-8) * (-0.00026041666666666666) & - + in(i+2,j-8) * (-0.00026041666666666666) & - + in(i+3,j-8) * (-0.00026041666666666666) & - + in(i+4,j-8) * (-0.00026041666666666666) & - + in(i+5,j-8) * 
(-0.00026041666666666666) & - + in(i+6,j-8) * (-0.00026041666666666666) & - + in(i+7,j-8) * (-0.00026041666666666666) & - + in(i+8,j-8) * (-0.00026041666666666666) & - + in(i-7,j-7) * (-0.004464285714285714) & - + in(i+1,j-7) * (-0.00034340659340659343) & - + in(i+2,j-7) * (-0.00034340659340659343) & - + in(i+3,j-7) * (-0.00034340659340659343) & - + in(i+4,j-7) * (-0.00034340659340659343) & - + in(i+5,j-7) * (-0.00034340659340659343) & - + in(i+6,j-7) * (-0.00034340659340659343) & - + in(i+7,j-7) * (-0.00034340659340659343) & - + in(i+8,j-7) * (-0.00034340659340659343) & - + in(i-6,j-6) * (-0.005208333333333333) & - + in(i+1,j-6) * (-0.0004734848484848485) & - + in(i+2,j-6) * (-0.0004734848484848485) & - + in(i+3,j-6) * (-0.0004734848484848485) & - + in(i+4,j-6) * (-0.0004734848484848485) & - + in(i+5,j-6) * (-0.0004734848484848485) & - + in(i+6,j-6) * (-0.0004734848484848485) & - + in(i+7,j-6) * (-0.0004734848484848485) & - + in(i+8,j-6) * (-0.0004734848484848485) & - + in(i-5,j-5) * (-0.00625) & - + in(i+1,j-5) * (-0.0006944444444444445) & - + in(i+2,j-5) * (-0.0006944444444444445) & - + in(i+3,j-5) * (-0.0006944444444444445) & - + in(i+4,j-5) * (-0.0006944444444444445) & - + in(i+5,j-5) * (-0.0006944444444444445) & - + in(i+6,j-5) * (-0.0006944444444444445) & - + in(i+7,j-5) * (-0.0006944444444444445) & - + in(i+8,j-5) * (-0.0006944444444444445) & - + in(i-4,j-4) * (-0.0078125) & - + in(i+1,j-4) * (-0.0011160714285714285) & - + in(i+2,j-4) * (-0.0011160714285714285) & - + in(i+3,j-4) * (-0.0011160714285714285) & - + in(i+4,j-4) * (-0.0011160714285714285) & - + in(i+5,j-4) * (-0.0011160714285714285) & - + in(i+6,j-4) * (-0.0011160714285714285) & - + in(i+7,j-4) * (-0.0011160714285714285) & - + in(i+8,j-4) * (-0.0011160714285714285) & - + in(i-3,j-3) * (-0.010416666666666666) & - + in(i+1,j-3) * (-0.0020833333333333333) & - + in(i+2,j-3) * (-0.0020833333333333333) & - + in(i+3,j-3) * (-0.0020833333333333333) & - + in(i+4,j-3) * (-0.0020833333333333333) & - + 
in(i+5,j-3) * (-0.0020833333333333333) & - + in(i+6,j-3) * (-0.0020833333333333333) & - + in(i+7,j-3) * (-0.0020833333333333333) & - + in(i+8,j-3) * (-0.0020833333333333333) & - + in(i-2,j-2) * (-0.015625) & - + in(i+1,j-2) * (-0.005208333333333333) & - + in(i+2,j-2) * (-0.005208333333333333) & - + in(i+3,j-2) * (-0.005208333333333333) & - + in(i+4,j-2) * (-0.005208333333333333) & - + in(i+5,j-2) * (-0.005208333333333333) & - + in(i+6,j-2) * (-0.005208333333333333) & - + in(i+7,j-2) * (-0.005208333333333333) & - + in(i+8,j-2) * (-0.005208333333333333) & - + in(i-1,j-1) * (-0.03125) & - + in(i+1,j-1) * (-0.03125) & - + in(i+2,j-1) * (-0.03125) & - + in(i+3,j-1) * (-0.03125) & - + in(i+4,j-1) * (-0.03125) & - + in(i+5,j-1) * (-0.03125) & - + in(i+6,j-1) * (-0.03125) & - + in(i+7,j-1) * (-0.03125) & - + in(i+8,j-1) * (-0.03125) & - + in(i-8,j+1) * (-0.00026041666666666666) & - + in(i-7,j+1) * (-0.00034340659340659343) & - + in(i-6,j+1) * (-0.0004734848484848485) & - + in(i-5,j+1) * (-0.0006944444444444445) & - + in(i-4,j+1) * (-0.0011160714285714285) & - + in(i-3,j+1) * (-0.0020833333333333333) & - + in(i-2,j+1) * (-0.005208333333333333) & - + in(i-1,j+1) * (-0.03125) & - + in(i+1,j+1) * (0.03125) & - + in(i+2,j+1) * (0.005208333333333333) & - + in(i+3,j+1) * (0.0020833333333333333) & - + in(i+4,j+1) * (0.0011160714285714285) & - + in(i+5,j+1) * (0.0006944444444444445) & - + in(i+6,j+1) * (0.0004734848484848485) & - + in(i+7,j+1) * (0.00034340659340659343) & - + in(i+8,j+1) * (0.00026041666666666666) & - + in(i-8,j+2) * (-0.00026041666666666666) & - + in(i-7,j+2) * (-0.00034340659340659343) & - + in(i-6,j+2) * (-0.0004734848484848485) & - + in(i-5,j+2) * (-0.0006944444444444445) & - + in(i-4,j+2) * (-0.0011160714285714285) & - + in(i-3,j+2) * (-0.0020833333333333333) & - + in(i-2,j+2) * (-0.005208333333333333) & - + in(i-1,j+2) * (-0.03125) & - + in(i+1,j+2) * (0.005208333333333333) & - + in(i+2,j+2) * (0.015625) & - + in(i+3,j+2) * (0.0020833333333333333) & - + 
in(i+4,j+2) * (0.0011160714285714285) & - + in(i+5,j+2) * (0.0006944444444444445) & - + in(i+6,j+2) * (0.0004734848484848485) & - + in(i+7,j+2) * (0.00034340659340659343) & - + in(i+8,j+2) * (0.00026041666666666666) & - + in(i-8,j+3) * (-0.00026041666666666666) & - + in(i-7,j+3) * (-0.00034340659340659343) & - + in(i-6,j+3) * (-0.0004734848484848485) & - + in(i-5,j+3) * (-0.0006944444444444445) & - + in(i-4,j+3) * (-0.0011160714285714285) & - + in(i-3,j+3) * (-0.0020833333333333333) & - + in(i-2,j+3) * (-0.005208333333333333) & - + in(i-1,j+3) * (-0.03125) & - + in(i+1,j+3) * (0.0020833333333333333) & - + in(i+2,j+3) * (0.0020833333333333333) & - + in(i+3,j+3) * (0.010416666666666666) & - + in(i+4,j+3) * (0.0011160714285714285) & - + in(i+5,j+3) * (0.0006944444444444445) & - + in(i+6,j+3) * (0.0004734848484848485) & - + in(i+7,j+3) * (0.00034340659340659343) & - + in(i+8,j+3) * (0.00026041666666666666) & - + in(i-8,j+4) * (-0.00026041666666666666) & - + in(i-7,j+4) * (-0.00034340659340659343) & - + in(i-6,j+4) * (-0.0004734848484848485) & - + in(i-5,j+4) * (-0.0006944444444444445) & - + in(i-4,j+4) * (-0.0011160714285714285) & - + in(i-3,j+4) * (-0.0020833333333333333) & - + in(i-2,j+4) * (-0.005208333333333333) & - + in(i-1,j+4) * (-0.03125) & - + in(i+1,j+4) * (0.0011160714285714285) & - + in(i+2,j+4) * (0.0011160714285714285) & - + in(i+3,j+4) * (0.0011160714285714285) & - + in(i+4,j+4) * (0.0078125) & - + in(i+5,j+4) * (0.0006944444444444445) & - + in(i+6,j+4) * (0.0004734848484848485) & - + in(i+7,j+4) * (0.00034340659340659343) & - + in(i+8,j+4) * (0.00026041666666666666) & - + in(i-8,j+5) * (-0.00026041666666666666) & - + in(i-7,j+5) * (-0.00034340659340659343) & - + in(i-6,j+5) * (-0.0004734848484848485) & - + in(i-5,j+5) * (-0.0006944444444444445) & - + in(i-4,j+5) * (-0.0011160714285714285) & - + in(i-3,j+5) * (-0.0020833333333333333) & - + in(i-2,j+5) * (-0.005208333333333333) & - + in(i-1,j+5) * (-0.03125) & - + in(i+1,j+5) * (0.0006944444444444445) & - 
+ in(i+2,j+5) * (0.0006944444444444445) & - + in(i+3,j+5) * (0.0006944444444444445) & - + in(i+4,j+5) * (0.0006944444444444445) & - + in(i+5,j+5) * (0.00625) & - + in(i+6,j+5) * (0.0004734848484848485) & - + in(i+7,j+5) * (0.00034340659340659343) & - + in(i+8,j+5) * (0.00026041666666666666) & - + in(i-8,j+6) * (-0.00026041666666666666) & - + in(i-7,j+6) * (-0.00034340659340659343) & - + in(i-6,j+6) * (-0.0004734848484848485) & - + in(i-5,j+6) * (-0.0006944444444444445) & - + in(i-4,j+6) * (-0.0011160714285714285) & - + in(i-3,j+6) * (-0.0020833333333333333) & - + in(i-2,j+6) * (-0.005208333333333333) & - + in(i-1,j+6) * (-0.03125) & - + in(i+1,j+6) * (0.0004734848484848485) & - + in(i+2,j+6) * (0.0004734848484848485) & - + in(i+3,j+6) * (0.0004734848484848485) & - + in(i+4,j+6) * (0.0004734848484848485) & - + in(i+5,j+6) * (0.0004734848484848485) & - + in(i+6,j+6) * (0.005208333333333333) & - + in(i+7,j+6) * (0.00034340659340659343) & - + in(i+8,j+6) * (0.00026041666666666666) & - + in(i-8,j+7) * (-0.00026041666666666666) & - + in(i-7,j+7) * (-0.00034340659340659343) & - + in(i-6,j+7) * (-0.0004734848484848485) & - + in(i-5,j+7) * (-0.0006944444444444445) & - + in(i-4,j+7) * (-0.0011160714285714285) & - + in(i-3,j+7) * (-0.0020833333333333333) & - + in(i-2,j+7) * (-0.005208333333333333) & - + in(i-1,j+7) * (-0.03125) & - + in(i+1,j+7) * (0.00034340659340659343) & - + in(i+2,j+7) * (0.00034340659340659343) & - + in(i+3,j+7) * (0.00034340659340659343) & - + in(i+4,j+7) * (0.00034340659340659343) & - + in(i+5,j+7) * (0.00034340659340659343) & - + in(i+6,j+7) * (0.00034340659340659343) & - + in(i+7,j+7) * (0.004464285714285714) & - + in(i+8,j+7) * (0.00026041666666666666) & - + in(i-8,j+8) * (-0.00026041666666666666) & - + in(i-7,j+8) * (-0.00034340659340659343) & - + in(i-6,j+8) * (-0.0004734848484848485) & - + in(i-5,j+8) * (-0.0006944444444444445) & - + in(i-4,j+8) * (-0.0011160714285714285) & - + in(i-3,j+8) * (-0.0020833333333333333) & - + in(i-2,j+8) * 
(-0.005208333333333333) & - + in(i-1,j+8) * (-0.03125) & - + in(i+1,j+8) * (0.00026041666666666666) & - + in(i+2,j+8) * (0.00026041666666666666) & - + in(i+3,j+8) * (0.00026041666666666666) & - + in(i+4,j+8) * (0.00026041666666666666) & - + in(i+5,j+8) * (0.00026041666666666666) & - + in(i+6,j+8) * (0.00026041666666666666) & - + in(i+7,j+8) * (0.00026041666666666666) & - + in(i+8,j+8) * (0.00390625) & + + in(i-8,j-8) * (-0.00390625d0) & + + in(i+1,j-8) * (-0.00026041666666666666d0) & + + in(i+2,j-8) * (-0.00026041666666666666d0) & + + in(i+3,j-8) * (-0.00026041666666666666d0) & + + in(i+4,j-8) * (-0.00026041666666666666d0) & + + in(i+5,j-8) * (-0.00026041666666666666d0) & + + in(i+6,j-8) * (-0.00026041666666666666d0) & + + in(i+7,j-8) * (-0.00026041666666666666d0) & + + in(i+8,j-8) * (-0.00026041666666666666d0) & + + in(i-7,j-7) * (-0.004464285714285714d0) & + + in(i+1,j-7) * (-0.00034340659340659343d0) & + + in(i+2,j-7) * (-0.00034340659340659343d0) & + + in(i+3,j-7) * (-0.00034340659340659343d0) & + + in(i+4,j-7) * (-0.00034340659340659343d0) & + + in(i+5,j-7) * (-0.00034340659340659343d0) & + + in(i+6,j-7) * (-0.00034340659340659343d0) & + + in(i+7,j-7) * (-0.00034340659340659343d0) & + + in(i+8,j-7) * (-0.00034340659340659343d0) & + + in(i-6,j-6) * (-0.005208333333333333d0) & + + in(i+1,j-6) * (-0.0004734848484848485d0) & + + in(i+2,j-6) * (-0.0004734848484848485d0) & + + in(i+3,j-6) * (-0.0004734848484848485d0) & + + in(i+4,j-6) * (-0.0004734848484848485d0) & + + in(i+5,j-6) * (-0.0004734848484848485d0) & + + in(i+6,j-6) * (-0.0004734848484848485d0) & + + in(i+7,j-6) * (-0.0004734848484848485d0) & + + in(i+8,j-6) * (-0.0004734848484848485d0) & + + in(i-5,j-5) * (-0.00625d0) & + + in(i+1,j-5) * (-0.0006944444444444445d0) & + + in(i+2,j-5) * (-0.0006944444444444445d0) & + + in(i+3,j-5) * (-0.0006944444444444445d0) & + + in(i+4,j-5) * (-0.0006944444444444445d0) & + + in(i+5,j-5) * (-0.0006944444444444445d0) & + + in(i+6,j-5) * (-0.0006944444444444445d0) & + + 
in(i+7,j-5) * (-0.0006944444444444445d0) & + + in(i+8,j-5) * (-0.0006944444444444445d0) & + + in(i-4,j-4) * (-0.0078125d0) & + + in(i+1,j-4) * (-0.0011160714285714285d0) & + + in(i+2,j-4) * (-0.0011160714285714285d0) & + + in(i+3,j-4) * (-0.0011160714285714285d0) & + + in(i+4,j-4) * (-0.0011160714285714285d0) & + + in(i+5,j-4) * (-0.0011160714285714285d0) & + + in(i+6,j-4) * (-0.0011160714285714285d0) & + + in(i+7,j-4) * (-0.0011160714285714285d0) & + + in(i+8,j-4) * (-0.0011160714285714285d0) & + + in(i-3,j-3) * (-0.010416666666666666d0) & + + in(i+1,j-3) * (-0.0020833333333333333d0) & + + in(i+2,j-3) * (-0.0020833333333333333d0) & + + in(i+3,j-3) * (-0.0020833333333333333d0) & + + in(i+4,j-3) * (-0.0020833333333333333d0) & + + in(i+5,j-3) * (-0.0020833333333333333d0) & + + in(i+6,j-3) * (-0.0020833333333333333d0) & + + in(i+7,j-3) * (-0.0020833333333333333d0) & + + in(i+8,j-3) * (-0.0020833333333333333d0) & + + in(i-2,j-2) * (-0.015625d0) & + + in(i+1,j-2) * (-0.005208333333333333d0) & + + in(i+2,j-2) * (-0.005208333333333333d0) & + + in(i+3,j-2) * (-0.005208333333333333d0) & + + in(i+4,j-2) * (-0.005208333333333333d0) & + + in(i+5,j-2) * (-0.005208333333333333d0) & + + in(i+6,j-2) * (-0.005208333333333333d0) & + + in(i+7,j-2) * (-0.005208333333333333d0) & + + in(i+8,j-2) * (-0.005208333333333333d0) & + + in(i-1,j-1) * (-0.03125d0) & + + in(i+1,j-1) * (-0.03125d0) & + + in(i+2,j-1) * (-0.03125d0) & + + in(i+3,j-1) * (-0.03125d0) & + + in(i+4,j-1) * (-0.03125d0) & + + in(i+5,j-1) * (-0.03125d0) & + + in(i+6,j-1) * (-0.03125d0) & + + in(i+7,j-1) * (-0.03125d0) & + + in(i+8,j-1) * (-0.03125d0) & + + in(i-8,j+1) * (-0.00026041666666666666d0) & + + in(i-7,j+1) * (-0.00034340659340659343d0) & + + in(i-6,j+1) * (-0.0004734848484848485d0) & + + in(i-5,j+1) * (-0.0006944444444444445d0) & + + in(i-4,j+1) * (-0.0011160714285714285d0) & + + in(i-3,j+1) * (-0.0020833333333333333d0) & + + in(i-2,j+1) * (-0.005208333333333333d0) & + + in(i-1,j+1) * (-0.03125d0) & + + 
in(i+1,j+1) * (0.03125d0) & + + in(i+2,j+1) * (0.005208333333333333d0) & + + in(i+3,j+1) * (0.0020833333333333333d0) & + + in(i+4,j+1) * (0.0011160714285714285d0) & + + in(i+5,j+1) * (0.0006944444444444445d0) & + + in(i+6,j+1) * (0.0004734848484848485d0) & + + in(i+7,j+1) * (0.00034340659340659343d0) & + + in(i+8,j+1) * (0.00026041666666666666d0) & + + in(i-8,j+2) * (-0.00026041666666666666d0) & + + in(i-7,j+2) * (-0.00034340659340659343d0) & + + in(i-6,j+2) * (-0.0004734848484848485d0) & + + in(i-5,j+2) * (-0.0006944444444444445d0) & + + in(i-4,j+2) * (-0.0011160714285714285d0) & + + in(i-3,j+2) * (-0.0020833333333333333d0) & + + in(i-2,j+2) * (-0.005208333333333333d0) & + + in(i-1,j+2) * (-0.03125d0) & + + in(i+1,j+2) * (0.005208333333333333d0) & + + in(i+2,j+2) * (0.015625d0) & + + in(i+3,j+2) * (0.0020833333333333333d0) & + + in(i+4,j+2) * (0.0011160714285714285d0) & + + in(i+5,j+2) * (0.0006944444444444445d0) & + + in(i+6,j+2) * (0.0004734848484848485d0) & + + in(i+7,j+2) * (0.00034340659340659343d0) & + + in(i+8,j+2) * (0.00026041666666666666d0) & + + in(i-8,j+3) * (-0.00026041666666666666d0) & + + in(i-7,j+3) * (-0.00034340659340659343d0) & + + in(i-6,j+3) * (-0.0004734848484848485d0) & + + in(i-5,j+3) * (-0.0006944444444444445d0) & + + in(i-4,j+3) * (-0.0011160714285714285d0) & + + in(i-3,j+3) * (-0.0020833333333333333d0) & + + in(i-2,j+3) * (-0.005208333333333333d0) & + + in(i-1,j+3) * (-0.03125d0) & + + in(i+1,j+3) * (0.0020833333333333333d0) & + + in(i+2,j+3) * (0.0020833333333333333d0) & + + in(i+3,j+3) * (0.010416666666666666d0) & + + in(i+4,j+3) * (0.0011160714285714285d0) & + + in(i+5,j+3) * (0.0006944444444444445d0) & + + in(i+6,j+3) * (0.0004734848484848485d0) & + + in(i+7,j+3) * (0.00034340659340659343d0) & + + in(i+8,j+3) * (0.00026041666666666666d0) & + + in(i-8,j+4) * (-0.00026041666666666666d0) & + + in(i-7,j+4) * (-0.00034340659340659343d0) & + + in(i-6,j+4) * (-0.0004734848484848485d0) & + + in(i-5,j+4) * (-0.0006944444444444445d0) & + + 
in(i-4,j+4) * (-0.0011160714285714285d0) & + + in(i-3,j+4) * (-0.0020833333333333333d0) & + + in(i-2,j+4) * (-0.005208333333333333d0) & + + in(i-1,j+4) * (-0.03125d0) & + + in(i+1,j+4) * (0.0011160714285714285d0) & + + in(i+2,j+4) * (0.0011160714285714285d0) & + + in(i+3,j+4) * (0.0011160714285714285d0) & + + in(i+4,j+4) * (0.0078125d0) & + + in(i+5,j+4) * (0.0006944444444444445d0) & + + in(i+6,j+4) * (0.0004734848484848485d0) & + + in(i+7,j+4) * (0.00034340659340659343d0) & + + in(i+8,j+4) * (0.00026041666666666666d0) & + + in(i-8,j+5) * (-0.00026041666666666666d0) & + + in(i-7,j+5) * (-0.00034340659340659343d0) & + + in(i-6,j+5) * (-0.0004734848484848485d0) & + + in(i-5,j+5) * (-0.0006944444444444445d0) & + + in(i-4,j+5) * (-0.0011160714285714285d0) & + + in(i-3,j+5) * (-0.0020833333333333333d0) & + + in(i-2,j+5) * (-0.005208333333333333d0) & + + in(i-1,j+5) * (-0.03125d0) & + + in(i+1,j+5) * (0.0006944444444444445d0) & + + in(i+2,j+5) * (0.0006944444444444445d0) & + + in(i+3,j+5) * (0.0006944444444444445d0) & + + in(i+4,j+5) * (0.0006944444444444445d0) & + + in(i+5,j+5) * (0.00625d0) & + + in(i+6,j+5) * (0.0004734848484848485d0) & + + in(i+7,j+5) * (0.00034340659340659343d0) & + + in(i+8,j+5) * (0.00026041666666666666d0) & + + in(i-8,j+6) * (-0.00026041666666666666d0) & + + in(i-7,j+6) * (-0.00034340659340659343d0) & + + in(i-6,j+6) * (-0.0004734848484848485d0) & + + in(i-5,j+6) * (-0.0006944444444444445d0) & + + in(i-4,j+6) * (-0.0011160714285714285d0) & + + in(i-3,j+6) * (-0.0020833333333333333d0) & + + in(i-2,j+6) * (-0.005208333333333333d0) & + + in(i-1,j+6) * (-0.03125d0) & + + in(i+1,j+6) * (0.0004734848484848485d0) & + + in(i+2,j+6) * (0.0004734848484848485d0) & + + in(i+3,j+6) * (0.0004734848484848485d0) & + + in(i+4,j+6) * (0.0004734848484848485d0) & + + in(i+5,j+6) * (0.0004734848484848485d0) & + + in(i+6,j+6) * (0.005208333333333333d0) & + + in(i+7,j+6) * (0.00034340659340659343d0) & + + in(i+8,j+6) * (0.00026041666666666666d0) & + + in(i-8,j+7) * 
(-0.00026041666666666666d0) & + + in(i-7,j+7) * (-0.00034340659340659343d0) & + + in(i-6,j+7) * (-0.0004734848484848485d0) & + + in(i-5,j+7) * (-0.0006944444444444445d0) & + + in(i-4,j+7) * (-0.0011160714285714285d0) & + + in(i-3,j+7) * (-0.0020833333333333333d0) & + + in(i-2,j+7) * (-0.005208333333333333d0) & + + in(i-1,j+7) * (-0.03125d0) & + + in(i+1,j+7) * (0.00034340659340659343d0) & + + in(i+2,j+7) * (0.00034340659340659343d0) & + + in(i+3,j+7) * (0.00034340659340659343d0) & + + in(i+4,j+7) * (0.00034340659340659343d0) & + + in(i+5,j+7) * (0.00034340659340659343d0) & + + in(i+6,j+7) * (0.00034340659340659343d0) & + + in(i+7,j+7) * (0.004464285714285714d0) & + + in(i+8,j+7) * (0.00026041666666666666d0) & + + in(i-8,j+8) * (-0.00026041666666666666d0) & + + in(i-7,j+8) * (-0.00034340659340659343d0) & + + in(i-6,j+8) * (-0.0004734848484848485d0) & + + in(i-5,j+8) * (-0.0006944444444444445d0) & + + in(i-4,j+8) * (-0.0011160714285714285d0) & + + in(i-3,j+8) * (-0.0020833333333333333d0) & + + in(i-2,j+8) * (-0.005208333333333333d0) & + + in(i-1,j+8) * (-0.03125d0) & + + in(i+1,j+8) * (0.00026041666666666666d0) & + + in(i+2,j+8) * (0.00026041666666666666d0) & + + in(i+3,j+8) * (0.00026041666666666666d0) & + + in(i+4,j+8) * (0.00026041666666666666d0) & + + in(i+5,j+8) * (0.00026041666666666666d0) & + + in(i+6,j+8) * (0.00026041666666666666d0) & + + in(i+7,j+8) * (0.00026041666666666666d0) & + + in(i+8,j+8) * (0.00390625d0) & +0.0 end do end do @@ -1091,258 +1091,258 @@ subroutine grid9(n, in, out) do i=9,n-9-1 do j=9,n-9-1 out(i,j) = out(i,j) & - + in(i-9,j-9) * (-0.0030864197530864196) & - + in(i+1,j-9) * (-0.00018155410312273057) & - + in(i+2,j-9) * (-0.00018155410312273057) & - + in(i+3,j-9) * (-0.00018155410312273057) & - + in(i+4,j-9) * (-0.00018155410312273057) & - + in(i+5,j-9) * (-0.00018155410312273057) & - + in(i+6,j-9) * (-0.00018155410312273057) & - + in(i+7,j-9) * (-0.00018155410312273057) & - + in(i+8,j-9) * (-0.00018155410312273057) & - + in(i+9,j-9) * 
(-0.00018155410312273057) & - + in(i-8,j-8) * (-0.003472222222222222) & - + in(i+1,j-8) * (-0.0002314814814814815) & - + in(i+2,j-8) * (-0.0002314814814814815) & - + in(i+3,j-8) * (-0.0002314814814814815) & - + in(i+4,j-8) * (-0.0002314814814814815) & - + in(i+5,j-8) * (-0.0002314814814814815) & - + in(i+6,j-8) * (-0.0002314814814814815) & - + in(i+7,j-8) * (-0.0002314814814814815) & - + in(i+8,j-8) * (-0.0002314814814814815) & - + in(i+9,j-8) * (-0.0002314814814814815) & - + in(i-7,j-7) * (-0.003968253968253968) & - + in(i+1,j-7) * (-0.00030525030525030525) & - + in(i+2,j-7) * (-0.00030525030525030525) & - + in(i+3,j-7) * (-0.00030525030525030525) & - + in(i+4,j-7) * (-0.00030525030525030525) & - + in(i+5,j-7) * (-0.00030525030525030525) & - + in(i+6,j-7) * (-0.00030525030525030525) & - + in(i+7,j-7) * (-0.00030525030525030525) & - + in(i+8,j-7) * (-0.00030525030525030525) & - + in(i+9,j-7) * (-0.00030525030525030525) & - + in(i-6,j-6) * (-0.004629629629629629) & - + in(i+1,j-6) * (-0.00042087542087542086) & - + in(i+2,j-6) * (-0.00042087542087542086) & - + in(i+3,j-6) * (-0.00042087542087542086) & - + in(i+4,j-6) * (-0.00042087542087542086) & - + in(i+5,j-6) * (-0.00042087542087542086) & - + in(i+6,j-6) * (-0.00042087542087542086) & - + in(i+7,j-6) * (-0.00042087542087542086) & - + in(i+8,j-6) * (-0.00042087542087542086) & - + in(i+9,j-6) * (-0.00042087542087542086) & - + in(i-5,j-5) * (-0.005555555555555556) & - + in(i+1,j-5) * (-0.0006172839506172839) & - + in(i+2,j-5) * (-0.0006172839506172839) & - + in(i+3,j-5) * (-0.0006172839506172839) & - + in(i+4,j-5) * (-0.0006172839506172839) & - + in(i+5,j-5) * (-0.0006172839506172839) & - + in(i+6,j-5) * (-0.0006172839506172839) & - + in(i+7,j-5) * (-0.0006172839506172839) & - + in(i+8,j-5) * (-0.0006172839506172839) & - + in(i+9,j-5) * (-0.0006172839506172839) & - + in(i-4,j-4) * (-0.006944444444444444) & - + in(i+1,j-4) * (-0.000992063492063492) & - + in(i+2,j-4) * (-0.000992063492063492) & - + in(i+3,j-4) * 
(-0.000992063492063492) & - + in(i+4,j-4) * (-0.000992063492063492) & - + in(i+5,j-4) * (-0.000992063492063492) & - + in(i+6,j-4) * (-0.000992063492063492) & - + in(i+7,j-4) * (-0.000992063492063492) & - + in(i+8,j-4) * (-0.000992063492063492) & - + in(i+9,j-4) * (-0.000992063492063492) & - + in(i-3,j-3) * (-0.009259259259259259) & - + in(i+1,j-3) * (-0.001851851851851852) & - + in(i+2,j-3) * (-0.001851851851851852) & - + in(i+3,j-3) * (-0.001851851851851852) & - + in(i+4,j-3) * (-0.001851851851851852) & - + in(i+5,j-3) * (-0.001851851851851852) & - + in(i+6,j-3) * (-0.001851851851851852) & - + in(i+7,j-3) * (-0.001851851851851852) & - + in(i+8,j-3) * (-0.001851851851851852) & - + in(i+9,j-3) * (-0.001851851851851852) & - + in(i-2,j-2) * (-0.013888888888888888) & - + in(i+1,j-2) * (-0.004629629629629629) & - + in(i+2,j-2) * (-0.004629629629629629) & - + in(i+3,j-2) * (-0.004629629629629629) & - + in(i+4,j-2) * (-0.004629629629629629) & - + in(i+5,j-2) * (-0.004629629629629629) & - + in(i+6,j-2) * (-0.004629629629629629) & - + in(i+7,j-2) * (-0.004629629629629629) & - + in(i+8,j-2) * (-0.004629629629629629) & - + in(i+9,j-2) * (-0.004629629629629629) & - + in(i-1,j-1) * (-0.027777777777777776) & - + in(i+1,j-1) * (-0.027777777777777776) & - + in(i+2,j-1) * (-0.027777777777777776) & - + in(i+3,j-1) * (-0.027777777777777776) & - + in(i+4,j-1) * (-0.027777777777777776) & - + in(i+5,j-1) * (-0.027777777777777776) & - + in(i+6,j-1) * (-0.027777777777777776) & - + in(i+7,j-1) * (-0.027777777777777776) & - + in(i+8,j-1) * (-0.027777777777777776) & - + in(i+9,j-1) * (-0.027777777777777776) & - + in(i-9,j+1) * (-0.00018155410312273057) & - + in(i-8,j+1) * (-0.0002314814814814815) & - + in(i-7,j+1) * (-0.00030525030525030525) & - + in(i-6,j+1) * (-0.00042087542087542086) & - + in(i-5,j+1) * (-0.0006172839506172839) & - + in(i-4,j+1) * (-0.000992063492063492) & - + in(i-3,j+1) * (-0.001851851851851852) & - + in(i-2,j+1) * (-0.004629629629629629) & - + in(i-1,j+1) * 
(-0.027777777777777776) & - + in(i+1,j+1) * (0.027777777777777776) & - + in(i+2,j+1) * (0.004629629629629629) & - + in(i+3,j+1) * (0.001851851851851852) & - + in(i+4,j+1) * (0.000992063492063492) & - + in(i+5,j+1) * (0.0006172839506172839) & - + in(i+6,j+1) * (0.00042087542087542086) & - + in(i+7,j+1) * (0.00030525030525030525) & - + in(i+8,j+1) * (0.0002314814814814815) & - + in(i+9,j+1) * (0.00018155410312273057) & - + in(i-9,j+2) * (-0.00018155410312273057) & - + in(i-8,j+2) * (-0.0002314814814814815) & - + in(i-7,j+2) * (-0.00030525030525030525) & - + in(i-6,j+2) * (-0.00042087542087542086) & - + in(i-5,j+2) * (-0.0006172839506172839) & - + in(i-4,j+2) * (-0.000992063492063492) & - + in(i-3,j+2) * (-0.001851851851851852) & - + in(i-2,j+2) * (-0.004629629629629629) & - + in(i-1,j+2) * (-0.027777777777777776) & - + in(i+1,j+2) * (0.004629629629629629) & - + in(i+2,j+2) * (0.013888888888888888) & - + in(i+3,j+2) * (0.001851851851851852) & - + in(i+4,j+2) * (0.000992063492063492) & - + in(i+5,j+2) * (0.0006172839506172839) & - + in(i+6,j+2) * (0.00042087542087542086) & - + in(i+7,j+2) * (0.00030525030525030525) & - + in(i+8,j+2) * (0.0002314814814814815) & - + in(i+9,j+2) * (0.00018155410312273057) & - + in(i-9,j+3) * (-0.00018155410312273057) & - + in(i-8,j+3) * (-0.0002314814814814815) & - + in(i-7,j+3) * (-0.00030525030525030525) & - + in(i-6,j+3) * (-0.00042087542087542086) & - + in(i-5,j+3) * (-0.0006172839506172839) & - + in(i-4,j+3) * (-0.000992063492063492) & - + in(i-3,j+3) * (-0.001851851851851852) & - + in(i-2,j+3) * (-0.004629629629629629) & - + in(i-1,j+3) * (-0.027777777777777776) & - + in(i+1,j+3) * (0.001851851851851852) & - + in(i+2,j+3) * (0.001851851851851852) & - + in(i+3,j+3) * (0.009259259259259259) & - + in(i+4,j+3) * (0.000992063492063492) & - + in(i+5,j+3) * (0.0006172839506172839) & - + in(i+6,j+3) * (0.00042087542087542086) & - + in(i+7,j+3) * (0.00030525030525030525) & - + in(i+8,j+3) * (0.0002314814814814815) & - + in(i+9,j+3) * 
(0.00018155410312273057) & - + in(i-9,j+4) * (-0.00018155410312273057) & - + in(i-8,j+4) * (-0.0002314814814814815) & - + in(i-7,j+4) * (-0.00030525030525030525) & - + in(i-6,j+4) * (-0.00042087542087542086) & - + in(i-5,j+4) * (-0.0006172839506172839) & - + in(i-4,j+4) * (-0.000992063492063492) & - + in(i-3,j+4) * (-0.001851851851851852) & - + in(i-2,j+4) * (-0.004629629629629629) & - + in(i-1,j+4) * (-0.027777777777777776) & - + in(i+1,j+4) * (0.000992063492063492) & - + in(i+2,j+4) * (0.000992063492063492) & - + in(i+3,j+4) * (0.000992063492063492) & - + in(i+4,j+4) * (0.006944444444444444) & - + in(i+5,j+4) * (0.0006172839506172839) & - + in(i+6,j+4) * (0.00042087542087542086) & - + in(i+7,j+4) * (0.00030525030525030525) & - + in(i+8,j+4) * (0.0002314814814814815) & - + in(i+9,j+4) * (0.00018155410312273057) & - + in(i-9,j+5) * (-0.00018155410312273057) & - + in(i-8,j+5) * (-0.0002314814814814815) & - + in(i-7,j+5) * (-0.00030525030525030525) & - + in(i-6,j+5) * (-0.00042087542087542086) & - + in(i-5,j+5) * (-0.0006172839506172839) & - + in(i-4,j+5) * (-0.000992063492063492) & - + in(i-3,j+5) * (-0.001851851851851852) & - + in(i-2,j+5) * (-0.004629629629629629) & - + in(i-1,j+5) * (-0.027777777777777776) & - + in(i+1,j+5) * (0.0006172839506172839) & - + in(i+2,j+5) * (0.0006172839506172839) & - + in(i+3,j+5) * (0.0006172839506172839) & - + in(i+4,j+5) * (0.0006172839506172839) & - + in(i+5,j+5) * (0.005555555555555556) & - + in(i+6,j+5) * (0.00042087542087542086) & - + in(i+7,j+5) * (0.00030525030525030525) & - + in(i+8,j+5) * (0.0002314814814814815) & - + in(i+9,j+5) * (0.00018155410312273057) & - + in(i-9,j+6) * (-0.00018155410312273057) & - + in(i-8,j+6) * (-0.0002314814814814815) & - + in(i-7,j+6) * (-0.00030525030525030525) & - + in(i-6,j+6) * (-0.00042087542087542086) & - + in(i-5,j+6) * (-0.0006172839506172839) & - + in(i-4,j+6) * (-0.000992063492063492) & - + in(i-3,j+6) * (-0.001851851851851852) & - + in(i-2,j+6) * (-0.004629629629629629) & - + 
in(i-1,j+6) * (-0.027777777777777776) & - + in(i+1,j+6) * (0.00042087542087542086) & - + in(i+2,j+6) * (0.00042087542087542086) & - + in(i+3,j+6) * (0.00042087542087542086) & - + in(i+4,j+6) * (0.00042087542087542086) & - + in(i+5,j+6) * (0.00042087542087542086) & - + in(i+6,j+6) * (0.004629629629629629) & - + in(i+7,j+6) * (0.00030525030525030525) & - + in(i+8,j+6) * (0.0002314814814814815) & - + in(i+9,j+6) * (0.00018155410312273057) & - + in(i-9,j+7) * (-0.00018155410312273057) & - + in(i-8,j+7) * (-0.0002314814814814815) & - + in(i-7,j+7) * (-0.00030525030525030525) & - + in(i-6,j+7) * (-0.00042087542087542086) & - + in(i-5,j+7) * (-0.0006172839506172839) & - + in(i-4,j+7) * (-0.000992063492063492) & - + in(i-3,j+7) * (-0.001851851851851852) & - + in(i-2,j+7) * (-0.004629629629629629) & - + in(i-1,j+7) * (-0.027777777777777776) & - + in(i+1,j+7) * (0.00030525030525030525) & - + in(i+2,j+7) * (0.00030525030525030525) & - + in(i+3,j+7) * (0.00030525030525030525) & - + in(i+4,j+7) * (0.00030525030525030525) & - + in(i+5,j+7) * (0.00030525030525030525) & - + in(i+6,j+7) * (0.00030525030525030525) & - + in(i+7,j+7) * (0.003968253968253968) & - + in(i+8,j+7) * (0.0002314814814814815) & - + in(i+9,j+7) * (0.00018155410312273057) & - + in(i-9,j+8) * (-0.00018155410312273057) & - + in(i-8,j+8) * (-0.0002314814814814815) & - + in(i-7,j+8) * (-0.00030525030525030525) & - + in(i-6,j+8) * (-0.00042087542087542086) & - + in(i-5,j+8) * (-0.0006172839506172839) & - + in(i-4,j+8) * (-0.000992063492063492) & - + in(i-3,j+8) * (-0.001851851851851852) & - + in(i-2,j+8) * (-0.004629629629629629) & - + in(i-1,j+8) * (-0.027777777777777776) & - + in(i+1,j+8) * (0.0002314814814814815) & - + in(i+2,j+8) * (0.0002314814814814815) & - + in(i+3,j+8) * (0.0002314814814814815) & - + in(i+4,j+8) * (0.0002314814814814815) & - + in(i+5,j+8) * (0.0002314814814814815) & - + in(i+6,j+8) * (0.0002314814814814815) & - + in(i+7,j+8) * (0.0002314814814814815) & - + in(i+8,j+8) * 
(0.003472222222222222) & - + in(i+9,j+8) * (0.00018155410312273057) & - + in(i-9,j+9) * (-0.00018155410312273057) & - + in(i-8,j+9) * (-0.0002314814814814815) & - + in(i-7,j+9) * (-0.00030525030525030525) & - + in(i-6,j+9) * (-0.00042087542087542086) & - + in(i-5,j+9) * (-0.0006172839506172839) & - + in(i-4,j+9) * (-0.000992063492063492) & - + in(i-3,j+9) * (-0.001851851851851852) & - + in(i-2,j+9) * (-0.004629629629629629) & - + in(i-1,j+9) * (-0.027777777777777776) & - + in(i+1,j+9) * (0.00018155410312273057) & - + in(i+2,j+9) * (0.00018155410312273057) & - + in(i+3,j+9) * (0.00018155410312273057) & - + in(i+4,j+9) * (0.00018155410312273057) & - + in(i+5,j+9) * (0.00018155410312273057) & - + in(i+6,j+9) * (0.00018155410312273057) & - + in(i+7,j+9) * (0.00018155410312273057) & - + in(i+8,j+9) * (0.00018155410312273057) & - + in(i+9,j+9) * (0.0030864197530864196) & + + in(i-9,j-9) * (-0.0030864197530864196d0) & + + in(i+1,j-9) * (-0.00018155410312273057d0) & + + in(i+2,j-9) * (-0.00018155410312273057d0) & + + in(i+3,j-9) * (-0.00018155410312273057d0) & + + in(i+4,j-9) * (-0.00018155410312273057d0) & + + in(i+5,j-9) * (-0.00018155410312273057d0) & + + in(i+6,j-9) * (-0.00018155410312273057d0) & + + in(i+7,j-9) * (-0.00018155410312273057d0) & + + in(i+8,j-9) * (-0.00018155410312273057d0) & + + in(i+9,j-9) * (-0.00018155410312273057d0) & + + in(i-8,j-8) * (-0.003472222222222222d0) & + + in(i+1,j-8) * (-0.0002314814814814815d0) & + + in(i+2,j-8) * (-0.0002314814814814815d0) & + + in(i+3,j-8) * (-0.0002314814814814815d0) & + + in(i+4,j-8) * (-0.0002314814814814815d0) & + + in(i+5,j-8) * (-0.0002314814814814815d0) & + + in(i+6,j-8) * (-0.0002314814814814815d0) & + + in(i+7,j-8) * (-0.0002314814814814815d0) & + + in(i+8,j-8) * (-0.0002314814814814815d0) & + + in(i+9,j-8) * (-0.0002314814814814815d0) & + + in(i-7,j-7) * (-0.003968253968253968d0) & + + in(i+1,j-7) * (-0.00030525030525030525d0) & + + in(i+2,j-7) * (-0.00030525030525030525d0) & + + in(i+3,j-7) * 
(-0.00030525030525030525d0) & + + in(i+4,j-7) * (-0.00030525030525030525d0) & + + in(i+5,j-7) * (-0.00030525030525030525d0) & + + in(i+6,j-7) * (-0.00030525030525030525d0) & + + in(i+7,j-7) * (-0.00030525030525030525d0) & + + in(i+8,j-7) * (-0.00030525030525030525d0) & + + in(i+9,j-7) * (-0.00030525030525030525d0) & + + in(i-6,j-6) * (-0.004629629629629629d0) & + + in(i+1,j-6) * (-0.00042087542087542086d0) & + + in(i+2,j-6) * (-0.00042087542087542086d0) & + + in(i+3,j-6) * (-0.00042087542087542086d0) & + + in(i+4,j-6) * (-0.00042087542087542086d0) & + + in(i+5,j-6) * (-0.00042087542087542086d0) & + + in(i+6,j-6) * (-0.00042087542087542086d0) & + + in(i+7,j-6) * (-0.00042087542087542086d0) & + + in(i+8,j-6) * (-0.00042087542087542086d0) & + + in(i+9,j-6) * (-0.00042087542087542086d0) & + + in(i-5,j-5) * (-0.005555555555555556d0) & + + in(i+1,j-5) * (-0.0006172839506172839d0) & + + in(i+2,j-5) * (-0.0006172839506172839d0) & + + in(i+3,j-5) * (-0.0006172839506172839d0) & + + in(i+4,j-5) * (-0.0006172839506172839d0) & + + in(i+5,j-5) * (-0.0006172839506172839d0) & + + in(i+6,j-5) * (-0.0006172839506172839d0) & + + in(i+7,j-5) * (-0.0006172839506172839d0) & + + in(i+8,j-5) * (-0.0006172839506172839d0) & + + in(i+9,j-5) * (-0.0006172839506172839d0) & + + in(i-4,j-4) * (-0.006944444444444444d0) & + + in(i+1,j-4) * (-0.000992063492063492d0) & + + in(i+2,j-4) * (-0.000992063492063492d0) & + + in(i+3,j-4) * (-0.000992063492063492d0) & + + in(i+4,j-4) * (-0.000992063492063492d0) & + + in(i+5,j-4) * (-0.000992063492063492d0) & + + in(i+6,j-4) * (-0.000992063492063492d0) & + + in(i+7,j-4) * (-0.000992063492063492d0) & + + in(i+8,j-4) * (-0.000992063492063492d0) & + + in(i+9,j-4) * (-0.000992063492063492d0) & + + in(i-3,j-3) * (-0.009259259259259259d0) & + + in(i+1,j-3) * (-0.001851851851851852d0) & + + in(i+2,j-3) * (-0.001851851851851852d0) & + + in(i+3,j-3) * (-0.001851851851851852d0) & + + in(i+4,j-3) * (-0.001851851851851852d0) & + + in(i+5,j-3) * (-0.001851851851851852d0) 
& + + in(i+6,j-3) * (-0.001851851851851852d0) & + + in(i+7,j-3) * (-0.001851851851851852d0) & + + in(i+8,j-3) * (-0.001851851851851852d0) & + + in(i+9,j-3) * (-0.001851851851851852d0) & + + in(i-2,j-2) * (-0.013888888888888888d0) & + + in(i+1,j-2) * (-0.004629629629629629d0) & + + in(i+2,j-2) * (-0.004629629629629629d0) & + + in(i+3,j-2) * (-0.004629629629629629d0) & + + in(i+4,j-2) * (-0.004629629629629629d0) & + + in(i+5,j-2) * (-0.004629629629629629d0) & + + in(i+6,j-2) * (-0.004629629629629629d0) & + + in(i+7,j-2) * (-0.004629629629629629d0) & + + in(i+8,j-2) * (-0.004629629629629629d0) & + + in(i+9,j-2) * (-0.004629629629629629d0) & + + in(i-1,j-1) * (-0.027777777777777776d0) & + + in(i+1,j-1) * (-0.027777777777777776d0) & + + in(i+2,j-1) * (-0.027777777777777776d0) & + + in(i+3,j-1) * (-0.027777777777777776d0) & + + in(i+4,j-1) * (-0.027777777777777776d0) & + + in(i+5,j-1) * (-0.027777777777777776d0) & + + in(i+6,j-1) * (-0.027777777777777776d0) & + + in(i+7,j-1) * (-0.027777777777777776d0) & + + in(i+8,j-1) * (-0.027777777777777776d0) & + + in(i+9,j-1) * (-0.027777777777777776d0) & + + in(i-9,j+1) * (-0.00018155410312273057d0) & + + in(i-8,j+1) * (-0.0002314814814814815d0) & + + in(i-7,j+1) * (-0.00030525030525030525d0) & + + in(i-6,j+1) * (-0.00042087542087542086d0) & + + in(i-5,j+1) * (-0.0006172839506172839d0) & + + in(i-4,j+1) * (-0.000992063492063492d0) & + + in(i-3,j+1) * (-0.001851851851851852d0) & + + in(i-2,j+1) * (-0.004629629629629629d0) & + + in(i-1,j+1) * (-0.027777777777777776d0) & + + in(i+1,j+1) * (0.027777777777777776d0) & + + in(i+2,j+1) * (0.004629629629629629d0) & + + in(i+3,j+1) * (0.001851851851851852d0) & + + in(i+4,j+1) * (0.000992063492063492d0) & + + in(i+5,j+1) * (0.0006172839506172839d0) & + + in(i+6,j+1) * (0.00042087542087542086d0) & + + in(i+7,j+1) * (0.00030525030525030525d0) & + + in(i+8,j+1) * (0.0002314814814814815d0) & + + in(i+9,j+1) * (0.00018155410312273057d0) & + + in(i-9,j+2) * (-0.00018155410312273057d0) & + + 
in(i-8,j+2) * (-0.0002314814814814815d0) & + + in(i-7,j+2) * (-0.00030525030525030525d0) & + + in(i-6,j+2) * (-0.00042087542087542086d0) & + + in(i-5,j+2) * (-0.0006172839506172839d0) & + + in(i-4,j+2) * (-0.000992063492063492d0) & + + in(i-3,j+2) * (-0.001851851851851852d0) & + + in(i-2,j+2) * (-0.004629629629629629d0) & + + in(i-1,j+2) * (-0.027777777777777776d0) & + + in(i+1,j+2) * (0.004629629629629629d0) & + + in(i+2,j+2) * (0.013888888888888888d0) & + + in(i+3,j+2) * (0.001851851851851852d0) & + + in(i+4,j+2) * (0.000992063492063492d0) & + + in(i+5,j+2) * (0.0006172839506172839d0) & + + in(i+6,j+2) * (0.00042087542087542086d0) & + + in(i+7,j+2) * (0.00030525030525030525d0) & + + in(i+8,j+2) * (0.0002314814814814815d0) & + + in(i+9,j+2) * (0.00018155410312273057d0) & + + in(i-9,j+3) * (-0.00018155410312273057d0) & + + in(i-8,j+3) * (-0.0002314814814814815d0) & + + in(i-7,j+3) * (-0.00030525030525030525d0) & + + in(i-6,j+3) * (-0.00042087542087542086d0) & + + in(i-5,j+3) * (-0.0006172839506172839d0) & + + in(i-4,j+3) * (-0.000992063492063492d0) & + + in(i-3,j+3) * (-0.001851851851851852d0) & + + in(i-2,j+3) * (-0.004629629629629629d0) & + + in(i-1,j+3) * (-0.027777777777777776d0) & + + in(i+1,j+3) * (0.001851851851851852d0) & + + in(i+2,j+3) * (0.001851851851851852d0) & + + in(i+3,j+3) * (0.009259259259259259d0) & + + in(i+4,j+3) * (0.000992063492063492d0) & + + in(i+5,j+3) * (0.0006172839506172839d0) & + + in(i+6,j+3) * (0.00042087542087542086d0) & + + in(i+7,j+3) * (0.00030525030525030525d0) & + + in(i+8,j+3) * (0.0002314814814814815d0) & + + in(i+9,j+3) * (0.00018155410312273057d0) & + + in(i-9,j+4) * (-0.00018155410312273057d0) & + + in(i-8,j+4) * (-0.0002314814814814815d0) & + + in(i-7,j+4) * (-0.00030525030525030525d0) & + + in(i-6,j+4) * (-0.00042087542087542086d0) & + + in(i-5,j+4) * (-0.0006172839506172839d0) & + + in(i-4,j+4) * (-0.000992063492063492d0) & + + in(i-3,j+4) * (-0.001851851851851852d0) & + + in(i-2,j+4) * (-0.004629629629629629d0) & + + 
in(i-1,j+4) * (-0.027777777777777776d0) & + + in(i+1,j+4) * (0.000992063492063492d0) & + + in(i+2,j+4) * (0.000992063492063492d0) & + + in(i+3,j+4) * (0.000992063492063492d0) & + + in(i+4,j+4) * (0.006944444444444444d0) & + + in(i+5,j+4) * (0.0006172839506172839d0) & + + in(i+6,j+4) * (0.00042087542087542086d0) & + + in(i+7,j+4) * (0.00030525030525030525d0) & + + in(i+8,j+4) * (0.0002314814814814815d0) & + + in(i+9,j+4) * (0.00018155410312273057d0) & + + in(i-9,j+5) * (-0.00018155410312273057d0) & + + in(i-8,j+5) * (-0.0002314814814814815d0) & + + in(i-7,j+5) * (-0.00030525030525030525d0) & + + in(i-6,j+5) * (-0.00042087542087542086d0) & + + in(i-5,j+5) * (-0.0006172839506172839d0) & + + in(i-4,j+5) * (-0.000992063492063492d0) & + + in(i-3,j+5) * (-0.001851851851851852d0) & + + in(i-2,j+5) * (-0.004629629629629629d0) & + + in(i-1,j+5) * (-0.027777777777777776d0) & + + in(i+1,j+5) * (0.0006172839506172839d0) & + + in(i+2,j+5) * (0.0006172839506172839d0) & + + in(i+3,j+5) * (0.0006172839506172839d0) & + + in(i+4,j+5) * (0.0006172839506172839d0) & + + in(i+5,j+5) * (0.005555555555555556d0) & + + in(i+6,j+5) * (0.00042087542087542086d0) & + + in(i+7,j+5) * (0.00030525030525030525d0) & + + in(i+8,j+5) * (0.0002314814814814815d0) & + + in(i+9,j+5) * (0.00018155410312273057d0) & + + in(i-9,j+6) * (-0.00018155410312273057d0) & + + in(i-8,j+6) * (-0.0002314814814814815d0) & + + in(i-7,j+6) * (-0.00030525030525030525d0) & + + in(i-6,j+6) * (-0.00042087542087542086d0) & + + in(i-5,j+6) * (-0.0006172839506172839d0) & + + in(i-4,j+6) * (-0.000992063492063492d0) & + + in(i-3,j+6) * (-0.001851851851851852d0) & + + in(i-2,j+6) * (-0.004629629629629629d0) & + + in(i-1,j+6) * (-0.027777777777777776d0) & + + in(i+1,j+6) * (0.00042087542087542086d0) & + + in(i+2,j+6) * (0.00042087542087542086d0) & + + in(i+3,j+6) * (0.00042087542087542086d0) & + + in(i+4,j+6) * (0.00042087542087542086d0) & + + in(i+5,j+6) * (0.00042087542087542086d0) & + + in(i+6,j+6) * (0.004629629629629629d0) & + + 
in(i+7,j+6) * (0.00030525030525030525d0) & + + in(i+8,j+6) * (0.0002314814814814815d0) & + + in(i+9,j+6) * (0.00018155410312273057d0) & + + in(i-9,j+7) * (-0.00018155410312273057d0) & + + in(i-8,j+7) * (-0.0002314814814814815d0) & + + in(i-7,j+7) * (-0.00030525030525030525d0) & + + in(i-6,j+7) * (-0.00042087542087542086d0) & + + in(i-5,j+7) * (-0.0006172839506172839d0) & + + in(i-4,j+7) * (-0.000992063492063492d0) & + + in(i-3,j+7) * (-0.001851851851851852d0) & + + in(i-2,j+7) * (-0.004629629629629629d0) & + + in(i-1,j+7) * (-0.027777777777777776d0) & + + in(i+1,j+7) * (0.00030525030525030525d0) & + + in(i+2,j+7) * (0.00030525030525030525d0) & + + in(i+3,j+7) * (0.00030525030525030525d0) & + + in(i+4,j+7) * (0.00030525030525030525d0) & + + in(i+5,j+7) * (0.00030525030525030525d0) & + + in(i+6,j+7) * (0.00030525030525030525d0) & + + in(i+7,j+7) * (0.003968253968253968d0) & + + in(i+8,j+7) * (0.0002314814814814815d0) & + + in(i+9,j+7) * (0.00018155410312273057d0) & + + in(i-9,j+8) * (-0.00018155410312273057d0) & + + in(i-8,j+8) * (-0.0002314814814814815d0) & + + in(i-7,j+8) * (-0.00030525030525030525d0) & + + in(i-6,j+8) * (-0.00042087542087542086d0) & + + in(i-5,j+8) * (-0.0006172839506172839d0) & + + in(i-4,j+8) * (-0.000992063492063492d0) & + + in(i-3,j+8) * (-0.001851851851851852d0) & + + in(i-2,j+8) * (-0.004629629629629629d0) & + + in(i-1,j+8) * (-0.027777777777777776d0) & + + in(i+1,j+8) * (0.0002314814814814815d0) & + + in(i+2,j+8) * (0.0002314814814814815d0) & + + in(i+3,j+8) * (0.0002314814814814815d0) & + + in(i+4,j+8) * (0.0002314814814814815d0) & + + in(i+5,j+8) * (0.0002314814814814815d0) & + + in(i+6,j+8) * (0.0002314814814814815d0) & + + in(i+7,j+8) * (0.0002314814814814815d0) & + + in(i+8,j+8) * (0.003472222222222222d0) & + + in(i+9,j+8) * (0.00018155410312273057d0) & + + in(i-9,j+9) * (-0.00018155410312273057d0) & + + in(i-8,j+9) * (-0.0002314814814814815d0) & + + in(i-7,j+9) * (-0.00030525030525030525d0) & + + in(i-6,j+9) * 
(-0.00042087542087542086d0) & + + in(i-5,j+9) * (-0.0006172839506172839d0) & + + in(i-4,j+9) * (-0.000992063492063492d0) & + + in(i-3,j+9) * (-0.001851851851851852d0) & + + in(i-2,j+9) * (-0.004629629629629629d0) & + + in(i-1,j+9) * (-0.027777777777777776d0) & + + in(i+1,j+9) * (0.00018155410312273057d0) & + + in(i+2,j+9) * (0.00018155410312273057d0) & + + in(i+3,j+9) * (0.00018155410312273057d0) & + + in(i+4,j+9) * (0.00018155410312273057d0) & + + in(i+5,j+9) * (0.00018155410312273057d0) & + + in(i+6,j+9) * (0.00018155410312273057d0) & + + in(i+7,j+9) * (0.00018155410312273057d0) & + + in(i+8,j+9) * (0.00018155410312273057d0) & + + in(i+9,j+9) * (0.0030864197530864196d0) & +0.0 end do end do diff --git a/FORTRAN/stencil_target.f90 b/FORTRAN/stencil_target.f90 index 2f6edffe1..f2c3b7785 100644 --- a/FORTRAN/stencil_target.f90 +++ b/FORTRAN/stencil_target.f90 @@ -6,15 +6,14 @@ subroutine star1(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=1,n-1-1 - !$omp simd do j=1,n-1-1 out(i,j) = out(i,j) & - + in(i+0,j-1) * (-0.5) & - + in(i-1,j+0) * (-0.5) & - + in(i+1,j+0) * (0.5) & - + in(i+0,j+1) * (0.5) & + + in(i+0,j-1) * (-0.5d0) & + + in(i-1,j+0) * (-0.5d0) & + + in(i+1,j+0) * (0.5d0) & + + in(i+0,j+1) * (0.5d0) & +0.0 end do !$omp end simd @@ -30,19 +29,18 @@ subroutine star2(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=2,n-2-1 - !$omp simd do j=2,n-2-1 out(i,j) = out(i,j) & - + in(i+0,j-2) * (-0.125) & - + in(i+0,j-1) * (-0.25) & - + in(i-2,j+0) * (-0.125) & - + in(i-1,j+0) * (-0.25) & - + in(i+1,j+0) * (0.25) & - + in(i+2,j+0) * (0.125) & - + in(i+0,j+1) * (0.25) & - + in(i+0,j+2) * (0.125) & + + in(i+0,j-2) * (-0.125d0) & + + 
in(i+0,j-1) * (-0.25d0) & + + in(i-2,j+0) * (-0.125d0) & + + in(i-1,j+0) * (-0.25d0) & + + in(i+1,j+0) * (0.25d0) & + + in(i+2,j+0) * (0.125d0) & + + in(i+0,j+1) * (0.25d0) & + + in(i+0,j+2) * (0.125d0) & +0.0 end do !$omp end simd @@ -58,23 +56,22 @@ subroutine star3(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=3,n-3-1 - !$omp simd do j=3,n-3-1 out(i,j) = out(i,j) & - + in(i+0,j-3) * (-0.05555555555555555) & - + in(i+0,j-2) * (-0.08333333333333333) & - + in(i+0,j-1) * (-0.16666666666666666) & - + in(i-3,j+0) * (-0.05555555555555555) & - + in(i-2,j+0) * (-0.08333333333333333) & - + in(i-1,j+0) * (-0.16666666666666666) & - + in(i+1,j+0) * (0.16666666666666666) & - + in(i+2,j+0) * (0.08333333333333333) & - + in(i+3,j+0) * (0.05555555555555555) & - + in(i+0,j+1) * (0.16666666666666666) & - + in(i+0,j+2) * (0.08333333333333333) & - + in(i+0,j+3) * (0.05555555555555555) & + + in(i+0,j-3) * (-0.05555555555555555d0) & + + in(i+0,j-2) * (-0.08333333333333333d0) & + + in(i+0,j-1) * (-0.16666666666666666d0) & + + in(i-3,j+0) * (-0.05555555555555555d0) & + + in(i-2,j+0) * (-0.08333333333333333d0) & + + in(i-1,j+0) * (-0.16666666666666666d0) & + + in(i+1,j+0) * (0.16666666666666666d0) & + + in(i+2,j+0) * (0.08333333333333333d0) & + + in(i+3,j+0) * (0.05555555555555555d0) & + + in(i+0,j+1) * (0.16666666666666666d0) & + + in(i+0,j+2) * (0.08333333333333333d0) & + + in(i+0,j+3) * (0.05555555555555555d0) & +0.0 end do !$omp end simd @@ -90,27 +87,26 @@ subroutine star4(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=4,n-4-1 - !$omp simd do j=4,n-4-1 out(i,j) = out(i,j) & - + in(i+0,j-4) * (-0.03125) & - + in(i+0,j-3) * (-0.041666666666666664) & - + 
in(i+0,j-2) * (-0.0625) & - + in(i+0,j-1) * (-0.125) & - + in(i-4,j+0) * (-0.03125) & - + in(i-3,j+0) * (-0.041666666666666664) & - + in(i-2,j+0) * (-0.0625) & - + in(i-1,j+0) * (-0.125) & - + in(i+1,j+0) * (0.125) & - + in(i+2,j+0) * (0.0625) & - + in(i+3,j+0) * (0.041666666666666664) & - + in(i+4,j+0) * (0.03125) & - + in(i+0,j+1) * (0.125) & - + in(i+0,j+2) * (0.0625) & - + in(i+0,j+3) * (0.041666666666666664) & - + in(i+0,j+4) * (0.03125) & + + in(i+0,j-4) * (-0.03125d0) & + + in(i+0,j-3) * (-0.041666666666666664d0) & + + in(i+0,j-2) * (-0.0625d0) & + + in(i+0,j-1) * (-0.125d0) & + + in(i-4,j+0) * (-0.03125d0) & + + in(i-3,j+0) * (-0.041666666666666664d0) & + + in(i-2,j+0) * (-0.0625d0) & + + in(i-1,j+0) * (-0.125d0) & + + in(i+1,j+0) * (0.125d0) & + + in(i+2,j+0) * (0.0625d0) & + + in(i+3,j+0) * (0.041666666666666664d0) & + + in(i+4,j+0) * (0.03125d0) & + + in(i+0,j+1) * (0.125d0) & + + in(i+0,j+2) * (0.0625d0) & + + in(i+0,j+3) * (0.041666666666666664d0) & + + in(i+0,j+4) * (0.03125d0) & +0.0 end do !$omp end simd @@ -126,31 +122,30 @@ subroutine star5(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=5,n-5-1 - !$omp simd do j=5,n-5-1 out(i,j) = out(i,j) & - + in(i+0,j-5) * (-0.02) & - + in(i+0,j-4) * (-0.025) & - + in(i+0,j-3) * (-0.03333333333333333) & - + in(i+0,j-2) * (-0.05) & - + in(i+0,j-1) * (-0.1) & - + in(i-5,j+0) * (-0.02) & - + in(i-4,j+0) * (-0.025) & - + in(i-3,j+0) * (-0.03333333333333333) & - + in(i-2,j+0) * (-0.05) & - + in(i-1,j+0) * (-0.1) & - + in(i+1,j+0) * (0.1) & - + in(i+2,j+0) * (0.05) & - + in(i+3,j+0) * (0.03333333333333333) & - + in(i+4,j+0) * (0.025) & - + in(i+5,j+0) * (0.02) & - + in(i+0,j+1) * (0.1) & - + in(i+0,j+2) * (0.05) & - + in(i+0,j+3) * (0.03333333333333333) & - + in(i+0,j+4) * (0.025) & - + in(i+0,j+5) * (0.02) & + + in(i+0,j-5) * (-0.02d0) & + + 
in(i+0,j-4) * (-0.025d0) & + + in(i+0,j-3) * (-0.03333333333333333d0) & + + in(i+0,j-2) * (-0.05d0) & + + in(i+0,j-1) * (-0.1d0) & + + in(i-5,j+0) * (-0.02d0) & + + in(i-4,j+0) * (-0.025d0) & + + in(i-3,j+0) * (-0.03333333333333333d0) & + + in(i-2,j+0) * (-0.05d0) & + + in(i-1,j+0) * (-0.1d0) & + + in(i+1,j+0) * (0.1d0) & + + in(i+2,j+0) * (0.05d0) & + + in(i+3,j+0) * (0.03333333333333333d0) & + + in(i+4,j+0) * (0.025d0) & + + in(i+5,j+0) * (0.02d0) & + + in(i+0,j+1) * (0.1d0) & + + in(i+0,j+2) * (0.05d0) & + + in(i+0,j+3) * (0.03333333333333333d0) & + + in(i+0,j+4) * (0.025d0) & + + in(i+0,j+5) * (0.02d0) & +0.0 end do !$omp end simd @@ -166,35 +161,34 @@ subroutine star6(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=6,n-6-1 - !$omp simd do j=6,n-6-1 out(i,j) = out(i,j) & - + in(i+0,j-6) * (-0.013888888888888888) & - + in(i+0,j-5) * (-0.016666666666666666) & - + in(i+0,j-4) * (-0.020833333333333332) & - + in(i+0,j-3) * (-0.027777777777777776) & - + in(i+0,j-2) * (-0.041666666666666664) & - + in(i+0,j-1) * (-0.08333333333333333) & - + in(i-6,j+0) * (-0.013888888888888888) & - + in(i-5,j+0) * (-0.016666666666666666) & - + in(i-4,j+0) * (-0.020833333333333332) & - + in(i-3,j+0) * (-0.027777777777777776) & - + in(i-2,j+0) * (-0.041666666666666664) & - + in(i-1,j+0) * (-0.08333333333333333) & - + in(i+1,j+0) * (0.08333333333333333) & - + in(i+2,j+0) * (0.041666666666666664) & - + in(i+3,j+0) * (0.027777777777777776) & - + in(i+4,j+0) * (0.020833333333333332) & - + in(i+5,j+0) * (0.016666666666666666) & - + in(i+6,j+0) * (0.013888888888888888) & - + in(i+0,j+1) * (0.08333333333333333) & - + in(i+0,j+2) * (0.041666666666666664) & - + in(i+0,j+3) * (0.027777777777777776) & - + in(i+0,j+4) * (0.020833333333333332) & - + in(i+0,j+5) * (0.016666666666666666) & - + in(i+0,j+6) * (0.013888888888888888) & + + 
in(i+0,j-6) * (-0.013888888888888888d0) & + + in(i+0,j-5) * (-0.016666666666666666d0) & + + in(i+0,j-4) * (-0.020833333333333332d0) & + + in(i+0,j-3) * (-0.027777777777777776d0) & + + in(i+0,j-2) * (-0.041666666666666664d0) & + + in(i+0,j-1) * (-0.08333333333333333d0) & + + in(i-6,j+0) * (-0.013888888888888888d0) & + + in(i-5,j+0) * (-0.016666666666666666d0) & + + in(i-4,j+0) * (-0.020833333333333332d0) & + + in(i-3,j+0) * (-0.027777777777777776d0) & + + in(i-2,j+0) * (-0.041666666666666664d0) & + + in(i-1,j+0) * (-0.08333333333333333d0) & + + in(i+1,j+0) * (0.08333333333333333d0) & + + in(i+2,j+0) * (0.041666666666666664d0) & + + in(i+3,j+0) * (0.027777777777777776d0) & + + in(i+4,j+0) * (0.020833333333333332d0) & + + in(i+5,j+0) * (0.016666666666666666d0) & + + in(i+6,j+0) * (0.013888888888888888d0) & + + in(i+0,j+1) * (0.08333333333333333d0) & + + in(i+0,j+2) * (0.041666666666666664d0) & + + in(i+0,j+3) * (0.027777777777777776d0) & + + in(i+0,j+4) * (0.020833333333333332d0) & + + in(i+0,j+5) * (0.016666666666666666d0) & + + in(i+0,j+6) * (0.013888888888888888d0) & +0.0 end do !$omp end simd @@ -210,39 +204,38 @@ subroutine star7(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=7,n-7-1 - !$omp simd do j=7,n-7-1 out(i,j) = out(i,j) & - + in(i+0,j-7) * (-0.01020408163265306) & - + in(i+0,j-6) * (-0.011904761904761904) & - + in(i+0,j-5) * (-0.014285714285714285) & - + in(i+0,j-4) * (-0.017857142857142856) & - + in(i+0,j-3) * (-0.023809523809523808) & - + in(i+0,j-2) * (-0.03571428571428571) & - + in(i+0,j-1) * (-0.07142857142857142) & - + in(i-7,j+0) * (-0.01020408163265306) & - + in(i-6,j+0) * (-0.011904761904761904) & - + in(i-5,j+0) * (-0.014285714285714285) & - + in(i-4,j+0) * (-0.017857142857142856) & - + in(i-3,j+0) * (-0.023809523809523808) & - + in(i-2,j+0) * (-0.03571428571428571) & - + 
in(i-1,j+0) * (-0.07142857142857142) & - + in(i+1,j+0) * (0.07142857142857142) & - + in(i+2,j+0) * (0.03571428571428571) & - + in(i+3,j+0) * (0.023809523809523808) & - + in(i+4,j+0) * (0.017857142857142856) & - + in(i+5,j+0) * (0.014285714285714285) & - + in(i+6,j+0) * (0.011904761904761904) & - + in(i+7,j+0) * (0.01020408163265306) & - + in(i+0,j+1) * (0.07142857142857142) & - + in(i+0,j+2) * (0.03571428571428571) & - + in(i+0,j+3) * (0.023809523809523808) & - + in(i+0,j+4) * (0.017857142857142856) & - + in(i+0,j+5) * (0.014285714285714285) & - + in(i+0,j+6) * (0.011904761904761904) & - + in(i+0,j+7) * (0.01020408163265306) & + + in(i+0,j-7) * (-0.01020408163265306d0) & + + in(i+0,j-6) * (-0.011904761904761904d0) & + + in(i+0,j-5) * (-0.014285714285714285d0) & + + in(i+0,j-4) * (-0.017857142857142856d0) & + + in(i+0,j-3) * (-0.023809523809523808d0) & + + in(i+0,j-2) * (-0.03571428571428571d0) & + + in(i+0,j-1) * (-0.07142857142857142d0) & + + in(i-7,j+0) * (-0.01020408163265306d0) & + + in(i-6,j+0) * (-0.011904761904761904d0) & + + in(i-5,j+0) * (-0.014285714285714285d0) & + + in(i-4,j+0) * (-0.017857142857142856d0) & + + in(i-3,j+0) * (-0.023809523809523808d0) & + + in(i-2,j+0) * (-0.03571428571428571d0) & + + in(i-1,j+0) * (-0.07142857142857142d0) & + + in(i+1,j+0) * (0.07142857142857142d0) & + + in(i+2,j+0) * (0.03571428571428571d0) & + + in(i+3,j+0) * (0.023809523809523808d0) & + + in(i+4,j+0) * (0.017857142857142856d0) & + + in(i+5,j+0) * (0.014285714285714285d0) & + + in(i+6,j+0) * (0.011904761904761904d0) & + + in(i+7,j+0) * (0.01020408163265306d0) & + + in(i+0,j+1) * (0.07142857142857142d0) & + + in(i+0,j+2) * (0.03571428571428571d0) & + + in(i+0,j+3) * (0.023809523809523808d0) & + + in(i+0,j+4) * (0.017857142857142856d0) & + + in(i+0,j+5) * (0.014285714285714285d0) & + + in(i+0,j+6) * (0.011904761904761904d0) & + + in(i+0,j+7) * (0.01020408163265306d0) & +0.0 end do !$omp end simd @@ -258,43 +251,42 @@ subroutine star8(n, in, out) real(kind=REAL64), 
intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=8,n-8-1 - !$omp simd do j=8,n-8-1 out(i,j) = out(i,j) & - + in(i+0,j-8) * (-0.0078125) & - + in(i+0,j-7) * (-0.008928571428571428) & - + in(i+0,j-6) * (-0.010416666666666666) & - + in(i+0,j-5) * (-0.0125) & - + in(i+0,j-4) * (-0.015625) & - + in(i+0,j-3) * (-0.020833333333333332) & - + in(i+0,j-2) * (-0.03125) & - + in(i+0,j-1) * (-0.0625) & - + in(i-8,j+0) * (-0.0078125) & - + in(i-7,j+0) * (-0.008928571428571428) & - + in(i-6,j+0) * (-0.010416666666666666) & - + in(i-5,j+0) * (-0.0125) & - + in(i-4,j+0) * (-0.015625) & - + in(i-3,j+0) * (-0.020833333333333332) & - + in(i-2,j+0) * (-0.03125) & - + in(i-1,j+0) * (-0.0625) & - + in(i+1,j+0) * (0.0625) & - + in(i+2,j+0) * (0.03125) & - + in(i+3,j+0) * (0.020833333333333332) & - + in(i+4,j+0) * (0.015625) & - + in(i+5,j+0) * (0.0125) & - + in(i+6,j+0) * (0.010416666666666666) & - + in(i+7,j+0) * (0.008928571428571428) & - + in(i+8,j+0) * (0.0078125) & - + in(i+0,j+1) * (0.0625) & - + in(i+0,j+2) * (0.03125) & - + in(i+0,j+3) * (0.020833333333333332) & - + in(i+0,j+4) * (0.015625) & - + in(i+0,j+5) * (0.0125) & - + in(i+0,j+6) * (0.010416666666666666) & - + in(i+0,j+7) * (0.008928571428571428) & - + in(i+0,j+8) * (0.0078125) & + + in(i+0,j-8) * (-0.0078125d0) & + + in(i+0,j-7) * (-0.008928571428571428d0) & + + in(i+0,j-6) * (-0.010416666666666666d0) & + + in(i+0,j-5) * (-0.0125d0) & + + in(i+0,j-4) * (-0.015625d0) & + + in(i+0,j-3) * (-0.020833333333333332d0) & + + in(i+0,j-2) * (-0.03125d0) & + + in(i+0,j-1) * (-0.0625d0) & + + in(i-8,j+0) * (-0.0078125d0) & + + in(i-7,j+0) * (-0.008928571428571428d0) & + + in(i-6,j+0) * (-0.010416666666666666d0) & + + in(i-5,j+0) * (-0.0125d0) & + + in(i-4,j+0) * (-0.015625d0) & + + in(i-3,j+0) * (-0.020833333333333332d0) & + + in(i-2,j+0) * (-0.03125d0) & + + in(i-1,j+0) * (-0.0625d0) & + + in(i+1,j+0) 
* (0.0625d0) & + + in(i+2,j+0) * (0.03125d0) & + + in(i+3,j+0) * (0.020833333333333332d0) & + + in(i+4,j+0) * (0.015625d0) & + + in(i+5,j+0) * (0.0125d0) & + + in(i+6,j+0) * (0.010416666666666666d0) & + + in(i+7,j+0) * (0.008928571428571428d0) & + + in(i+8,j+0) * (0.0078125d0) & + + in(i+0,j+1) * (0.0625d0) & + + in(i+0,j+2) * (0.03125d0) & + + in(i+0,j+3) * (0.020833333333333332d0) & + + in(i+0,j+4) * (0.015625d0) & + + in(i+0,j+5) * (0.0125d0) & + + in(i+0,j+6) * (0.010416666666666666d0) & + + in(i+0,j+7) * (0.008928571428571428d0) & + + in(i+0,j+8) * (0.0078125d0) & +0.0 end do !$omp end simd @@ -310,47 +302,46 @@ subroutine star9(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=9,n-9-1 - !$omp simd do j=9,n-9-1 out(i,j) = out(i,j) & - + in(i+0,j-9) * (-0.006172839506172839) & - + in(i+0,j-8) * (-0.006944444444444444) & - + in(i+0,j-7) * (-0.007936507936507936) & - + in(i+0,j-6) * (-0.009259259259259259) & - + in(i+0,j-5) * (-0.011111111111111112) & - + in(i+0,j-4) * (-0.013888888888888888) & - + in(i+0,j-3) * (-0.018518518518518517) & - + in(i+0,j-2) * (-0.027777777777777776) & - + in(i+0,j-1) * (-0.05555555555555555) & - + in(i-9,j+0) * (-0.006172839506172839) & - + in(i-8,j+0) * (-0.006944444444444444) & - + in(i-7,j+0) * (-0.007936507936507936) & - + in(i-6,j+0) * (-0.009259259259259259) & - + in(i-5,j+0) * (-0.011111111111111112) & - + in(i-4,j+0) * (-0.013888888888888888) & - + in(i-3,j+0) * (-0.018518518518518517) & - + in(i-2,j+0) * (-0.027777777777777776) & - + in(i-1,j+0) * (-0.05555555555555555) & - + in(i+1,j+0) * (0.05555555555555555) & - + in(i+2,j+0) * (0.027777777777777776) & - + in(i+3,j+0) * (0.018518518518518517) & - + in(i+4,j+0) * (0.013888888888888888) & - + in(i+5,j+0) * (0.011111111111111112) & - + in(i+6,j+0) * (0.009259259259259259) & - + in(i+7,j+0) * 
(0.007936507936507936) & - + in(i+8,j+0) * (0.006944444444444444) & - + in(i+9,j+0) * (0.006172839506172839) & - + in(i+0,j+1) * (0.05555555555555555) & - + in(i+0,j+2) * (0.027777777777777776) & - + in(i+0,j+3) * (0.018518518518518517) & - + in(i+0,j+4) * (0.013888888888888888) & - + in(i+0,j+5) * (0.011111111111111112) & - + in(i+0,j+6) * (0.009259259259259259) & - + in(i+0,j+7) * (0.007936507936507936) & - + in(i+0,j+8) * (0.006944444444444444) & - + in(i+0,j+9) * (0.006172839506172839) & + + in(i+0,j-9) * (-0.006172839506172839d0) & + + in(i+0,j-8) * (-0.006944444444444444d0) & + + in(i+0,j-7) * (-0.007936507936507936d0) & + + in(i+0,j-6) * (-0.009259259259259259d0) & + + in(i+0,j-5) * (-0.011111111111111112d0) & + + in(i+0,j-4) * (-0.013888888888888888d0) & + + in(i+0,j-3) * (-0.018518518518518517d0) & + + in(i+0,j-2) * (-0.027777777777777776d0) & + + in(i+0,j-1) * (-0.05555555555555555d0) & + + in(i-9,j+0) * (-0.006172839506172839d0) & + + in(i-8,j+0) * (-0.006944444444444444d0) & + + in(i-7,j+0) * (-0.007936507936507936d0) & + + in(i-6,j+0) * (-0.009259259259259259d0) & + + in(i-5,j+0) * (-0.011111111111111112d0) & + + in(i-4,j+0) * (-0.013888888888888888d0) & + + in(i-3,j+0) * (-0.018518518518518517d0) & + + in(i-2,j+0) * (-0.027777777777777776d0) & + + in(i-1,j+0) * (-0.05555555555555555d0) & + + in(i+1,j+0) * (0.05555555555555555d0) & + + in(i+2,j+0) * (0.027777777777777776d0) & + + in(i+3,j+0) * (0.018518518518518517d0) & + + in(i+4,j+0) * (0.013888888888888888d0) & + + in(i+5,j+0) * (0.011111111111111112d0) & + + in(i+6,j+0) * (0.009259259259259259d0) & + + in(i+7,j+0) * (0.007936507936507936d0) & + + in(i+8,j+0) * (0.006944444444444444d0) & + + in(i+9,j+0) * (0.006172839506172839d0) & + + in(i+0,j+1) * (0.05555555555555555d0) & + + in(i+0,j+2) * (0.027777777777777776d0) & + + in(i+0,j+3) * (0.018518518518518517d0) & + + in(i+0,j+4) * (0.013888888888888888d0) & + + in(i+0,j+5) * (0.011111111111111112d0) & + + in(i+0,j+6) * (0.009259259259259259d0) & + + 
in(i+0,j+7) * (0.007936507936507936d0) & + + in(i+0,j+8) * (0.006944444444444444d0) & + + in(i+0,j+9) * (0.006172839506172839d0) & +0.0 end do !$omp end simd @@ -366,15 +357,14 @@ subroutine grid1(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=1,n-1-1 - !$omp simd do j=1,n-1-1 out(i,j) = out(i,j) & - + in(i-1,j-1) * (-0.25) & - + in(i+1,j-1) * (-0.25) & - + in(i-1,j+1) * (-0.25) & - + in(i+1,j+1) * (0.25) & + + in(i-1,j-1) * (-0.25d0) & + + in(i+1,j-1) * (-0.25d0) & + + in(i-1,j+1) * (-0.25d0) & + + in(i+1,j+1) * (0.25d0) & +0.0 end do !$omp end simd @@ -390,25 +380,24 @@ subroutine grid2(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=2,n-2-1 - !$omp simd do j=2,n-2-1 out(i,j) = out(i,j) & - + in(i-2,j-2) * (-0.0625) & - + in(i+1,j-2) * (-0.020833333333333332) & - + in(i+2,j-2) * (-0.020833333333333332) & - + in(i-1,j-1) * (-0.125) & - + in(i+1,j-1) * (-0.125) & - + in(i+2,j-1) * (-0.125) & - + in(i-2,j+1) * (-0.020833333333333332) & - + in(i-1,j+1) * (-0.125) & - + in(i+1,j+1) * (0.125) & - + in(i+2,j+1) * (0.020833333333333332) & - + in(i-2,j+2) * (-0.020833333333333332) & - + in(i-1,j+2) * (-0.125) & - + in(i+1,j+2) * (0.020833333333333332) & - + in(i+2,j+2) * (0.0625) & + + in(i-2,j-2) * (-0.0625d0) & + + in(i+1,j-2) * (-0.020833333333333332d0) & + + in(i+2,j-2) * (-0.020833333333333332d0) & + + in(i-1,j-1) * (-0.125d0) & + + in(i+1,j-1) * (-0.125d0) & + + in(i+2,j-1) * (-0.125d0) & + + in(i-2,j+1) * (-0.020833333333333332d0) & + + in(i-1,j+1) * (-0.125d0) & + + in(i+1,j+1) * (0.125d0) & + + in(i+2,j+1) * (0.020833333333333332d0) & + + in(i-2,j+2) * (-0.020833333333333332d0) & + + in(i-1,j+2) * (-0.125d0) & + + in(i+1,j+2) * 
(0.020833333333333332d0) & + + in(i+2,j+2) * (0.0625d0) & +0.0 end do !$omp end simd @@ -424,41 +413,40 @@ subroutine grid3(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=3,n-3-1 - !$omp simd do j=3,n-3-1 out(i,j) = out(i,j) & - + in(i-3,j-3) * (-0.027777777777777776) & - + in(i+1,j-3) * (-0.005555555555555556) & - + in(i+2,j-3) * (-0.005555555555555556) & - + in(i+3,j-3) * (-0.005555555555555556) & - + in(i-2,j-2) * (-0.041666666666666664) & - + in(i+1,j-2) * (-0.013888888888888888) & - + in(i+2,j-2) * (-0.013888888888888888) & - + in(i+3,j-2) * (-0.013888888888888888) & - + in(i-1,j-1) * (-0.08333333333333333) & - + in(i+1,j-1) * (-0.08333333333333333) & - + in(i+2,j-1) * (-0.08333333333333333) & - + in(i+3,j-1) * (-0.08333333333333333) & - + in(i-3,j+1) * (-0.005555555555555556) & - + in(i-2,j+1) * (-0.013888888888888888) & - + in(i-1,j+1) * (-0.08333333333333333) & - + in(i+1,j+1) * (0.08333333333333333) & - + in(i+2,j+1) * (0.013888888888888888) & - + in(i+3,j+1) * (0.005555555555555556) & - + in(i-3,j+2) * (-0.005555555555555556) & - + in(i-2,j+2) * (-0.013888888888888888) & - + in(i-1,j+2) * (-0.08333333333333333) & - + in(i+1,j+2) * (0.013888888888888888) & - + in(i+2,j+2) * (0.041666666666666664) & - + in(i+3,j+2) * (0.005555555555555556) & - + in(i-3,j+3) * (-0.005555555555555556) & - + in(i-2,j+3) * (-0.013888888888888888) & - + in(i-1,j+3) * (-0.08333333333333333) & - + in(i+1,j+3) * (0.005555555555555556) & - + in(i+2,j+3) * (0.005555555555555556) & - + in(i+3,j+3) * (0.027777777777777776) & + + in(i-3,j-3) * (-0.027777777777777776d0) & + + in(i+1,j-3) * (-0.005555555555555556d0) & + + in(i+2,j-3) * (-0.005555555555555556d0) & + + in(i+3,j-3) * (-0.005555555555555556d0) & + + in(i-2,j-2) * (-0.041666666666666664d0) & + + in(i+1,j-2) * (-0.013888888888888888d0) & + + in(i+2,j-2) * 
(-0.013888888888888888d0) & + + in(i+3,j-2) * (-0.013888888888888888d0) & + + in(i-1,j-1) * (-0.08333333333333333d0) & + + in(i+1,j-1) * (-0.08333333333333333d0) & + + in(i+2,j-1) * (-0.08333333333333333d0) & + + in(i+3,j-1) * (-0.08333333333333333d0) & + + in(i-3,j+1) * (-0.005555555555555556d0) & + + in(i-2,j+1) * (-0.013888888888888888d0) & + + in(i-1,j+1) * (-0.08333333333333333d0) & + + in(i+1,j+1) * (0.08333333333333333d0) & + + in(i+2,j+1) * (0.013888888888888888d0) & + + in(i+3,j+1) * (0.005555555555555556d0) & + + in(i-3,j+2) * (-0.005555555555555556d0) & + + in(i-2,j+2) * (-0.013888888888888888d0) & + + in(i-1,j+2) * (-0.08333333333333333d0) & + + in(i+1,j+2) * (0.013888888888888888d0) & + + in(i+2,j+2) * (0.041666666666666664d0) & + + in(i+3,j+2) * (0.005555555555555556d0) & + + in(i-3,j+3) * (-0.005555555555555556d0) & + + in(i-2,j+3) * (-0.013888888888888888d0) & + + in(i-1,j+3) * (-0.08333333333333333d0) & + + in(i+1,j+3) * (0.005555555555555556d0) & + + in(i+2,j+3) * (0.005555555555555556d0) & + + in(i+3,j+3) * (0.027777777777777776d0) & +0.0 end do !$omp end simd @@ -474,63 +462,62 @@ subroutine grid4(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=4,n-4-1 - !$omp simd do j=4,n-4-1 out(i,j) = out(i,j) & - + in(i-4,j-4) * (-0.015625) & - + in(i+1,j-4) * (-0.002232142857142857) & - + in(i+2,j-4) * (-0.002232142857142857) & - + in(i+3,j-4) * (-0.002232142857142857) & - + in(i+4,j-4) * (-0.002232142857142857) & - + in(i-3,j-3) * (-0.020833333333333332) & - + in(i+1,j-3) * (-0.004166666666666667) & - + in(i+2,j-3) * (-0.004166666666666667) & - + in(i+3,j-3) * (-0.004166666666666667) & - + in(i+4,j-3) * (-0.004166666666666667) & - + in(i-2,j-2) * (-0.03125) & - + in(i+1,j-2) * (-0.010416666666666666) & - + in(i+2,j-2) * (-0.010416666666666666) & - + in(i+3,j-2) * (-0.010416666666666666) & - 
+ in(i+4,j-2) * (-0.010416666666666666) & - + in(i-1,j-1) * (-0.0625) & - + in(i+1,j-1) * (-0.0625) & - + in(i+2,j-1) * (-0.0625) & - + in(i+3,j-1) * (-0.0625) & - + in(i+4,j-1) * (-0.0625) & - + in(i-4,j+1) * (-0.002232142857142857) & - + in(i-3,j+1) * (-0.004166666666666667) & - + in(i-2,j+1) * (-0.010416666666666666) & - + in(i-1,j+1) * (-0.0625) & - + in(i+1,j+1) * (0.0625) & - + in(i+2,j+1) * (0.010416666666666666) & - + in(i+3,j+1) * (0.004166666666666667) & - + in(i+4,j+1) * (0.002232142857142857) & - + in(i-4,j+2) * (-0.002232142857142857) & - + in(i-3,j+2) * (-0.004166666666666667) & - + in(i-2,j+2) * (-0.010416666666666666) & - + in(i-1,j+2) * (-0.0625) & - + in(i+1,j+2) * (0.010416666666666666) & - + in(i+2,j+2) * (0.03125) & - + in(i+3,j+2) * (0.004166666666666667) & - + in(i+4,j+2) * (0.002232142857142857) & - + in(i-4,j+3) * (-0.002232142857142857) & - + in(i-3,j+3) * (-0.004166666666666667) & - + in(i-2,j+3) * (-0.010416666666666666) & - + in(i-1,j+3) * (-0.0625) & - + in(i+1,j+3) * (0.004166666666666667) & - + in(i+2,j+3) * (0.004166666666666667) & - + in(i+3,j+3) * (0.020833333333333332) & - + in(i+4,j+3) * (0.002232142857142857) & - + in(i-4,j+4) * (-0.002232142857142857) & - + in(i-3,j+4) * (-0.004166666666666667) & - + in(i-2,j+4) * (-0.010416666666666666) & - + in(i-1,j+4) * (-0.0625) & - + in(i+1,j+4) * (0.002232142857142857) & - + in(i+2,j+4) * (0.002232142857142857) & - + in(i+3,j+4) * (0.002232142857142857) & - + in(i+4,j+4) * (0.015625) & + + in(i-4,j-4) * (-0.015625d0) & + + in(i+1,j-4) * (-0.002232142857142857d0) & + + in(i+2,j-4) * (-0.002232142857142857d0) & + + in(i+3,j-4) * (-0.002232142857142857d0) & + + in(i+4,j-4) * (-0.002232142857142857d0) & + + in(i-3,j-3) * (-0.020833333333333332d0) & + + in(i+1,j-3) * (-0.004166666666666667d0) & + + in(i+2,j-3) * (-0.004166666666666667d0) & + + in(i+3,j-3) * (-0.004166666666666667d0) & + + in(i+4,j-3) * (-0.004166666666666667d0) & + + in(i-2,j-2) * (-0.03125d0) & + + in(i+1,j-2) * 
(-0.010416666666666666d0) & + + in(i+2,j-2) * (-0.010416666666666666d0) & + + in(i+3,j-2) * (-0.010416666666666666d0) & + + in(i+4,j-2) * (-0.010416666666666666d0) & + + in(i-1,j-1) * (-0.0625d0) & + + in(i+1,j-1) * (-0.0625d0) & + + in(i+2,j-1) * (-0.0625d0) & + + in(i+3,j-1) * (-0.0625d0) & + + in(i+4,j-1) * (-0.0625d0) & + + in(i-4,j+1) * (-0.002232142857142857d0) & + + in(i-3,j+1) * (-0.004166666666666667d0) & + + in(i-2,j+1) * (-0.010416666666666666d0) & + + in(i-1,j+1) * (-0.0625d0) & + + in(i+1,j+1) * (0.0625d0) & + + in(i+2,j+1) * (0.010416666666666666d0) & + + in(i+3,j+1) * (0.004166666666666667d0) & + + in(i+4,j+1) * (0.002232142857142857d0) & + + in(i-4,j+2) * (-0.002232142857142857d0) & + + in(i-3,j+2) * (-0.004166666666666667d0) & + + in(i-2,j+2) * (-0.010416666666666666d0) & + + in(i-1,j+2) * (-0.0625d0) & + + in(i+1,j+2) * (0.010416666666666666d0) & + + in(i+2,j+2) * (0.03125d0) & + + in(i+3,j+2) * (0.004166666666666667d0) & + + in(i+4,j+2) * (0.002232142857142857d0) & + + in(i-4,j+3) * (-0.002232142857142857d0) & + + in(i-3,j+3) * (-0.004166666666666667d0) & + + in(i-2,j+3) * (-0.010416666666666666d0) & + + in(i-1,j+3) * (-0.0625d0) & + + in(i+1,j+3) * (0.004166666666666667d0) & + + in(i+2,j+3) * (0.004166666666666667d0) & + + in(i+3,j+3) * (0.020833333333333332d0) & + + in(i+4,j+3) * (0.002232142857142857d0) & + + in(i-4,j+4) * (-0.002232142857142857d0) & + + in(i-3,j+4) * (-0.004166666666666667d0) & + + in(i-2,j+4) * (-0.010416666666666666d0) & + + in(i-1,j+4) * (-0.0625d0) & + + in(i+1,j+4) * (0.002232142857142857d0) & + + in(i+2,j+4) * (0.002232142857142857d0) & + + in(i+3,j+4) * (0.002232142857142857d0) & + + in(i+4,j+4) * (0.015625d0) & +0.0 end do !$omp end simd @@ -546,91 +533,90 @@ subroutine grid5(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=5,n-5-1 - !$omp simd do 
j=5,n-5-1 out(i,j) = out(i,j) & - + in(i-5,j-5) * (-0.01) & - + in(i+1,j-5) * (-0.0011111111111111111) & - + in(i+2,j-5) * (-0.0011111111111111111) & - + in(i+3,j-5) * (-0.0011111111111111111) & - + in(i+4,j-5) * (-0.0011111111111111111) & - + in(i+5,j-5) * (-0.0011111111111111111) & - + in(i-4,j-4) * (-0.0125) & - + in(i+1,j-4) * (-0.0017857142857142857) & - + in(i+2,j-4) * (-0.0017857142857142857) & - + in(i+3,j-4) * (-0.0017857142857142857) & - + in(i+4,j-4) * (-0.0017857142857142857) & - + in(i+5,j-4) * (-0.0017857142857142857) & - + in(i-3,j-3) * (-0.016666666666666666) & - + in(i+1,j-3) * (-0.0033333333333333335) & - + in(i+2,j-3) * (-0.0033333333333333335) & - + in(i+3,j-3) * (-0.0033333333333333335) & - + in(i+4,j-3) * (-0.0033333333333333335) & - + in(i+5,j-3) * (-0.0033333333333333335) & - + in(i-2,j-2) * (-0.025) & - + in(i+1,j-2) * (-0.008333333333333333) & - + in(i+2,j-2) * (-0.008333333333333333) & - + in(i+3,j-2) * (-0.008333333333333333) & - + in(i+4,j-2) * (-0.008333333333333333) & - + in(i+5,j-2) * (-0.008333333333333333) & - + in(i-1,j-1) * (-0.05) & - + in(i+1,j-1) * (-0.05) & - + in(i+2,j-1) * (-0.05) & - + in(i+3,j-1) * (-0.05) & - + in(i+4,j-1) * (-0.05) & - + in(i+5,j-1) * (-0.05) & - + in(i-5,j+1) * (-0.0011111111111111111) & - + in(i-4,j+1) * (-0.0017857142857142857) & - + in(i-3,j+1) * (-0.0033333333333333335) & - + in(i-2,j+1) * (-0.008333333333333333) & - + in(i-1,j+1) * (-0.05) & - + in(i+1,j+1) * (0.05) & - + in(i+2,j+1) * (0.008333333333333333) & - + in(i+3,j+1) * (0.0033333333333333335) & - + in(i+4,j+1) * (0.0017857142857142857) & - + in(i+5,j+1) * (0.0011111111111111111) & - + in(i-5,j+2) * (-0.0011111111111111111) & - + in(i-4,j+2) * (-0.0017857142857142857) & - + in(i-3,j+2) * (-0.0033333333333333335) & - + in(i-2,j+2) * (-0.008333333333333333) & - + in(i-1,j+2) * (-0.05) & - + in(i+1,j+2) * (0.008333333333333333) & - + in(i+2,j+2) * (0.025) & - + in(i+3,j+2) * (0.0033333333333333335) & - + in(i+4,j+2) * (0.0017857142857142857) 
& - + in(i+5,j+2) * (0.0011111111111111111) & - + in(i-5,j+3) * (-0.0011111111111111111) & - + in(i-4,j+3) * (-0.0017857142857142857) & - + in(i-3,j+3) * (-0.0033333333333333335) & - + in(i-2,j+3) * (-0.008333333333333333) & - + in(i-1,j+3) * (-0.05) & - + in(i+1,j+3) * (0.0033333333333333335) & - + in(i+2,j+3) * (0.0033333333333333335) & - + in(i+3,j+3) * (0.016666666666666666) & - + in(i+4,j+3) * (0.0017857142857142857) & - + in(i+5,j+3) * (0.0011111111111111111) & - + in(i-5,j+4) * (-0.0011111111111111111) & - + in(i-4,j+4) * (-0.0017857142857142857) & - + in(i-3,j+4) * (-0.0033333333333333335) & - + in(i-2,j+4) * (-0.008333333333333333) & - + in(i-1,j+4) * (-0.05) & - + in(i+1,j+4) * (0.0017857142857142857) & - + in(i+2,j+4) * (0.0017857142857142857) & - + in(i+3,j+4) * (0.0017857142857142857) & - + in(i+4,j+4) * (0.0125) & - + in(i+5,j+4) * (0.0011111111111111111) & - + in(i-5,j+5) * (-0.0011111111111111111) & - + in(i-4,j+5) * (-0.0017857142857142857) & - + in(i-3,j+5) * (-0.0033333333333333335) & - + in(i-2,j+5) * (-0.008333333333333333) & - + in(i-1,j+5) * (-0.05) & - + in(i+1,j+5) * (0.0011111111111111111) & - + in(i+2,j+5) * (0.0011111111111111111) & - + in(i+3,j+5) * (0.0011111111111111111) & - + in(i+4,j+5) * (0.0011111111111111111) & - + in(i+5,j+5) * (0.01) & + + in(i-5,j-5) * (-0.01d0) & + + in(i+1,j-5) * (-0.0011111111111111111d0) & + + in(i+2,j-5) * (-0.0011111111111111111d0) & + + in(i+3,j-5) * (-0.0011111111111111111d0) & + + in(i+4,j-5) * (-0.0011111111111111111d0) & + + in(i+5,j-5) * (-0.0011111111111111111d0) & + + in(i-4,j-4) * (-0.0125d0) & + + in(i+1,j-4) * (-0.0017857142857142857d0) & + + in(i+2,j-4) * (-0.0017857142857142857d0) & + + in(i+3,j-4) * (-0.0017857142857142857d0) & + + in(i+4,j-4) * (-0.0017857142857142857d0) & + + in(i+5,j-4) * (-0.0017857142857142857d0) & + + in(i-3,j-3) * (-0.016666666666666666d0) & + + in(i+1,j-3) * (-0.0033333333333333335d0) & + + in(i+2,j-3) * (-0.0033333333333333335d0) & + + in(i+3,j-3) * 
(-0.0033333333333333335d0) & + + in(i+4,j-3) * (-0.0033333333333333335d0) & + + in(i+5,j-3) * (-0.0033333333333333335d0) & + + in(i-2,j-2) * (-0.025d0) & + + in(i+1,j-2) * (-0.008333333333333333d0) & + + in(i+2,j-2) * (-0.008333333333333333d0) & + + in(i+3,j-2) * (-0.008333333333333333d0) & + + in(i+4,j-2) * (-0.008333333333333333d0) & + + in(i+5,j-2) * (-0.008333333333333333d0) & + + in(i-1,j-1) * (-0.05d0) & + + in(i+1,j-1) * (-0.05d0) & + + in(i+2,j-1) * (-0.05d0) & + + in(i+3,j-1) * (-0.05d0) & + + in(i+4,j-1) * (-0.05d0) & + + in(i+5,j-1) * (-0.05d0) & + + in(i-5,j+1) * (-0.0011111111111111111d0) & + + in(i-4,j+1) * (-0.0017857142857142857d0) & + + in(i-3,j+1) * (-0.0033333333333333335d0) & + + in(i-2,j+1) * (-0.008333333333333333d0) & + + in(i-1,j+1) * (-0.05d0) & + + in(i+1,j+1) * (0.05d0) & + + in(i+2,j+1) * (0.008333333333333333d0) & + + in(i+3,j+1) * (0.0033333333333333335d0) & + + in(i+4,j+1) * (0.0017857142857142857d0) & + + in(i+5,j+1) * (0.0011111111111111111d0) & + + in(i-5,j+2) * (-0.0011111111111111111d0) & + + in(i-4,j+2) * (-0.0017857142857142857d0) & + + in(i-3,j+2) * (-0.0033333333333333335d0) & + + in(i-2,j+2) * (-0.008333333333333333d0) & + + in(i-1,j+2) * (-0.05d0) & + + in(i+1,j+2) * (0.008333333333333333d0) & + + in(i+2,j+2) * (0.025d0) & + + in(i+3,j+2) * (0.0033333333333333335d0) & + + in(i+4,j+2) * (0.0017857142857142857d0) & + + in(i+5,j+2) * (0.0011111111111111111d0) & + + in(i-5,j+3) * (-0.0011111111111111111d0) & + + in(i-4,j+3) * (-0.0017857142857142857d0) & + + in(i-3,j+3) * (-0.0033333333333333335d0) & + + in(i-2,j+3) * (-0.008333333333333333d0) & + + in(i-1,j+3) * (-0.05d0) & + + in(i+1,j+3) * (0.0033333333333333335d0) & + + in(i+2,j+3) * (0.0033333333333333335d0) & + + in(i+3,j+3) * (0.016666666666666666d0) & + + in(i+4,j+3) * (0.0017857142857142857d0) & + + in(i+5,j+3) * (0.0011111111111111111d0) & + + in(i-5,j+4) * (-0.0011111111111111111d0) & + + in(i-4,j+4) * (-0.0017857142857142857d0) & + + in(i-3,j+4) * 
(-0.0033333333333333335d0) & + + in(i-2,j+4) * (-0.008333333333333333d0) & + + in(i-1,j+4) * (-0.05d0) & + + in(i+1,j+4) * (0.0017857142857142857d0) & + + in(i+2,j+4) * (0.0017857142857142857d0) & + + in(i+3,j+4) * (0.0017857142857142857d0) & + + in(i+4,j+4) * (0.0125d0) & + + in(i+5,j+4) * (0.0011111111111111111d0) & + + in(i-5,j+5) * (-0.0011111111111111111d0) & + + in(i-4,j+5) * (-0.0017857142857142857d0) & + + in(i-3,j+5) * (-0.0033333333333333335d0) & + + in(i-2,j+5) * (-0.008333333333333333d0) & + + in(i-1,j+5) * (-0.05d0) & + + in(i+1,j+5) * (0.0011111111111111111d0) & + + in(i+2,j+5) * (0.0011111111111111111d0) & + + in(i+3,j+5) * (0.0011111111111111111d0) & + + in(i+4,j+5) * (0.0011111111111111111d0) & + + in(i+5,j+5) * (0.01d0) & +0.0 end do !$omp end simd @@ -646,125 +632,124 @@ subroutine grid6(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=6,n-6-1 - !$omp simd do j=6,n-6-1 out(i,j) = out(i,j) & - + in(i-6,j-6) * (-0.006944444444444444) & - + in(i+1,j-6) * (-0.0006313131313131314) & - + in(i+2,j-6) * (-0.0006313131313131314) & - + in(i+3,j-6) * (-0.0006313131313131314) & - + in(i+4,j-6) * (-0.0006313131313131314) & - + in(i+5,j-6) * (-0.0006313131313131314) & - + in(i+6,j-6) * (-0.0006313131313131314) & - + in(i-5,j-5) * (-0.008333333333333333) & - + in(i+1,j-5) * (-0.000925925925925926) & - + in(i+2,j-5) * (-0.000925925925925926) & - + in(i+3,j-5) * (-0.000925925925925926) & - + in(i+4,j-5) * (-0.000925925925925926) & - + in(i+5,j-5) * (-0.000925925925925926) & - + in(i+6,j-5) * (-0.000925925925925926) & - + in(i-4,j-4) * (-0.010416666666666666) & - + in(i+1,j-4) * (-0.001488095238095238) & - + in(i+2,j-4) * (-0.001488095238095238) & - + in(i+3,j-4) * (-0.001488095238095238) & - + in(i+4,j-4) * (-0.001488095238095238) & - + in(i+5,j-4) * (-0.001488095238095238) & - + in(i+6,j-4) * 
(-0.001488095238095238) & - + in(i-3,j-3) * (-0.013888888888888888) & - + in(i+1,j-3) * (-0.002777777777777778) & - + in(i+2,j-3) * (-0.002777777777777778) & - + in(i+3,j-3) * (-0.002777777777777778) & - + in(i+4,j-3) * (-0.002777777777777778) & - + in(i+5,j-3) * (-0.002777777777777778) & - + in(i+6,j-3) * (-0.002777777777777778) & - + in(i-2,j-2) * (-0.020833333333333332) & - + in(i+1,j-2) * (-0.006944444444444444) & - + in(i+2,j-2) * (-0.006944444444444444) & - + in(i+3,j-2) * (-0.006944444444444444) & - + in(i+4,j-2) * (-0.006944444444444444) & - + in(i+5,j-2) * (-0.006944444444444444) & - + in(i+6,j-2) * (-0.006944444444444444) & - + in(i-1,j-1) * (-0.041666666666666664) & - + in(i+1,j-1) * (-0.041666666666666664) & - + in(i+2,j-1) * (-0.041666666666666664) & - + in(i+3,j-1) * (-0.041666666666666664) & - + in(i+4,j-1) * (-0.041666666666666664) & - + in(i+5,j-1) * (-0.041666666666666664) & - + in(i+6,j-1) * (-0.041666666666666664) & - + in(i-6,j+1) * (-0.0006313131313131314) & - + in(i-5,j+1) * (-0.000925925925925926) & - + in(i-4,j+1) * (-0.001488095238095238) & - + in(i-3,j+1) * (-0.002777777777777778) & - + in(i-2,j+1) * (-0.006944444444444444) & - + in(i-1,j+1) * (-0.041666666666666664) & - + in(i+1,j+1) * (0.041666666666666664) & - + in(i+2,j+1) * (0.006944444444444444) & - + in(i+3,j+1) * (0.002777777777777778) & - + in(i+4,j+1) * (0.001488095238095238) & - + in(i+5,j+1) * (0.000925925925925926) & - + in(i+6,j+1) * (0.0006313131313131314) & - + in(i-6,j+2) * (-0.0006313131313131314) & - + in(i-5,j+2) * (-0.000925925925925926) & - + in(i-4,j+2) * (-0.001488095238095238) & - + in(i-3,j+2) * (-0.002777777777777778) & - + in(i-2,j+2) * (-0.006944444444444444) & - + in(i-1,j+2) * (-0.041666666666666664) & - + in(i+1,j+2) * (0.006944444444444444) & - + in(i+2,j+2) * (0.020833333333333332) & - + in(i+3,j+2) * (0.002777777777777778) & - + in(i+4,j+2) * (0.001488095238095238) & - + in(i+5,j+2) * (0.000925925925925926) & - + in(i+6,j+2) * (0.0006313131313131314) & - 
+ in(i-6,j+3) * (-0.0006313131313131314) & - + in(i-5,j+3) * (-0.000925925925925926) & - + in(i-4,j+3) * (-0.001488095238095238) & - + in(i-3,j+3) * (-0.002777777777777778) & - + in(i-2,j+3) * (-0.006944444444444444) & - + in(i-1,j+3) * (-0.041666666666666664) & - + in(i+1,j+3) * (0.002777777777777778) & - + in(i+2,j+3) * (0.002777777777777778) & - + in(i+3,j+3) * (0.013888888888888888) & - + in(i+4,j+3) * (0.001488095238095238) & - + in(i+5,j+3) * (0.000925925925925926) & - + in(i+6,j+3) * (0.0006313131313131314) & - + in(i-6,j+4) * (-0.0006313131313131314) & - + in(i-5,j+4) * (-0.000925925925925926) & - + in(i-4,j+4) * (-0.001488095238095238) & - + in(i-3,j+4) * (-0.002777777777777778) & - + in(i-2,j+4) * (-0.006944444444444444) & - + in(i-1,j+4) * (-0.041666666666666664) & - + in(i+1,j+4) * (0.001488095238095238) & - + in(i+2,j+4) * (0.001488095238095238) & - + in(i+3,j+4) * (0.001488095238095238) & - + in(i+4,j+4) * (0.010416666666666666) & - + in(i+5,j+4) * (0.000925925925925926) & - + in(i+6,j+4) * (0.0006313131313131314) & - + in(i-6,j+5) * (-0.0006313131313131314) & - + in(i-5,j+5) * (-0.000925925925925926) & - + in(i-4,j+5) * (-0.001488095238095238) & - + in(i-3,j+5) * (-0.002777777777777778) & - + in(i-2,j+5) * (-0.006944444444444444) & - + in(i-1,j+5) * (-0.041666666666666664) & - + in(i+1,j+5) * (0.000925925925925926) & - + in(i+2,j+5) * (0.000925925925925926) & - + in(i+3,j+5) * (0.000925925925925926) & - + in(i+4,j+5) * (0.000925925925925926) & - + in(i+5,j+5) * (0.008333333333333333) & - + in(i+6,j+5) * (0.0006313131313131314) & - + in(i-6,j+6) * (-0.0006313131313131314) & - + in(i-5,j+6) * (-0.000925925925925926) & - + in(i-4,j+6) * (-0.001488095238095238) & - + in(i-3,j+6) * (-0.002777777777777778) & - + in(i-2,j+6) * (-0.006944444444444444) & - + in(i-1,j+6) * (-0.041666666666666664) & - + in(i+1,j+6) * (0.0006313131313131314) & - + in(i+2,j+6) * (0.0006313131313131314) & - + in(i+3,j+6) * (0.0006313131313131314) & - + in(i+4,j+6) * 
(0.0006313131313131314) & - + in(i+5,j+6) * (0.0006313131313131314) & - + in(i+6,j+6) * (0.006944444444444444) & + + in(i-6,j-6) * (-0.006944444444444444d0) & + + in(i+1,j-6) * (-0.0006313131313131314d0) & + + in(i+2,j-6) * (-0.0006313131313131314d0) & + + in(i+3,j-6) * (-0.0006313131313131314d0) & + + in(i+4,j-6) * (-0.0006313131313131314d0) & + + in(i+5,j-6) * (-0.0006313131313131314d0) & + + in(i+6,j-6) * (-0.0006313131313131314d0) & + + in(i-5,j-5) * (-0.008333333333333333d0) & + + in(i+1,j-5) * (-0.000925925925925926d0) & + + in(i+2,j-5) * (-0.000925925925925926d0) & + + in(i+3,j-5) * (-0.000925925925925926d0) & + + in(i+4,j-5) * (-0.000925925925925926d0) & + + in(i+5,j-5) * (-0.000925925925925926d0) & + + in(i+6,j-5) * (-0.000925925925925926d0) & + + in(i-4,j-4) * (-0.010416666666666666d0) & + + in(i+1,j-4) * (-0.001488095238095238d0) & + + in(i+2,j-4) * (-0.001488095238095238d0) & + + in(i+3,j-4) * (-0.001488095238095238d0) & + + in(i+4,j-4) * (-0.001488095238095238d0) & + + in(i+5,j-4) * (-0.001488095238095238d0) & + + in(i+6,j-4) * (-0.001488095238095238d0) & + + in(i-3,j-3) * (-0.013888888888888888d0) & + + in(i+1,j-3) * (-0.002777777777777778d0) & + + in(i+2,j-3) * (-0.002777777777777778d0) & + + in(i+3,j-3) * (-0.002777777777777778d0) & + + in(i+4,j-3) * (-0.002777777777777778d0) & + + in(i+5,j-3) * (-0.002777777777777778d0) & + + in(i+6,j-3) * (-0.002777777777777778d0) & + + in(i-2,j-2) * (-0.020833333333333332d0) & + + in(i+1,j-2) * (-0.006944444444444444d0) & + + in(i+2,j-2) * (-0.006944444444444444d0) & + + in(i+3,j-2) * (-0.006944444444444444d0) & + + in(i+4,j-2) * (-0.006944444444444444d0) & + + in(i+5,j-2) * (-0.006944444444444444d0) & + + in(i+6,j-2) * (-0.006944444444444444d0) & + + in(i-1,j-1) * (-0.041666666666666664d0) & + + in(i+1,j-1) * (-0.041666666666666664d0) & + + in(i+2,j-1) * (-0.041666666666666664d0) & + + in(i+3,j-1) * (-0.041666666666666664d0) & + + in(i+4,j-1) * (-0.041666666666666664d0) & + + in(i+5,j-1) * 
(-0.041666666666666664d0) & + + in(i+6,j-1) * (-0.041666666666666664d0) & + + in(i-6,j+1) * (-0.0006313131313131314d0) & + + in(i-5,j+1) * (-0.000925925925925926d0) & + + in(i-4,j+1) * (-0.001488095238095238d0) & + + in(i-3,j+1) * (-0.002777777777777778d0) & + + in(i-2,j+1) * (-0.006944444444444444d0) & + + in(i-1,j+1) * (-0.041666666666666664d0) & + + in(i+1,j+1) * (0.041666666666666664d0) & + + in(i+2,j+1) * (0.006944444444444444d0) & + + in(i+3,j+1) * (0.002777777777777778d0) & + + in(i+4,j+1) * (0.001488095238095238d0) & + + in(i+5,j+1) * (0.000925925925925926d0) & + + in(i+6,j+1) * (0.0006313131313131314d0) & + + in(i-6,j+2) * (-0.0006313131313131314d0) & + + in(i-5,j+2) * (-0.000925925925925926d0) & + + in(i-4,j+2) * (-0.001488095238095238d0) & + + in(i-3,j+2) * (-0.002777777777777778d0) & + + in(i-2,j+2) * (-0.006944444444444444d0) & + + in(i-1,j+2) * (-0.041666666666666664d0) & + + in(i+1,j+2) * (0.006944444444444444d0) & + + in(i+2,j+2) * (0.020833333333333332d0) & + + in(i+3,j+2) * (0.002777777777777778d0) & + + in(i+4,j+2) * (0.001488095238095238d0) & + + in(i+5,j+2) * (0.000925925925925926d0) & + + in(i+6,j+2) * (0.0006313131313131314d0) & + + in(i-6,j+3) * (-0.0006313131313131314d0) & + + in(i-5,j+3) * (-0.000925925925925926d0) & + + in(i-4,j+3) * (-0.001488095238095238d0) & + + in(i-3,j+3) * (-0.002777777777777778d0) & + + in(i-2,j+3) * (-0.006944444444444444d0) & + + in(i-1,j+3) * (-0.041666666666666664d0) & + + in(i+1,j+3) * (0.002777777777777778d0) & + + in(i+2,j+3) * (0.002777777777777778d0) & + + in(i+3,j+3) * (0.013888888888888888d0) & + + in(i+4,j+3) * (0.001488095238095238d0) & + + in(i+5,j+3) * (0.000925925925925926d0) & + + in(i+6,j+3) * (0.0006313131313131314d0) & + + in(i-6,j+4) * (-0.0006313131313131314d0) & + + in(i-5,j+4) * (-0.000925925925925926d0) & + + in(i-4,j+4) * (-0.001488095238095238d0) & + + in(i-3,j+4) * (-0.002777777777777778d0) & + + in(i-2,j+4) * (-0.006944444444444444d0) & + + in(i-1,j+4) * (-0.041666666666666664d0) & + + 
in(i+1,j+4) * (0.001488095238095238d0) & + + in(i+2,j+4) * (0.001488095238095238d0) & + + in(i+3,j+4) * (0.001488095238095238d0) & + + in(i+4,j+4) * (0.010416666666666666d0) & + + in(i+5,j+4) * (0.000925925925925926d0) & + + in(i+6,j+4) * (0.0006313131313131314d0) & + + in(i-6,j+5) * (-0.0006313131313131314d0) & + + in(i-5,j+5) * (-0.000925925925925926d0) & + + in(i-4,j+5) * (-0.001488095238095238d0) & + + in(i-3,j+5) * (-0.002777777777777778d0) & + + in(i-2,j+5) * (-0.006944444444444444d0) & + + in(i-1,j+5) * (-0.041666666666666664d0) & + + in(i+1,j+5) * (0.000925925925925926d0) & + + in(i+2,j+5) * (0.000925925925925926d0) & + + in(i+3,j+5) * (0.000925925925925926d0) & + + in(i+4,j+5) * (0.000925925925925926d0) & + + in(i+5,j+5) * (0.008333333333333333d0) & + + in(i+6,j+5) * (0.0006313131313131314d0) & + + in(i-6,j+6) * (-0.0006313131313131314d0) & + + in(i-5,j+6) * (-0.000925925925925926d0) & + + in(i-4,j+6) * (-0.001488095238095238d0) & + + in(i-3,j+6) * (-0.002777777777777778d0) & + + in(i-2,j+6) * (-0.006944444444444444d0) & + + in(i-1,j+6) * (-0.041666666666666664d0) & + + in(i+1,j+6) * (0.0006313131313131314d0) & + + in(i+2,j+6) * (0.0006313131313131314d0) & + + in(i+3,j+6) * (0.0006313131313131314d0) & + + in(i+4,j+6) * (0.0006313131313131314d0) & + + in(i+5,j+6) * (0.0006313131313131314d0) & + + in(i+6,j+6) * (0.006944444444444444d0) & +0.0 end do !$omp end simd @@ -780,165 +765,164 @@ subroutine grid7(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=7,n-7-1 - !$omp simd do j=7,n-7-1 out(i,j) = out(i,j) & - + in(i-7,j-7) * (-0.00510204081632653) & - + in(i+1,j-7) * (-0.0003924646781789639) & - + in(i+2,j-7) * (-0.0003924646781789639) & - + in(i+3,j-7) * (-0.0003924646781789639) & - + in(i+4,j-7) * (-0.0003924646781789639) & - + in(i+5,j-7) * (-0.0003924646781789639) & - + in(i+6,j-7) * 
(-0.0003924646781789639) & - + in(i+7,j-7) * (-0.0003924646781789639) & - + in(i-6,j-6) * (-0.005952380952380952) & - + in(i+1,j-6) * (-0.0005411255411255411) & - + in(i+2,j-6) * (-0.0005411255411255411) & - + in(i+3,j-6) * (-0.0005411255411255411) & - + in(i+4,j-6) * (-0.0005411255411255411) & - + in(i+5,j-6) * (-0.0005411255411255411) & - + in(i+6,j-6) * (-0.0005411255411255411) & - + in(i+7,j-6) * (-0.0005411255411255411) & - + in(i-5,j-5) * (-0.007142857142857143) & - + in(i+1,j-5) * (-0.0007936507936507937) & - + in(i+2,j-5) * (-0.0007936507936507937) & - + in(i+3,j-5) * (-0.0007936507936507937) & - + in(i+4,j-5) * (-0.0007936507936507937) & - + in(i+5,j-5) * (-0.0007936507936507937) & - + in(i+6,j-5) * (-0.0007936507936507937) & - + in(i+7,j-5) * (-0.0007936507936507937) & - + in(i-4,j-4) * (-0.008928571428571428) & - + in(i+1,j-4) * (-0.0012755102040816326) & - + in(i+2,j-4) * (-0.0012755102040816326) & - + in(i+3,j-4) * (-0.0012755102040816326) & - + in(i+4,j-4) * (-0.0012755102040816326) & - + in(i+5,j-4) * (-0.0012755102040816326) & - + in(i+6,j-4) * (-0.0012755102040816326) & - + in(i+7,j-4) * (-0.0012755102040816326) & - + in(i-3,j-3) * (-0.011904761904761904) & - + in(i+1,j-3) * (-0.002380952380952381) & - + in(i+2,j-3) * (-0.002380952380952381) & - + in(i+3,j-3) * (-0.002380952380952381) & - + in(i+4,j-3) * (-0.002380952380952381) & - + in(i+5,j-3) * (-0.002380952380952381) & - + in(i+6,j-3) * (-0.002380952380952381) & - + in(i+7,j-3) * (-0.002380952380952381) & - + in(i-2,j-2) * (-0.017857142857142856) & - + in(i+1,j-2) * (-0.005952380952380952) & - + in(i+2,j-2) * (-0.005952380952380952) & - + in(i+3,j-2) * (-0.005952380952380952) & - + in(i+4,j-2) * (-0.005952380952380952) & - + in(i+5,j-2) * (-0.005952380952380952) & - + in(i+6,j-2) * (-0.005952380952380952) & - + in(i+7,j-2) * (-0.005952380952380952) & - + in(i-1,j-1) * (-0.03571428571428571) & - + in(i+1,j-1) * (-0.03571428571428571) & - + in(i+2,j-1) * (-0.03571428571428571) & - + in(i+3,j-1) * 
(-0.03571428571428571) & - + in(i+4,j-1) * (-0.03571428571428571) & - + in(i+5,j-1) * (-0.03571428571428571) & - + in(i+6,j-1) * (-0.03571428571428571) & - + in(i+7,j-1) * (-0.03571428571428571) & - + in(i-7,j+1) * (-0.0003924646781789639) & - + in(i-6,j+1) * (-0.0005411255411255411) & - + in(i-5,j+1) * (-0.0007936507936507937) & - + in(i-4,j+1) * (-0.0012755102040816326) & - + in(i-3,j+1) * (-0.002380952380952381) & - + in(i-2,j+1) * (-0.005952380952380952) & - + in(i-1,j+1) * (-0.03571428571428571) & - + in(i+1,j+1) * (0.03571428571428571) & - + in(i+2,j+1) * (0.005952380952380952) & - + in(i+3,j+1) * (0.002380952380952381) & - + in(i+4,j+1) * (0.0012755102040816326) & - + in(i+5,j+1) * (0.0007936507936507937) & - + in(i+6,j+1) * (0.0005411255411255411) & - + in(i+7,j+1) * (0.0003924646781789639) & - + in(i-7,j+2) * (-0.0003924646781789639) & - + in(i-6,j+2) * (-0.0005411255411255411) & - + in(i-5,j+2) * (-0.0007936507936507937) & - + in(i-4,j+2) * (-0.0012755102040816326) & - + in(i-3,j+2) * (-0.002380952380952381) & - + in(i-2,j+2) * (-0.005952380952380952) & - + in(i-1,j+2) * (-0.03571428571428571) & - + in(i+1,j+2) * (0.005952380952380952) & - + in(i+2,j+2) * (0.017857142857142856) & - + in(i+3,j+2) * (0.002380952380952381) & - + in(i+4,j+2) * (0.0012755102040816326) & - + in(i+5,j+2) * (0.0007936507936507937) & - + in(i+6,j+2) * (0.0005411255411255411) & - + in(i+7,j+2) * (0.0003924646781789639) & - + in(i-7,j+3) * (-0.0003924646781789639) & - + in(i-6,j+3) * (-0.0005411255411255411) & - + in(i-5,j+3) * (-0.0007936507936507937) & - + in(i-4,j+3) * (-0.0012755102040816326) & - + in(i-3,j+3) * (-0.002380952380952381) & - + in(i-2,j+3) * (-0.005952380952380952) & - + in(i-1,j+3) * (-0.03571428571428571) & - + in(i+1,j+3) * (0.002380952380952381) & - + in(i+2,j+3) * (0.002380952380952381) & - + in(i+3,j+3) * (0.011904761904761904) & - + in(i+4,j+3) * (0.0012755102040816326) & - + in(i+5,j+3) * (0.0007936507936507937) & - + in(i+6,j+3) * (0.0005411255411255411) & 
- + in(i+7,j+3) * (0.0003924646781789639) & - + in(i-7,j+4) * (-0.0003924646781789639) & - + in(i-6,j+4) * (-0.0005411255411255411) & - + in(i-5,j+4) * (-0.0007936507936507937) & - + in(i-4,j+4) * (-0.0012755102040816326) & - + in(i-3,j+4) * (-0.002380952380952381) & - + in(i-2,j+4) * (-0.005952380952380952) & - + in(i-1,j+4) * (-0.03571428571428571) & - + in(i+1,j+4) * (0.0012755102040816326) & - + in(i+2,j+4) * (0.0012755102040816326) & - + in(i+3,j+4) * (0.0012755102040816326) & - + in(i+4,j+4) * (0.008928571428571428) & - + in(i+5,j+4) * (0.0007936507936507937) & - + in(i+6,j+4) * (0.0005411255411255411) & - + in(i+7,j+4) * (0.0003924646781789639) & - + in(i-7,j+5) * (-0.0003924646781789639) & - + in(i-6,j+5) * (-0.0005411255411255411) & - + in(i-5,j+5) * (-0.0007936507936507937) & - + in(i-4,j+5) * (-0.0012755102040816326) & - + in(i-3,j+5) * (-0.002380952380952381) & - + in(i-2,j+5) * (-0.005952380952380952) & - + in(i-1,j+5) * (-0.03571428571428571) & - + in(i+1,j+5) * (0.0007936507936507937) & - + in(i+2,j+5) * (0.0007936507936507937) & - + in(i+3,j+5) * (0.0007936507936507937) & - + in(i+4,j+5) * (0.0007936507936507937) & - + in(i+5,j+5) * (0.007142857142857143) & - + in(i+6,j+5) * (0.0005411255411255411) & - + in(i+7,j+5) * (0.0003924646781789639) & - + in(i-7,j+6) * (-0.0003924646781789639) & - + in(i-6,j+6) * (-0.0005411255411255411) & - + in(i-5,j+6) * (-0.0007936507936507937) & - + in(i-4,j+6) * (-0.0012755102040816326) & - + in(i-3,j+6) * (-0.002380952380952381) & - + in(i-2,j+6) * (-0.005952380952380952) & - + in(i-1,j+6) * (-0.03571428571428571) & - + in(i+1,j+6) * (0.0005411255411255411) & - + in(i+2,j+6) * (0.0005411255411255411) & - + in(i+3,j+6) * (0.0005411255411255411) & - + in(i+4,j+6) * (0.0005411255411255411) & - + in(i+5,j+6) * (0.0005411255411255411) & - + in(i+6,j+6) * (0.005952380952380952) & - + in(i+7,j+6) * (0.0003924646781789639) & - + in(i-7,j+7) * (-0.0003924646781789639) & - + in(i-6,j+7) * (-0.0005411255411255411) & - + 
in(i-5,j+7) * (-0.0007936507936507937) & - + in(i-4,j+7) * (-0.0012755102040816326) & - + in(i-3,j+7) * (-0.002380952380952381) & - + in(i-2,j+7) * (-0.005952380952380952) & - + in(i-1,j+7) * (-0.03571428571428571) & - + in(i+1,j+7) * (0.0003924646781789639) & - + in(i+2,j+7) * (0.0003924646781789639) & - + in(i+3,j+7) * (0.0003924646781789639) & - + in(i+4,j+7) * (0.0003924646781789639) & - + in(i+5,j+7) * (0.0003924646781789639) & - + in(i+6,j+7) * (0.0003924646781789639) & - + in(i+7,j+7) * (0.00510204081632653) & + + in(i-7,j-7) * (-0.00510204081632653d0) & + + in(i+1,j-7) * (-0.0003924646781789639d0) & + + in(i+2,j-7) * (-0.0003924646781789639d0) & + + in(i+3,j-7) * (-0.0003924646781789639d0) & + + in(i+4,j-7) * (-0.0003924646781789639d0) & + + in(i+5,j-7) * (-0.0003924646781789639d0) & + + in(i+6,j-7) * (-0.0003924646781789639d0) & + + in(i+7,j-7) * (-0.0003924646781789639d0) & + + in(i-6,j-6) * (-0.005952380952380952d0) & + + in(i+1,j-6) * (-0.0005411255411255411d0) & + + in(i+2,j-6) * (-0.0005411255411255411d0) & + + in(i+3,j-6) * (-0.0005411255411255411d0) & + + in(i+4,j-6) * (-0.0005411255411255411d0) & + + in(i+5,j-6) * (-0.0005411255411255411d0) & + + in(i+6,j-6) * (-0.0005411255411255411d0) & + + in(i+7,j-6) * (-0.0005411255411255411d0) & + + in(i-5,j-5) * (-0.007142857142857143d0) & + + in(i+1,j-5) * (-0.0007936507936507937d0) & + + in(i+2,j-5) * (-0.0007936507936507937d0) & + + in(i+3,j-5) * (-0.0007936507936507937d0) & + + in(i+4,j-5) * (-0.0007936507936507937d0) & + + in(i+5,j-5) * (-0.0007936507936507937d0) & + + in(i+6,j-5) * (-0.0007936507936507937d0) & + + in(i+7,j-5) * (-0.0007936507936507937d0) & + + in(i-4,j-4) * (-0.008928571428571428d0) & + + in(i+1,j-4) * (-0.0012755102040816326d0) & + + in(i+2,j-4) * (-0.0012755102040816326d0) & + + in(i+3,j-4) * (-0.0012755102040816326d0) & + + in(i+4,j-4) * (-0.0012755102040816326d0) & + + in(i+5,j-4) * (-0.0012755102040816326d0) & + + in(i+6,j-4) * (-0.0012755102040816326d0) & + + in(i+7,j-4) * 
(-0.0012755102040816326d0) & + + in(i-3,j-3) * (-0.011904761904761904d0) & + + in(i+1,j-3) * (-0.002380952380952381d0) & + + in(i+2,j-3) * (-0.002380952380952381d0) & + + in(i+3,j-3) * (-0.002380952380952381d0) & + + in(i+4,j-3) * (-0.002380952380952381d0) & + + in(i+5,j-3) * (-0.002380952380952381d0) & + + in(i+6,j-3) * (-0.002380952380952381d0) & + + in(i+7,j-3) * (-0.002380952380952381d0) & + + in(i-2,j-2) * (-0.017857142857142856d0) & + + in(i+1,j-2) * (-0.005952380952380952d0) & + + in(i+2,j-2) * (-0.005952380952380952d0) & + + in(i+3,j-2) * (-0.005952380952380952d0) & + + in(i+4,j-2) * (-0.005952380952380952d0) & + + in(i+5,j-2) * (-0.005952380952380952d0) & + + in(i+6,j-2) * (-0.005952380952380952d0) & + + in(i+7,j-2) * (-0.005952380952380952d0) & + + in(i-1,j-1) * (-0.03571428571428571d0) & + + in(i+1,j-1) * (-0.03571428571428571d0) & + + in(i+2,j-1) * (-0.03571428571428571d0) & + + in(i+3,j-1) * (-0.03571428571428571d0) & + + in(i+4,j-1) * (-0.03571428571428571d0) & + + in(i+5,j-1) * (-0.03571428571428571d0) & + + in(i+6,j-1) * (-0.03571428571428571d0) & + + in(i+7,j-1) * (-0.03571428571428571d0) & + + in(i-7,j+1) * (-0.0003924646781789639d0) & + + in(i-6,j+1) * (-0.0005411255411255411d0) & + + in(i-5,j+1) * (-0.0007936507936507937d0) & + + in(i-4,j+1) * (-0.0012755102040816326d0) & + + in(i-3,j+1) * (-0.002380952380952381d0) & + + in(i-2,j+1) * (-0.005952380952380952d0) & + + in(i-1,j+1) * (-0.03571428571428571d0) & + + in(i+1,j+1) * (0.03571428571428571d0) & + + in(i+2,j+1) * (0.005952380952380952d0) & + + in(i+3,j+1) * (0.002380952380952381d0) & + + in(i+4,j+1) * (0.0012755102040816326d0) & + + in(i+5,j+1) * (0.0007936507936507937d0) & + + in(i+6,j+1) * (0.0005411255411255411d0) & + + in(i+7,j+1) * (0.0003924646781789639d0) & + + in(i-7,j+2) * (-0.0003924646781789639d0) & + + in(i-6,j+2) * (-0.0005411255411255411d0) & + + in(i-5,j+2) * (-0.0007936507936507937d0) & + + in(i-4,j+2) * (-0.0012755102040816326d0) & + + in(i-3,j+2) * (-0.002380952380952381d0) 
& + + in(i-2,j+2) * (-0.005952380952380952d0) & + + in(i-1,j+2) * (-0.03571428571428571d0) & + + in(i+1,j+2) * (0.005952380952380952d0) & + + in(i+2,j+2) * (0.017857142857142856d0) & + + in(i+3,j+2) * (0.002380952380952381d0) & + + in(i+4,j+2) * (0.0012755102040816326d0) & + + in(i+5,j+2) * (0.0007936507936507937d0) & + + in(i+6,j+2) * (0.0005411255411255411d0) & + + in(i+7,j+2) * (0.0003924646781789639d0) & + + in(i-7,j+3) * (-0.0003924646781789639d0) & + + in(i-6,j+3) * (-0.0005411255411255411d0) & + + in(i-5,j+3) * (-0.0007936507936507937d0) & + + in(i-4,j+3) * (-0.0012755102040816326d0) & + + in(i-3,j+3) * (-0.002380952380952381d0) & + + in(i-2,j+3) * (-0.005952380952380952d0) & + + in(i-1,j+3) * (-0.03571428571428571d0) & + + in(i+1,j+3) * (0.002380952380952381d0) & + + in(i+2,j+3) * (0.002380952380952381d0) & + + in(i+3,j+3) * (0.011904761904761904d0) & + + in(i+4,j+3) * (0.0012755102040816326d0) & + + in(i+5,j+3) * (0.0007936507936507937d0) & + + in(i+6,j+3) * (0.0005411255411255411d0) & + + in(i+7,j+3) * (0.0003924646781789639d0) & + + in(i-7,j+4) * (-0.0003924646781789639d0) & + + in(i-6,j+4) * (-0.0005411255411255411d0) & + + in(i-5,j+4) * (-0.0007936507936507937d0) & + + in(i-4,j+4) * (-0.0012755102040816326d0) & + + in(i-3,j+4) * (-0.002380952380952381d0) & + + in(i-2,j+4) * (-0.005952380952380952d0) & + + in(i-1,j+4) * (-0.03571428571428571d0) & + + in(i+1,j+4) * (0.0012755102040816326d0) & + + in(i+2,j+4) * (0.0012755102040816326d0) & + + in(i+3,j+4) * (0.0012755102040816326d0) & + + in(i+4,j+4) * (0.008928571428571428d0) & + + in(i+5,j+4) * (0.0007936507936507937d0) & + + in(i+6,j+4) * (0.0005411255411255411d0) & + + in(i+7,j+4) * (0.0003924646781789639d0) & + + in(i-7,j+5) * (-0.0003924646781789639d0) & + + in(i-6,j+5) * (-0.0005411255411255411d0) & + + in(i-5,j+5) * (-0.0007936507936507937d0) & + + in(i-4,j+5) * (-0.0012755102040816326d0) & + + in(i-3,j+5) * (-0.002380952380952381d0) & + + in(i-2,j+5) * (-0.005952380952380952d0) & + + in(i-1,j+5) * 
(-0.03571428571428571d0) & + + in(i+1,j+5) * (0.0007936507936507937d0) & + + in(i+2,j+5) * (0.0007936507936507937d0) & + + in(i+3,j+5) * (0.0007936507936507937d0) & + + in(i+4,j+5) * (0.0007936507936507937d0) & + + in(i+5,j+5) * (0.007142857142857143d0) & + + in(i+6,j+5) * (0.0005411255411255411d0) & + + in(i+7,j+5) * (0.0003924646781789639d0) & + + in(i-7,j+6) * (-0.0003924646781789639d0) & + + in(i-6,j+6) * (-0.0005411255411255411d0) & + + in(i-5,j+6) * (-0.0007936507936507937d0) & + + in(i-4,j+6) * (-0.0012755102040816326d0) & + + in(i-3,j+6) * (-0.002380952380952381d0) & + + in(i-2,j+6) * (-0.005952380952380952d0) & + + in(i-1,j+6) * (-0.03571428571428571d0) & + + in(i+1,j+6) * (0.0005411255411255411d0) & + + in(i+2,j+6) * (0.0005411255411255411d0) & + + in(i+3,j+6) * (0.0005411255411255411d0) & + + in(i+4,j+6) * (0.0005411255411255411d0) & + + in(i+5,j+6) * (0.0005411255411255411d0) & + + in(i+6,j+6) * (0.005952380952380952d0) & + + in(i+7,j+6) * (0.0003924646781789639d0) & + + in(i-7,j+7) * (-0.0003924646781789639d0) & + + in(i-6,j+7) * (-0.0005411255411255411d0) & + + in(i-5,j+7) * (-0.0007936507936507937d0) & + + in(i-4,j+7) * (-0.0012755102040816326d0) & + + in(i-3,j+7) * (-0.002380952380952381d0) & + + in(i-2,j+7) * (-0.005952380952380952d0) & + + in(i-1,j+7) * (-0.03571428571428571d0) & + + in(i+1,j+7) * (0.0003924646781789639d0) & + + in(i+2,j+7) * (0.0003924646781789639d0) & + + in(i+3,j+7) * (0.0003924646781789639d0) & + + in(i+4,j+7) * (0.0003924646781789639d0) & + + in(i+5,j+7) * (0.0003924646781789639d0) & + + in(i+6,j+7) * (0.0003924646781789639d0) & + + in(i+7,j+7) * (0.00510204081632653d0) & +0.0 end do !$omp end simd @@ -954,211 +938,210 @@ subroutine grid8(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=8,n-8-1 - !$omp simd do j=8,n-8-1 out(i,j) = out(i,j) & - + in(i-8,j-8) * 
(-0.00390625) & - + in(i+1,j-8) * (-0.00026041666666666666) & - + in(i+2,j-8) * (-0.00026041666666666666) & - + in(i+3,j-8) * (-0.00026041666666666666) & - + in(i+4,j-8) * (-0.00026041666666666666) & - + in(i+5,j-8) * (-0.00026041666666666666) & - + in(i+6,j-8) * (-0.00026041666666666666) & - + in(i+7,j-8) * (-0.00026041666666666666) & - + in(i+8,j-8) * (-0.00026041666666666666) & - + in(i-7,j-7) * (-0.004464285714285714) & - + in(i+1,j-7) * (-0.00034340659340659343) & - + in(i+2,j-7) * (-0.00034340659340659343) & - + in(i+3,j-7) * (-0.00034340659340659343) & - + in(i+4,j-7) * (-0.00034340659340659343) & - + in(i+5,j-7) * (-0.00034340659340659343) & - + in(i+6,j-7) * (-0.00034340659340659343) & - + in(i+7,j-7) * (-0.00034340659340659343) & - + in(i+8,j-7) * (-0.00034340659340659343) & - + in(i-6,j-6) * (-0.005208333333333333) & - + in(i+1,j-6) * (-0.0004734848484848485) & - + in(i+2,j-6) * (-0.0004734848484848485) & - + in(i+3,j-6) * (-0.0004734848484848485) & - + in(i+4,j-6) * (-0.0004734848484848485) & - + in(i+5,j-6) * (-0.0004734848484848485) & - + in(i+6,j-6) * (-0.0004734848484848485) & - + in(i+7,j-6) * (-0.0004734848484848485) & - + in(i+8,j-6) * (-0.0004734848484848485) & - + in(i-5,j-5) * (-0.00625) & - + in(i+1,j-5) * (-0.0006944444444444445) & - + in(i+2,j-5) * (-0.0006944444444444445) & - + in(i+3,j-5) * (-0.0006944444444444445) & - + in(i+4,j-5) * (-0.0006944444444444445) & - + in(i+5,j-5) * (-0.0006944444444444445) & - + in(i+6,j-5) * (-0.0006944444444444445) & - + in(i+7,j-5) * (-0.0006944444444444445) & - + in(i+8,j-5) * (-0.0006944444444444445) & - + in(i-4,j-4) * (-0.0078125) & - + in(i+1,j-4) * (-0.0011160714285714285) & - + in(i+2,j-4) * (-0.0011160714285714285) & - + in(i+3,j-4) * (-0.0011160714285714285) & - + in(i+4,j-4) * (-0.0011160714285714285) & - + in(i+5,j-4) * (-0.0011160714285714285) & - + in(i+6,j-4) * (-0.0011160714285714285) & - + in(i+7,j-4) * (-0.0011160714285714285) & - + in(i+8,j-4) * (-0.0011160714285714285) & - + in(i-3,j-3) 
* (-0.010416666666666666) & - + in(i+1,j-3) * (-0.0020833333333333333) & - + in(i+2,j-3) * (-0.0020833333333333333) & - + in(i+3,j-3) * (-0.0020833333333333333) & - + in(i+4,j-3) * (-0.0020833333333333333) & - + in(i+5,j-3) * (-0.0020833333333333333) & - + in(i+6,j-3) * (-0.0020833333333333333) & - + in(i+7,j-3) * (-0.0020833333333333333) & - + in(i+8,j-3) * (-0.0020833333333333333) & - + in(i-2,j-2) * (-0.015625) & - + in(i+1,j-2) * (-0.005208333333333333) & - + in(i+2,j-2) * (-0.005208333333333333) & - + in(i+3,j-2) * (-0.005208333333333333) & - + in(i+4,j-2) * (-0.005208333333333333) & - + in(i+5,j-2) * (-0.005208333333333333) & - + in(i+6,j-2) * (-0.005208333333333333) & - + in(i+7,j-2) * (-0.005208333333333333) & - + in(i+8,j-2) * (-0.005208333333333333) & - + in(i-1,j-1) * (-0.03125) & - + in(i+1,j-1) * (-0.03125) & - + in(i+2,j-1) * (-0.03125) & - + in(i+3,j-1) * (-0.03125) & - + in(i+4,j-1) * (-0.03125) & - + in(i+5,j-1) * (-0.03125) & - + in(i+6,j-1) * (-0.03125) & - + in(i+7,j-1) * (-0.03125) & - + in(i+8,j-1) * (-0.03125) & - + in(i-8,j+1) * (-0.00026041666666666666) & - + in(i-7,j+1) * (-0.00034340659340659343) & - + in(i-6,j+1) * (-0.0004734848484848485) & - + in(i-5,j+1) * (-0.0006944444444444445) & - + in(i-4,j+1) * (-0.0011160714285714285) & - + in(i-3,j+1) * (-0.0020833333333333333) & - + in(i-2,j+1) * (-0.005208333333333333) & - + in(i-1,j+1) * (-0.03125) & - + in(i+1,j+1) * (0.03125) & - + in(i+2,j+1) * (0.005208333333333333) & - + in(i+3,j+1) * (0.0020833333333333333) & - + in(i+4,j+1) * (0.0011160714285714285) & - + in(i+5,j+1) * (0.0006944444444444445) & - + in(i+6,j+1) * (0.0004734848484848485) & - + in(i+7,j+1) * (0.00034340659340659343) & - + in(i+8,j+1) * (0.00026041666666666666) & - + in(i-8,j+2) * (-0.00026041666666666666) & - + in(i-7,j+2) * (-0.00034340659340659343) & - + in(i-6,j+2) * (-0.0004734848484848485) & - + in(i-5,j+2) * (-0.0006944444444444445) & - + in(i-4,j+2) * (-0.0011160714285714285) & - + in(i-3,j+2) * 
(-0.0020833333333333333) & - + in(i-2,j+2) * (-0.005208333333333333) & - + in(i-1,j+2) * (-0.03125) & - + in(i+1,j+2) * (0.005208333333333333) & - + in(i+2,j+2) * (0.015625) & - + in(i+3,j+2) * (0.0020833333333333333) & - + in(i+4,j+2) * (0.0011160714285714285) & - + in(i+5,j+2) * (0.0006944444444444445) & - + in(i+6,j+2) * (0.0004734848484848485) & - + in(i+7,j+2) * (0.00034340659340659343) & - + in(i+8,j+2) * (0.00026041666666666666) & - + in(i-8,j+3) * (-0.00026041666666666666) & - + in(i-7,j+3) * (-0.00034340659340659343) & - + in(i-6,j+3) * (-0.0004734848484848485) & - + in(i-5,j+3) * (-0.0006944444444444445) & - + in(i-4,j+3) * (-0.0011160714285714285) & - + in(i-3,j+3) * (-0.0020833333333333333) & - + in(i-2,j+3) * (-0.005208333333333333) & - + in(i-1,j+3) * (-0.03125) & - + in(i+1,j+3) * (0.0020833333333333333) & - + in(i+2,j+3) * (0.0020833333333333333) & - + in(i+3,j+3) * (0.010416666666666666) & - + in(i+4,j+3) * (0.0011160714285714285) & - + in(i+5,j+3) * (0.0006944444444444445) & - + in(i+6,j+3) * (0.0004734848484848485) & - + in(i+7,j+3) * (0.00034340659340659343) & - + in(i+8,j+3) * (0.00026041666666666666) & - + in(i-8,j+4) * (-0.00026041666666666666) & - + in(i-7,j+4) * (-0.00034340659340659343) & - + in(i-6,j+4) * (-0.0004734848484848485) & - + in(i-5,j+4) * (-0.0006944444444444445) & - + in(i-4,j+4) * (-0.0011160714285714285) & - + in(i-3,j+4) * (-0.0020833333333333333) & - + in(i-2,j+4) * (-0.005208333333333333) & - + in(i-1,j+4) * (-0.03125) & - + in(i+1,j+4) * (0.0011160714285714285) & - + in(i+2,j+4) * (0.0011160714285714285) & - + in(i+3,j+4) * (0.0011160714285714285) & - + in(i+4,j+4) * (0.0078125) & - + in(i+5,j+4) * (0.0006944444444444445) & - + in(i+6,j+4) * (0.0004734848484848485) & - + in(i+7,j+4) * (0.00034340659340659343) & - + in(i+8,j+4) * (0.00026041666666666666) & - + in(i-8,j+5) * (-0.00026041666666666666) & - + in(i-7,j+5) * (-0.00034340659340659343) & - + in(i-6,j+5) * (-0.0004734848484848485) & - + in(i-5,j+5) * 
(-0.0006944444444444445) & - + in(i-4,j+5) * (-0.0011160714285714285) & - + in(i-3,j+5) * (-0.0020833333333333333) & - + in(i-2,j+5) * (-0.005208333333333333) & - + in(i-1,j+5) * (-0.03125) & - + in(i+1,j+5) * (0.0006944444444444445) & - + in(i+2,j+5) * (0.0006944444444444445) & - + in(i+3,j+5) * (0.0006944444444444445) & - + in(i+4,j+5) * (0.0006944444444444445) & - + in(i+5,j+5) * (0.00625) & - + in(i+6,j+5) * (0.0004734848484848485) & - + in(i+7,j+5) * (0.00034340659340659343) & - + in(i+8,j+5) * (0.00026041666666666666) & - + in(i-8,j+6) * (-0.00026041666666666666) & - + in(i-7,j+6) * (-0.00034340659340659343) & - + in(i-6,j+6) * (-0.0004734848484848485) & - + in(i-5,j+6) * (-0.0006944444444444445) & - + in(i-4,j+6) * (-0.0011160714285714285) & - + in(i-3,j+6) * (-0.0020833333333333333) & - + in(i-2,j+6) * (-0.005208333333333333) & - + in(i-1,j+6) * (-0.03125) & - + in(i+1,j+6) * (0.0004734848484848485) & - + in(i+2,j+6) * (0.0004734848484848485) & - + in(i+3,j+6) * (0.0004734848484848485) & - + in(i+4,j+6) * (0.0004734848484848485) & - + in(i+5,j+6) * (0.0004734848484848485) & - + in(i+6,j+6) * (0.005208333333333333) & - + in(i+7,j+6) * (0.00034340659340659343) & - + in(i+8,j+6) * (0.00026041666666666666) & - + in(i-8,j+7) * (-0.00026041666666666666) & - + in(i-7,j+7) * (-0.00034340659340659343) & - + in(i-6,j+7) * (-0.0004734848484848485) & - + in(i-5,j+7) * (-0.0006944444444444445) & - + in(i-4,j+7) * (-0.0011160714285714285) & - + in(i-3,j+7) * (-0.0020833333333333333) & - + in(i-2,j+7) * (-0.005208333333333333) & - + in(i-1,j+7) * (-0.03125) & - + in(i+1,j+7) * (0.00034340659340659343) & - + in(i+2,j+7) * (0.00034340659340659343) & - + in(i+3,j+7) * (0.00034340659340659343) & - + in(i+4,j+7) * (0.00034340659340659343) & - + in(i+5,j+7) * (0.00034340659340659343) & - + in(i+6,j+7) * (0.00034340659340659343) & - + in(i+7,j+7) * (0.004464285714285714) & - + in(i+8,j+7) * (0.00026041666666666666) & - + in(i-8,j+8) * (-0.00026041666666666666) & - + in(i-7,j+8) 
* (-0.00034340659340659343) & - + in(i-6,j+8) * (-0.0004734848484848485) & - + in(i-5,j+8) * (-0.0006944444444444445) & - + in(i-4,j+8) * (-0.0011160714285714285) & - + in(i-3,j+8) * (-0.0020833333333333333) & - + in(i-2,j+8) * (-0.005208333333333333) & - + in(i-1,j+8) * (-0.03125) & - + in(i+1,j+8) * (0.00026041666666666666) & - + in(i+2,j+8) * (0.00026041666666666666) & - + in(i+3,j+8) * (0.00026041666666666666) & - + in(i+4,j+8) * (0.00026041666666666666) & - + in(i+5,j+8) * (0.00026041666666666666) & - + in(i+6,j+8) * (0.00026041666666666666) & - + in(i+7,j+8) * (0.00026041666666666666) & - + in(i+8,j+8) * (0.00390625) & + + in(i-8,j-8) * (-0.00390625d0) & + + in(i+1,j-8) * (-0.00026041666666666666d0) & + + in(i+2,j-8) * (-0.00026041666666666666d0) & + + in(i+3,j-8) * (-0.00026041666666666666d0) & + + in(i+4,j-8) * (-0.00026041666666666666d0) & + + in(i+5,j-8) * (-0.00026041666666666666d0) & + + in(i+6,j-8) * (-0.00026041666666666666d0) & + + in(i+7,j-8) * (-0.00026041666666666666d0) & + + in(i+8,j-8) * (-0.00026041666666666666d0) & + + in(i-7,j-7) * (-0.004464285714285714d0) & + + in(i+1,j-7) * (-0.00034340659340659343d0) & + + in(i+2,j-7) * (-0.00034340659340659343d0) & + + in(i+3,j-7) * (-0.00034340659340659343d0) & + + in(i+4,j-7) * (-0.00034340659340659343d0) & + + in(i+5,j-7) * (-0.00034340659340659343d0) & + + in(i+6,j-7) * (-0.00034340659340659343d0) & + + in(i+7,j-7) * (-0.00034340659340659343d0) & + + in(i+8,j-7) * (-0.00034340659340659343d0) & + + in(i-6,j-6) * (-0.005208333333333333d0) & + + in(i+1,j-6) * (-0.0004734848484848485d0) & + + in(i+2,j-6) * (-0.0004734848484848485d0) & + + in(i+3,j-6) * (-0.0004734848484848485d0) & + + in(i+4,j-6) * (-0.0004734848484848485d0) & + + in(i+5,j-6) * (-0.0004734848484848485d0) & + + in(i+6,j-6) * (-0.0004734848484848485d0) & + + in(i+7,j-6) * (-0.0004734848484848485d0) & + + in(i+8,j-6) * (-0.0004734848484848485d0) & + + in(i-5,j-5) * (-0.00625d0) & + + in(i+1,j-5) * (-0.0006944444444444445d0) & + + 
in(i+2,j-5) * (-0.0006944444444444445d0) & + + in(i+3,j-5) * (-0.0006944444444444445d0) & + + in(i+4,j-5) * (-0.0006944444444444445d0) & + + in(i+5,j-5) * (-0.0006944444444444445d0) & + + in(i+6,j-5) * (-0.0006944444444444445d0) & + + in(i+7,j-5) * (-0.0006944444444444445d0) & + + in(i+8,j-5) * (-0.0006944444444444445d0) & + + in(i-4,j-4) * (-0.0078125d0) & + + in(i+1,j-4) * (-0.0011160714285714285d0) & + + in(i+2,j-4) * (-0.0011160714285714285d0) & + + in(i+3,j-4) * (-0.0011160714285714285d0) & + + in(i+4,j-4) * (-0.0011160714285714285d0) & + + in(i+5,j-4) * (-0.0011160714285714285d0) & + + in(i+6,j-4) * (-0.0011160714285714285d0) & + + in(i+7,j-4) * (-0.0011160714285714285d0) & + + in(i+8,j-4) * (-0.0011160714285714285d0) & + + in(i-3,j-3) * (-0.010416666666666666d0) & + + in(i+1,j-3) * (-0.0020833333333333333d0) & + + in(i+2,j-3) * (-0.0020833333333333333d0) & + + in(i+3,j-3) * (-0.0020833333333333333d0) & + + in(i+4,j-3) * (-0.0020833333333333333d0) & + + in(i+5,j-3) * (-0.0020833333333333333d0) & + + in(i+6,j-3) * (-0.0020833333333333333d0) & + + in(i+7,j-3) * (-0.0020833333333333333d0) & + + in(i+8,j-3) * (-0.0020833333333333333d0) & + + in(i-2,j-2) * (-0.015625d0) & + + in(i+1,j-2) * (-0.005208333333333333d0) & + + in(i+2,j-2) * (-0.005208333333333333d0) & + + in(i+3,j-2) * (-0.005208333333333333d0) & + + in(i+4,j-2) * (-0.005208333333333333d0) & + + in(i+5,j-2) * (-0.005208333333333333d0) & + + in(i+6,j-2) * (-0.005208333333333333d0) & + + in(i+7,j-2) * (-0.005208333333333333d0) & + + in(i+8,j-2) * (-0.005208333333333333d0) & + + in(i-1,j-1) * (-0.03125d0) & + + in(i+1,j-1) * (-0.03125d0) & + + in(i+2,j-1) * (-0.03125d0) & + + in(i+3,j-1) * (-0.03125d0) & + + in(i+4,j-1) * (-0.03125d0) & + + in(i+5,j-1) * (-0.03125d0) & + + in(i+6,j-1) * (-0.03125d0) & + + in(i+7,j-1) * (-0.03125d0) & + + in(i+8,j-1) * (-0.03125d0) & + + in(i-8,j+1) * (-0.00026041666666666666d0) & + + in(i-7,j+1) * (-0.00034340659340659343d0) & + + in(i-6,j+1) * (-0.0004734848484848485d0) & 
+ + in(i-5,j+1) * (-0.0006944444444444445d0) & + + in(i-4,j+1) * (-0.0011160714285714285d0) & + + in(i-3,j+1) * (-0.0020833333333333333d0) & + + in(i-2,j+1) * (-0.005208333333333333d0) & + + in(i-1,j+1) * (-0.03125d0) & + + in(i+1,j+1) * (0.03125d0) & + + in(i+2,j+1) * (0.005208333333333333d0) & + + in(i+3,j+1) * (0.0020833333333333333d0) & + + in(i+4,j+1) * (0.0011160714285714285d0) & + + in(i+5,j+1) * (0.0006944444444444445d0) & + + in(i+6,j+1) * (0.0004734848484848485d0) & + + in(i+7,j+1) * (0.00034340659340659343d0) & + + in(i+8,j+1) * (0.00026041666666666666d0) & + + in(i-8,j+2) * (-0.00026041666666666666d0) & + + in(i-7,j+2) * (-0.00034340659340659343d0) & + + in(i-6,j+2) * (-0.0004734848484848485d0) & + + in(i-5,j+2) * (-0.0006944444444444445d0) & + + in(i-4,j+2) * (-0.0011160714285714285d0) & + + in(i-3,j+2) * (-0.0020833333333333333d0) & + + in(i-2,j+2) * (-0.005208333333333333d0) & + + in(i-1,j+2) * (-0.03125d0) & + + in(i+1,j+2) * (0.005208333333333333d0) & + + in(i+2,j+2) * (0.015625d0) & + + in(i+3,j+2) * (0.0020833333333333333d0) & + + in(i+4,j+2) * (0.0011160714285714285d0) & + + in(i+5,j+2) * (0.0006944444444444445d0) & + + in(i+6,j+2) * (0.0004734848484848485d0) & + + in(i+7,j+2) * (0.00034340659340659343d0) & + + in(i+8,j+2) * (0.00026041666666666666d0) & + + in(i-8,j+3) * (-0.00026041666666666666d0) & + + in(i-7,j+3) * (-0.00034340659340659343d0) & + + in(i-6,j+3) * (-0.0004734848484848485d0) & + + in(i-5,j+3) * (-0.0006944444444444445d0) & + + in(i-4,j+3) * (-0.0011160714285714285d0) & + + in(i-3,j+3) * (-0.0020833333333333333d0) & + + in(i-2,j+3) * (-0.005208333333333333d0) & + + in(i-1,j+3) * (-0.03125d0) & + + in(i+1,j+3) * (0.0020833333333333333d0) & + + in(i+2,j+3) * (0.0020833333333333333d0) & + + in(i+3,j+3) * (0.010416666666666666d0) & + + in(i+4,j+3) * (0.0011160714285714285d0) & + + in(i+5,j+3) * (0.0006944444444444445d0) & + + in(i+6,j+3) * (0.0004734848484848485d0) & + + in(i+7,j+3) * (0.00034340659340659343d0) & + + in(i+8,j+3) * 
(0.00026041666666666666d0) & + + in(i-8,j+4) * (-0.00026041666666666666d0) & + + in(i-7,j+4) * (-0.00034340659340659343d0) & + + in(i-6,j+4) * (-0.0004734848484848485d0) & + + in(i-5,j+4) * (-0.0006944444444444445d0) & + + in(i-4,j+4) * (-0.0011160714285714285d0) & + + in(i-3,j+4) * (-0.0020833333333333333d0) & + + in(i-2,j+4) * (-0.005208333333333333d0) & + + in(i-1,j+4) * (-0.03125d0) & + + in(i+1,j+4) * (0.0011160714285714285d0) & + + in(i+2,j+4) * (0.0011160714285714285d0) & + + in(i+3,j+4) * (0.0011160714285714285d0) & + + in(i+4,j+4) * (0.0078125d0) & + + in(i+5,j+4) * (0.0006944444444444445d0) & + + in(i+6,j+4) * (0.0004734848484848485d0) & + + in(i+7,j+4) * (0.00034340659340659343d0) & + + in(i+8,j+4) * (0.00026041666666666666d0) & + + in(i-8,j+5) * (-0.00026041666666666666d0) & + + in(i-7,j+5) * (-0.00034340659340659343d0) & + + in(i-6,j+5) * (-0.0004734848484848485d0) & + + in(i-5,j+5) * (-0.0006944444444444445d0) & + + in(i-4,j+5) * (-0.0011160714285714285d0) & + + in(i-3,j+5) * (-0.0020833333333333333d0) & + + in(i-2,j+5) * (-0.005208333333333333d0) & + + in(i-1,j+5) * (-0.03125d0) & + + in(i+1,j+5) * (0.0006944444444444445d0) & + + in(i+2,j+5) * (0.0006944444444444445d0) & + + in(i+3,j+5) * (0.0006944444444444445d0) & + + in(i+4,j+5) * (0.0006944444444444445d0) & + + in(i+5,j+5) * (0.00625d0) & + + in(i+6,j+5) * (0.0004734848484848485d0) & + + in(i+7,j+5) * (0.00034340659340659343d0) & + + in(i+8,j+5) * (0.00026041666666666666d0) & + + in(i-8,j+6) * (-0.00026041666666666666d0) & + + in(i-7,j+6) * (-0.00034340659340659343d0) & + + in(i-6,j+6) * (-0.0004734848484848485d0) & + + in(i-5,j+6) * (-0.0006944444444444445d0) & + + in(i-4,j+6) * (-0.0011160714285714285d0) & + + in(i-3,j+6) * (-0.0020833333333333333d0) & + + in(i-2,j+6) * (-0.005208333333333333d0) & + + in(i-1,j+6) * (-0.03125d0) & + + in(i+1,j+6) * (0.0004734848484848485d0) & + + in(i+2,j+6) * (0.0004734848484848485d0) & + + in(i+3,j+6) * (0.0004734848484848485d0) & + + in(i+4,j+6) * 
(0.0004734848484848485d0) & + + in(i+5,j+6) * (0.0004734848484848485d0) & + + in(i+6,j+6) * (0.005208333333333333d0) & + + in(i+7,j+6) * (0.00034340659340659343d0) & + + in(i+8,j+6) * (0.00026041666666666666d0) & + + in(i-8,j+7) * (-0.00026041666666666666d0) & + + in(i-7,j+7) * (-0.00034340659340659343d0) & + + in(i-6,j+7) * (-0.0004734848484848485d0) & + + in(i-5,j+7) * (-0.0006944444444444445d0) & + + in(i-4,j+7) * (-0.0011160714285714285d0) & + + in(i-3,j+7) * (-0.0020833333333333333d0) & + + in(i-2,j+7) * (-0.005208333333333333d0) & + + in(i-1,j+7) * (-0.03125d0) & + + in(i+1,j+7) * (0.00034340659340659343d0) & + + in(i+2,j+7) * (0.00034340659340659343d0) & + + in(i+3,j+7) * (0.00034340659340659343d0) & + + in(i+4,j+7) * (0.00034340659340659343d0) & + + in(i+5,j+7) * (0.00034340659340659343d0) & + + in(i+6,j+7) * (0.00034340659340659343d0) & + + in(i+7,j+7) * (0.004464285714285714d0) & + + in(i+8,j+7) * (0.00026041666666666666d0) & + + in(i-8,j+8) * (-0.00026041666666666666d0) & + + in(i-7,j+8) * (-0.00034340659340659343d0) & + + in(i-6,j+8) * (-0.0004734848484848485d0) & + + in(i-5,j+8) * (-0.0006944444444444445d0) & + + in(i-4,j+8) * (-0.0011160714285714285d0) & + + in(i-3,j+8) * (-0.0020833333333333333d0) & + + in(i-2,j+8) * (-0.005208333333333333d0) & + + in(i-1,j+8) * (-0.03125d0) & + + in(i+1,j+8) * (0.00026041666666666666d0) & + + in(i+2,j+8) * (0.00026041666666666666d0) & + + in(i+3,j+8) * (0.00026041666666666666d0) & + + in(i+4,j+8) * (0.00026041666666666666d0) & + + in(i+5,j+8) * (0.00026041666666666666d0) & + + in(i+6,j+8) * (0.00026041666666666666d0) & + + in(i+7,j+8) * (0.00026041666666666666d0) & + + in(i+8,j+8) * (0.00390625d0) & +0.0 end do !$omp end simd @@ -1174,263 +1157,262 @@ subroutine grid9(n, in, out) real(kind=REAL64), intent(in) :: in(n,n) real(kind=REAL64), intent(inout) :: out(n,n) integer(kind=INT32) :: i,j - !$omp do + !$omp teams distribute parallel for simd collapse(2) schedule(static,1) do i=9,n-9-1 - !$omp simd do j=9,n-9-1 
out(i,j) = out(i,j) & - + in(i-9,j-9) * (-0.0030864197530864196) & - + in(i+1,j-9) * (-0.00018155410312273057) & - + in(i+2,j-9) * (-0.00018155410312273057) & - + in(i+3,j-9) * (-0.00018155410312273057) & - + in(i+4,j-9) * (-0.00018155410312273057) & - + in(i+5,j-9) * (-0.00018155410312273057) & - + in(i+6,j-9) * (-0.00018155410312273057) & - + in(i+7,j-9) * (-0.00018155410312273057) & - + in(i+8,j-9) * (-0.00018155410312273057) & - + in(i+9,j-9) * (-0.00018155410312273057) & - + in(i-8,j-8) * (-0.003472222222222222) & - + in(i+1,j-8) * (-0.0002314814814814815) & - + in(i+2,j-8) * (-0.0002314814814814815) & - + in(i+3,j-8) * (-0.0002314814814814815) & - + in(i+4,j-8) * (-0.0002314814814814815) & - + in(i+5,j-8) * (-0.0002314814814814815) & - + in(i+6,j-8) * (-0.0002314814814814815) & - + in(i+7,j-8) * (-0.0002314814814814815) & - + in(i+8,j-8) * (-0.0002314814814814815) & - + in(i+9,j-8) * (-0.0002314814814814815) & - + in(i-7,j-7) * (-0.003968253968253968) & - + in(i+1,j-7) * (-0.00030525030525030525) & - + in(i+2,j-7) * (-0.00030525030525030525) & - + in(i+3,j-7) * (-0.00030525030525030525) & - + in(i+4,j-7) * (-0.00030525030525030525) & - + in(i+5,j-7) * (-0.00030525030525030525) & - + in(i+6,j-7) * (-0.00030525030525030525) & - + in(i+7,j-7) * (-0.00030525030525030525) & - + in(i+8,j-7) * (-0.00030525030525030525) & - + in(i+9,j-7) * (-0.00030525030525030525) & - + in(i-6,j-6) * (-0.004629629629629629) & - + in(i+1,j-6) * (-0.00042087542087542086) & - + in(i+2,j-6) * (-0.00042087542087542086) & - + in(i+3,j-6) * (-0.00042087542087542086) & - + in(i+4,j-6) * (-0.00042087542087542086) & - + in(i+5,j-6) * (-0.00042087542087542086) & - + in(i+6,j-6) * (-0.00042087542087542086) & - + in(i+7,j-6) * (-0.00042087542087542086) & - + in(i+8,j-6) * (-0.00042087542087542086) & - + in(i+9,j-6) * (-0.00042087542087542086) & - + in(i-5,j-5) * (-0.005555555555555556) & - + in(i+1,j-5) * (-0.0006172839506172839) & - + in(i+2,j-5) * (-0.0006172839506172839) & - + in(i+3,j-5) * 
(-0.0006172839506172839) & - + in(i+4,j-5) * (-0.0006172839506172839) & - + in(i+5,j-5) * (-0.0006172839506172839) & - + in(i+6,j-5) * (-0.0006172839506172839) & - + in(i+7,j-5) * (-0.0006172839506172839) & - + in(i+8,j-5) * (-0.0006172839506172839) & - + in(i+9,j-5) * (-0.0006172839506172839) & - + in(i-4,j-4) * (-0.006944444444444444) & - + in(i+1,j-4) * (-0.000992063492063492) & - + in(i+2,j-4) * (-0.000992063492063492) & - + in(i+3,j-4) * (-0.000992063492063492) & - + in(i+4,j-4) * (-0.000992063492063492) & - + in(i+5,j-4) * (-0.000992063492063492) & - + in(i+6,j-4) * (-0.000992063492063492) & - + in(i+7,j-4) * (-0.000992063492063492) & - + in(i+8,j-4) * (-0.000992063492063492) & - + in(i+9,j-4) * (-0.000992063492063492) & - + in(i-3,j-3) * (-0.009259259259259259) & - + in(i+1,j-3) * (-0.001851851851851852) & - + in(i+2,j-3) * (-0.001851851851851852) & - + in(i+3,j-3) * (-0.001851851851851852) & - + in(i+4,j-3) * (-0.001851851851851852) & - + in(i+5,j-3) * (-0.001851851851851852) & - + in(i+6,j-3) * (-0.001851851851851852) & - + in(i+7,j-3) * (-0.001851851851851852) & - + in(i+8,j-3) * (-0.001851851851851852) & - + in(i+9,j-3) * (-0.001851851851851852) & - + in(i-2,j-2) * (-0.013888888888888888) & - + in(i+1,j-2) * (-0.004629629629629629) & - + in(i+2,j-2) * (-0.004629629629629629) & - + in(i+3,j-2) * (-0.004629629629629629) & - + in(i+4,j-2) * (-0.004629629629629629) & - + in(i+5,j-2) * (-0.004629629629629629) & - + in(i+6,j-2) * (-0.004629629629629629) & - + in(i+7,j-2) * (-0.004629629629629629) & - + in(i+8,j-2) * (-0.004629629629629629) & - + in(i+9,j-2) * (-0.004629629629629629) & - + in(i-1,j-1) * (-0.027777777777777776) & - + in(i+1,j-1) * (-0.027777777777777776) & - + in(i+2,j-1) * (-0.027777777777777776) & - + in(i+3,j-1) * (-0.027777777777777776) & - + in(i+4,j-1) * (-0.027777777777777776) & - + in(i+5,j-1) * (-0.027777777777777776) & - + in(i+6,j-1) * (-0.027777777777777776) & - + in(i+7,j-1) * (-0.027777777777777776) & - + in(i+8,j-1) * 
(-0.027777777777777776) & - + in(i+9,j-1) * (-0.027777777777777776) & - + in(i-9,j+1) * (-0.00018155410312273057) & - + in(i-8,j+1) * (-0.0002314814814814815) & - + in(i-7,j+1) * (-0.00030525030525030525) & - + in(i-6,j+1) * (-0.00042087542087542086) & - + in(i-5,j+1) * (-0.0006172839506172839) & - + in(i-4,j+1) * (-0.000992063492063492) & - + in(i-3,j+1) * (-0.001851851851851852) & - + in(i-2,j+1) * (-0.004629629629629629) & - + in(i-1,j+1) * (-0.027777777777777776) & - + in(i+1,j+1) * (0.027777777777777776) & - + in(i+2,j+1) * (0.004629629629629629) & - + in(i+3,j+1) * (0.001851851851851852) & - + in(i+4,j+1) * (0.000992063492063492) & - + in(i+5,j+1) * (0.0006172839506172839) & - + in(i+6,j+1) * (0.00042087542087542086) & - + in(i+7,j+1) * (0.00030525030525030525) & - + in(i+8,j+1) * (0.0002314814814814815) & - + in(i+9,j+1) * (0.00018155410312273057) & - + in(i-9,j+2) * (-0.00018155410312273057) & - + in(i-8,j+2) * (-0.0002314814814814815) & - + in(i-7,j+2) * (-0.00030525030525030525) & - + in(i-6,j+2) * (-0.00042087542087542086) & - + in(i-5,j+2) * (-0.0006172839506172839) & - + in(i-4,j+2) * (-0.000992063492063492) & - + in(i-3,j+2) * (-0.001851851851851852) & - + in(i-2,j+2) * (-0.004629629629629629) & - + in(i-1,j+2) * (-0.027777777777777776) & - + in(i+1,j+2) * (0.004629629629629629) & - + in(i+2,j+2) * (0.013888888888888888) & - + in(i+3,j+2) * (0.001851851851851852) & - + in(i+4,j+2) * (0.000992063492063492) & - + in(i+5,j+2) * (0.0006172839506172839) & - + in(i+6,j+2) * (0.00042087542087542086) & - + in(i+7,j+2) * (0.00030525030525030525) & - + in(i+8,j+2) * (0.0002314814814814815) & - + in(i+9,j+2) * (0.00018155410312273057) & - + in(i-9,j+3) * (-0.00018155410312273057) & - + in(i-8,j+3) * (-0.0002314814814814815) & - + in(i-7,j+3) * (-0.00030525030525030525) & - + in(i-6,j+3) * (-0.00042087542087542086) & - + in(i-5,j+3) * (-0.0006172839506172839) & - + in(i-4,j+3) * (-0.000992063492063492) & - + in(i-3,j+3) * (-0.001851851851851852) & - + in(i-2,j+3) 
* (-0.004629629629629629) & - + in(i-1,j+3) * (-0.027777777777777776) & - + in(i+1,j+3) * (0.001851851851851852) & - + in(i+2,j+3) * (0.001851851851851852) & - + in(i+3,j+3) * (0.009259259259259259) & - + in(i+4,j+3) * (0.000992063492063492) & - + in(i+5,j+3) * (0.0006172839506172839) & - + in(i+6,j+3) * (0.00042087542087542086) & - + in(i+7,j+3) * (0.00030525030525030525) & - + in(i+8,j+3) * (0.0002314814814814815) & - + in(i+9,j+3) * (0.00018155410312273057) & - + in(i-9,j+4) * (-0.00018155410312273057) & - + in(i-8,j+4) * (-0.0002314814814814815) & - + in(i-7,j+4) * (-0.00030525030525030525) & - + in(i-6,j+4) * (-0.00042087542087542086) & - + in(i-5,j+4) * (-0.0006172839506172839) & - + in(i-4,j+4) * (-0.000992063492063492) & - + in(i-3,j+4) * (-0.001851851851851852) & - + in(i-2,j+4) * (-0.004629629629629629) & - + in(i-1,j+4) * (-0.027777777777777776) & - + in(i+1,j+4) * (0.000992063492063492) & - + in(i+2,j+4) * (0.000992063492063492) & - + in(i+3,j+4) * (0.000992063492063492) & - + in(i+4,j+4) * (0.006944444444444444) & - + in(i+5,j+4) * (0.0006172839506172839) & - + in(i+6,j+4) * (0.00042087542087542086) & - + in(i+7,j+4) * (0.00030525030525030525) & - + in(i+8,j+4) * (0.0002314814814814815) & - + in(i+9,j+4) * (0.00018155410312273057) & - + in(i-9,j+5) * (-0.00018155410312273057) & - + in(i-8,j+5) * (-0.0002314814814814815) & - + in(i-7,j+5) * (-0.00030525030525030525) & - + in(i-6,j+5) * (-0.00042087542087542086) & - + in(i-5,j+5) * (-0.0006172839506172839) & - + in(i-4,j+5) * (-0.000992063492063492) & - + in(i-3,j+5) * (-0.001851851851851852) & - + in(i-2,j+5) * (-0.004629629629629629) & - + in(i-1,j+5) * (-0.027777777777777776) & - + in(i+1,j+5) * (0.0006172839506172839) & - + in(i+2,j+5) * (0.0006172839506172839) & - + in(i+3,j+5) * (0.0006172839506172839) & - + in(i+4,j+5) * (0.0006172839506172839) & - + in(i+5,j+5) * (0.005555555555555556) & - + in(i+6,j+5) * (0.00042087542087542086) & - + in(i+7,j+5) * (0.00030525030525030525) & - + in(i+8,j+5) * 
(0.0002314814814814815) & - + in(i+9,j+5) * (0.00018155410312273057) & - + in(i-9,j+6) * (-0.00018155410312273057) & - + in(i-8,j+6) * (-0.0002314814814814815) & - + in(i-7,j+6) * (-0.00030525030525030525) & - + in(i-6,j+6) * (-0.00042087542087542086) & - + in(i-5,j+6) * (-0.0006172839506172839) & - + in(i-4,j+6) * (-0.000992063492063492) & - + in(i-3,j+6) * (-0.001851851851851852) & - + in(i-2,j+6) * (-0.004629629629629629) & - + in(i-1,j+6) * (-0.027777777777777776) & - + in(i+1,j+6) * (0.00042087542087542086) & - + in(i+2,j+6) * (0.00042087542087542086) & - + in(i+3,j+6) * (0.00042087542087542086) & - + in(i+4,j+6) * (0.00042087542087542086) & - + in(i+5,j+6) * (0.00042087542087542086) & - + in(i+6,j+6) * (0.004629629629629629) & - + in(i+7,j+6) * (0.00030525030525030525) & - + in(i+8,j+6) * (0.0002314814814814815) & - + in(i+9,j+6) * (0.00018155410312273057) & - + in(i-9,j+7) * (-0.00018155410312273057) & - + in(i-8,j+7) * (-0.0002314814814814815) & - + in(i-7,j+7) * (-0.00030525030525030525) & - + in(i-6,j+7) * (-0.00042087542087542086) & - + in(i-5,j+7) * (-0.0006172839506172839) & - + in(i-4,j+7) * (-0.000992063492063492) & - + in(i-3,j+7) * (-0.001851851851851852) & - + in(i-2,j+7) * (-0.004629629629629629) & - + in(i-1,j+7) * (-0.027777777777777776) & - + in(i+1,j+7) * (0.00030525030525030525) & - + in(i+2,j+7) * (0.00030525030525030525) & - + in(i+3,j+7) * (0.00030525030525030525) & - + in(i+4,j+7) * (0.00030525030525030525) & - + in(i+5,j+7) * (0.00030525030525030525) & - + in(i+6,j+7) * (0.00030525030525030525) & - + in(i+7,j+7) * (0.003968253968253968) & - + in(i+8,j+7) * (0.0002314814814814815) & - + in(i+9,j+7) * (0.00018155410312273057) & - + in(i-9,j+8) * (-0.00018155410312273057) & - + in(i-8,j+8) * (-0.0002314814814814815) & - + in(i-7,j+8) * (-0.00030525030525030525) & - + in(i-6,j+8) * (-0.00042087542087542086) & - + in(i-5,j+8) * (-0.0006172839506172839) & - + in(i-4,j+8) * (-0.000992063492063492) & - + in(i-3,j+8) * (-0.001851851851851852) & 
- + in(i-2,j+8) * (-0.004629629629629629) & - + in(i-1,j+8) * (-0.027777777777777776) & - + in(i+1,j+8) * (0.0002314814814814815) & - + in(i+2,j+8) * (0.0002314814814814815) & - + in(i+3,j+8) * (0.0002314814814814815) & - + in(i+4,j+8) * (0.0002314814814814815) & - + in(i+5,j+8) * (0.0002314814814814815) & - + in(i+6,j+8) * (0.0002314814814814815) & - + in(i+7,j+8) * (0.0002314814814814815) & - + in(i+8,j+8) * (0.003472222222222222) & - + in(i+9,j+8) * (0.00018155410312273057) & - + in(i-9,j+9) * (-0.00018155410312273057) & - + in(i-8,j+9) * (-0.0002314814814814815) & - + in(i-7,j+9) * (-0.00030525030525030525) & - + in(i-6,j+9) * (-0.00042087542087542086) & - + in(i-5,j+9) * (-0.0006172839506172839) & - + in(i-4,j+9) * (-0.000992063492063492) & - + in(i-3,j+9) * (-0.001851851851851852) & - + in(i-2,j+9) * (-0.004629629629629629) & - + in(i-1,j+9) * (-0.027777777777777776) & - + in(i+1,j+9) * (0.00018155410312273057) & - + in(i+2,j+9) * (0.00018155410312273057) & - + in(i+3,j+9) * (0.00018155410312273057) & - + in(i+4,j+9) * (0.00018155410312273057) & - + in(i+5,j+9) * (0.00018155410312273057) & - + in(i+6,j+9) * (0.00018155410312273057) & - + in(i+7,j+9) * (0.00018155410312273057) & - + in(i+8,j+9) * (0.00018155410312273057) & - + in(i+9,j+9) * (0.0030864197530864196) & + + in(i-9,j-9) * (-0.0030864197530864196d0) & + + in(i+1,j-9) * (-0.00018155410312273057d0) & + + in(i+2,j-9) * (-0.00018155410312273057d0) & + + in(i+3,j-9) * (-0.00018155410312273057d0) & + + in(i+4,j-9) * (-0.00018155410312273057d0) & + + in(i+5,j-9) * (-0.00018155410312273057d0) & + + in(i+6,j-9) * (-0.00018155410312273057d0) & + + in(i+7,j-9) * (-0.00018155410312273057d0) & + + in(i+8,j-9) * (-0.00018155410312273057d0) & + + in(i+9,j-9) * (-0.00018155410312273057d0) & + + in(i-8,j-8) * (-0.003472222222222222d0) & + + in(i+1,j-8) * (-0.0002314814814814815d0) & + + in(i+2,j-8) * (-0.0002314814814814815d0) & + + in(i+3,j-8) * (-0.0002314814814814815d0) & + + in(i+4,j-8) * 
(-0.0002314814814814815d0) & + + in(i+5,j-8) * (-0.0002314814814814815d0) & + + in(i+6,j-8) * (-0.0002314814814814815d0) & + + in(i+7,j-8) * (-0.0002314814814814815d0) & + + in(i+8,j-8) * (-0.0002314814814814815d0) & + + in(i+9,j-8) * (-0.0002314814814814815d0) & + + in(i-7,j-7) * (-0.003968253968253968d0) & + + in(i+1,j-7) * (-0.00030525030525030525d0) & + + in(i+2,j-7) * (-0.00030525030525030525d0) & + + in(i+3,j-7) * (-0.00030525030525030525d0) & + + in(i+4,j-7) * (-0.00030525030525030525d0) & + + in(i+5,j-7) * (-0.00030525030525030525d0) & + + in(i+6,j-7) * (-0.00030525030525030525d0) & + + in(i+7,j-7) * (-0.00030525030525030525d0) & + + in(i+8,j-7) * (-0.00030525030525030525d0) & + + in(i+9,j-7) * (-0.00030525030525030525d0) & + + in(i-6,j-6) * (-0.004629629629629629d0) & + + in(i+1,j-6) * (-0.00042087542087542086d0) & + + in(i+2,j-6) * (-0.00042087542087542086d0) & + + in(i+3,j-6) * (-0.00042087542087542086d0) & + + in(i+4,j-6) * (-0.00042087542087542086d0) & + + in(i+5,j-6) * (-0.00042087542087542086d0) & + + in(i+6,j-6) * (-0.00042087542087542086d0) & + + in(i+7,j-6) * (-0.00042087542087542086d0) & + + in(i+8,j-6) * (-0.00042087542087542086d0) & + + in(i+9,j-6) * (-0.00042087542087542086d0) & + + in(i-5,j-5) * (-0.005555555555555556d0) & + + in(i+1,j-5) * (-0.0006172839506172839d0) & + + in(i+2,j-5) * (-0.0006172839506172839d0) & + + in(i+3,j-5) * (-0.0006172839506172839d0) & + + in(i+4,j-5) * (-0.0006172839506172839d0) & + + in(i+5,j-5) * (-0.0006172839506172839d0) & + + in(i+6,j-5) * (-0.0006172839506172839d0) & + + in(i+7,j-5) * (-0.0006172839506172839d0) & + + in(i+8,j-5) * (-0.0006172839506172839d0) & + + in(i+9,j-5) * (-0.0006172839506172839d0) & + + in(i-4,j-4) * (-0.006944444444444444d0) & + + in(i+1,j-4) * (-0.000992063492063492d0) & + + in(i+2,j-4) * (-0.000992063492063492d0) & + + in(i+3,j-4) * (-0.000992063492063492d0) & + + in(i+4,j-4) * (-0.000992063492063492d0) & + + in(i+5,j-4) * (-0.000992063492063492d0) & + + in(i+6,j-4) * 
(-0.000992063492063492d0) & + + in(i+7,j-4) * (-0.000992063492063492d0) & + + in(i+8,j-4) * (-0.000992063492063492d0) & + + in(i+9,j-4) * (-0.000992063492063492d0) & + + in(i-3,j-3) * (-0.009259259259259259d0) & + + in(i+1,j-3) * (-0.001851851851851852d0) & + + in(i+2,j-3) * (-0.001851851851851852d0) & + + in(i+3,j-3) * (-0.001851851851851852d0) & + + in(i+4,j-3) * (-0.001851851851851852d0) & + + in(i+5,j-3) * (-0.001851851851851852d0) & + + in(i+6,j-3) * (-0.001851851851851852d0) & + + in(i+7,j-3) * (-0.001851851851851852d0) & + + in(i+8,j-3) * (-0.001851851851851852d0) & + + in(i+9,j-3) * (-0.001851851851851852d0) & + + in(i-2,j-2) * (-0.013888888888888888d0) & + + in(i+1,j-2) * (-0.004629629629629629d0) & + + in(i+2,j-2) * (-0.004629629629629629d0) & + + in(i+3,j-2) * (-0.004629629629629629d0) & + + in(i+4,j-2) * (-0.004629629629629629d0) & + + in(i+5,j-2) * (-0.004629629629629629d0) & + + in(i+6,j-2) * (-0.004629629629629629d0) & + + in(i+7,j-2) * (-0.004629629629629629d0) & + + in(i+8,j-2) * (-0.004629629629629629d0) & + + in(i+9,j-2) * (-0.004629629629629629d0) & + + in(i-1,j-1) * (-0.027777777777777776d0) & + + in(i+1,j-1) * (-0.027777777777777776d0) & + + in(i+2,j-1) * (-0.027777777777777776d0) & + + in(i+3,j-1) * (-0.027777777777777776d0) & + + in(i+4,j-1) * (-0.027777777777777776d0) & + + in(i+5,j-1) * (-0.027777777777777776d0) & + + in(i+6,j-1) * (-0.027777777777777776d0) & + + in(i+7,j-1) * (-0.027777777777777776d0) & + + in(i+8,j-1) * (-0.027777777777777776d0) & + + in(i+9,j-1) * (-0.027777777777777776d0) & + + in(i-9,j+1) * (-0.00018155410312273057d0) & + + in(i-8,j+1) * (-0.0002314814814814815d0) & + + in(i-7,j+1) * (-0.00030525030525030525d0) & + + in(i-6,j+1) * (-0.00042087542087542086d0) & + + in(i-5,j+1) * (-0.0006172839506172839d0) & + + in(i-4,j+1) * (-0.000992063492063492d0) & + + in(i-3,j+1) * (-0.001851851851851852d0) & + + in(i-2,j+1) * (-0.004629629629629629d0) & + + in(i-1,j+1) * (-0.027777777777777776d0) & + + in(i+1,j+1) * 
(0.027777777777777776d0) & + + in(i+2,j+1) * (0.004629629629629629d0) & + + in(i+3,j+1) * (0.001851851851851852d0) & + + in(i+4,j+1) * (0.000992063492063492d0) & + + in(i+5,j+1) * (0.0006172839506172839d0) & + + in(i+6,j+1) * (0.00042087542087542086d0) & + + in(i+7,j+1) * (0.00030525030525030525d0) & + + in(i+8,j+1) * (0.0002314814814814815d0) & + + in(i+9,j+1) * (0.00018155410312273057d0) & + + in(i-9,j+2) * (-0.00018155410312273057d0) & + + in(i-8,j+2) * (-0.0002314814814814815d0) & + + in(i-7,j+2) * (-0.00030525030525030525d0) & + + in(i-6,j+2) * (-0.00042087542087542086d0) & + + in(i-5,j+2) * (-0.0006172839506172839d0) & + + in(i-4,j+2) * (-0.000992063492063492d0) & + + in(i-3,j+2) * (-0.001851851851851852d0) & + + in(i-2,j+2) * (-0.004629629629629629d0) & + + in(i-1,j+2) * (-0.027777777777777776d0) & + + in(i+1,j+2) * (0.004629629629629629d0) & + + in(i+2,j+2) * (0.013888888888888888d0) & + + in(i+3,j+2) * (0.001851851851851852d0) & + + in(i+4,j+2) * (0.000992063492063492d0) & + + in(i+5,j+2) * (0.0006172839506172839d0) & + + in(i+6,j+2) * (0.00042087542087542086d0) & + + in(i+7,j+2) * (0.00030525030525030525d0) & + + in(i+8,j+2) * (0.0002314814814814815d0) & + + in(i+9,j+2) * (0.00018155410312273057d0) & + + in(i-9,j+3) * (-0.00018155410312273057d0) & + + in(i-8,j+3) * (-0.0002314814814814815d0) & + + in(i-7,j+3) * (-0.00030525030525030525d0) & + + in(i-6,j+3) * (-0.00042087542087542086d0) & + + in(i-5,j+3) * (-0.0006172839506172839d0) & + + in(i-4,j+3) * (-0.000992063492063492d0) & + + in(i-3,j+3) * (-0.001851851851851852d0) & + + in(i-2,j+3) * (-0.004629629629629629d0) & + + in(i-1,j+3) * (-0.027777777777777776d0) & + + in(i+1,j+3) * (0.001851851851851852d0) & + + in(i+2,j+3) * (0.001851851851851852d0) & + + in(i+3,j+3) * (0.009259259259259259d0) & + + in(i+4,j+3) * (0.000992063492063492d0) & + + in(i+5,j+3) * (0.0006172839506172839d0) & + + in(i+6,j+3) * (0.00042087542087542086d0) & + + in(i+7,j+3) * (0.00030525030525030525d0) & + + in(i+8,j+3) * 
(0.0002314814814814815d0) & + + in(i+9,j+3) * (0.00018155410312273057d0) & + + in(i-9,j+4) * (-0.00018155410312273057d0) & + + in(i-8,j+4) * (-0.0002314814814814815d0) & + + in(i-7,j+4) * (-0.00030525030525030525d0) & + + in(i-6,j+4) * (-0.00042087542087542086d0) & + + in(i-5,j+4) * (-0.0006172839506172839d0) & + + in(i-4,j+4) * (-0.000992063492063492d0) & + + in(i-3,j+4) * (-0.001851851851851852d0) & + + in(i-2,j+4) * (-0.004629629629629629d0) & + + in(i-1,j+4) * (-0.027777777777777776d0) & + + in(i+1,j+4) * (0.000992063492063492d0) & + + in(i+2,j+4) * (0.000992063492063492d0) & + + in(i+3,j+4) * (0.000992063492063492d0) & + + in(i+4,j+4) * (0.006944444444444444d0) & + + in(i+5,j+4) * (0.0006172839506172839d0) & + + in(i+6,j+4) * (0.00042087542087542086d0) & + + in(i+7,j+4) * (0.00030525030525030525d0) & + + in(i+8,j+4) * (0.0002314814814814815d0) & + + in(i+9,j+4) * (0.00018155410312273057d0) & + + in(i-9,j+5) * (-0.00018155410312273057d0) & + + in(i-8,j+5) * (-0.0002314814814814815d0) & + + in(i-7,j+5) * (-0.00030525030525030525d0) & + + in(i-6,j+5) * (-0.00042087542087542086d0) & + + in(i-5,j+5) * (-0.0006172839506172839d0) & + + in(i-4,j+5) * (-0.000992063492063492d0) & + + in(i-3,j+5) * (-0.001851851851851852d0) & + + in(i-2,j+5) * (-0.004629629629629629d0) & + + in(i-1,j+5) * (-0.027777777777777776d0) & + + in(i+1,j+5) * (0.0006172839506172839d0) & + + in(i+2,j+5) * (0.0006172839506172839d0) & + + in(i+3,j+5) * (0.0006172839506172839d0) & + + in(i+4,j+5) * (0.0006172839506172839d0) & + + in(i+5,j+5) * (0.005555555555555556d0) & + + in(i+6,j+5) * (0.00042087542087542086d0) & + + in(i+7,j+5) * (0.00030525030525030525d0) & + + in(i+8,j+5) * (0.0002314814814814815d0) & + + in(i+9,j+5) * (0.00018155410312273057d0) & + + in(i-9,j+6) * (-0.00018155410312273057d0) & + + in(i-8,j+6) * (-0.0002314814814814815d0) & + + in(i-7,j+6) * (-0.00030525030525030525d0) & + + in(i-6,j+6) * (-0.00042087542087542086d0) & + + in(i-5,j+6) * (-0.0006172839506172839d0) & + + 
in(i-4,j+6) * (-0.000992063492063492d0) & + + in(i-3,j+6) * (-0.001851851851851852d0) & + + in(i-2,j+6) * (-0.004629629629629629d0) & + + in(i-1,j+6) * (-0.027777777777777776d0) & + + in(i+1,j+6) * (0.00042087542087542086d0) & + + in(i+2,j+6) * (0.00042087542087542086d0) & + + in(i+3,j+6) * (0.00042087542087542086d0) & + + in(i+4,j+6) * (0.00042087542087542086d0) & + + in(i+5,j+6) * (0.00042087542087542086d0) & + + in(i+6,j+6) * (0.004629629629629629d0) & + + in(i+7,j+6) * (0.00030525030525030525d0) & + + in(i+8,j+6) * (0.0002314814814814815d0) & + + in(i+9,j+6) * (0.00018155410312273057d0) & + + in(i-9,j+7) * (-0.00018155410312273057d0) & + + in(i-8,j+7) * (-0.0002314814814814815d0) & + + in(i-7,j+7) * (-0.00030525030525030525d0) & + + in(i-6,j+7) * (-0.00042087542087542086d0) & + + in(i-5,j+7) * (-0.0006172839506172839d0) & + + in(i-4,j+7) * (-0.000992063492063492d0) & + + in(i-3,j+7) * (-0.001851851851851852d0) & + + in(i-2,j+7) * (-0.004629629629629629d0) & + + in(i-1,j+7) * (-0.027777777777777776d0) & + + in(i+1,j+7) * (0.00030525030525030525d0) & + + in(i+2,j+7) * (0.00030525030525030525d0) & + + in(i+3,j+7) * (0.00030525030525030525d0) & + + in(i+4,j+7) * (0.00030525030525030525d0) & + + in(i+5,j+7) * (0.00030525030525030525d0) & + + in(i+6,j+7) * (0.00030525030525030525d0) & + + in(i+7,j+7) * (0.003968253968253968d0) & + + in(i+8,j+7) * (0.0002314814814814815d0) & + + in(i+9,j+7) * (0.00018155410312273057d0) & + + in(i-9,j+8) * (-0.00018155410312273057d0) & + + in(i-8,j+8) * (-0.0002314814814814815d0) & + + in(i-7,j+8) * (-0.00030525030525030525d0) & + + in(i-6,j+8) * (-0.00042087542087542086d0) & + + in(i-5,j+8) * (-0.0006172839506172839d0) & + + in(i-4,j+8) * (-0.000992063492063492d0) & + + in(i-3,j+8) * (-0.001851851851851852d0) & + + in(i-2,j+8) * (-0.004629629629629629d0) & + + in(i-1,j+8) * (-0.027777777777777776d0) & + + in(i+1,j+8) * (0.0002314814814814815d0) & + + in(i+2,j+8) * (0.0002314814814814815d0) & + + in(i+3,j+8) * (0.0002314814814814815d0) 
& + + in(i+4,j+8) * (0.0002314814814814815d0) & + + in(i+5,j+8) * (0.0002314814814814815d0) & + + in(i+6,j+8) * (0.0002314814814814815d0) & + + in(i+7,j+8) * (0.0002314814814814815d0) & + + in(i+8,j+8) * (0.003472222222222222d0) & + + in(i+9,j+8) * (0.00018155410312273057d0) & + + in(i-9,j+9) * (-0.00018155410312273057d0) & + + in(i-8,j+9) * (-0.0002314814814814815d0) & + + in(i-7,j+9) * (-0.00030525030525030525d0) & + + in(i-6,j+9) * (-0.00042087542087542086d0) & + + in(i-5,j+9) * (-0.0006172839506172839d0) & + + in(i-4,j+9) * (-0.000992063492063492d0) & + + in(i-3,j+9) * (-0.001851851851851852d0) & + + in(i-2,j+9) * (-0.004629629629629629d0) & + + in(i-1,j+9) * (-0.027777777777777776d0) & + + in(i+1,j+9) * (0.00018155410312273057d0) & + + in(i+2,j+9) * (0.00018155410312273057d0) & + + in(i+3,j+9) * (0.00018155410312273057d0) & + + in(i+4,j+9) * (0.00018155410312273057d0) & + + in(i+5,j+9) * (0.00018155410312273057d0) & + + in(i+6,j+9) * (0.00018155410312273057d0) & + + in(i+7,j+9) * (0.00018155410312273057d0) & + + in(i+8,j+9) * (0.00018155410312273057d0) & + + in(i+9,j+9) * (0.0030864197530864196d0) & +0.0 end do !$omp end simd diff --git a/FORTRAN/stencil_taskloop.f90 b/FORTRAN/stencil_taskloop.f90 index 5111b5ec4..77735c322 100644 --- a/FORTRAN/stencil_taskloop.f90 +++ b/FORTRAN/stencil_taskloop.f90 @@ -10,10 +10,10 @@ subroutine star1(n, in, out) !$omp simd do j=1,n-1-1 out(i,j) = out(i,j) & - + in(i+0,j-1) * (-0.5) & - + in(i-1,j+0) * (-0.5) & - + in(i+1,j+0) * (0.5) & - + in(i+0,j+1) * (0.5) & + + in(i+0,j-1) * (-0.5d0) & + + in(i-1,j+0) * (-0.5d0) & + + in(i+1,j+0) * (0.5d0) & + + in(i+0,j+1) * (0.5d0) & +0.0 end do !$omp end simd @@ -33,14 +33,14 @@ subroutine star2(n, in, out) !$omp simd do j=2,n-2-1 out(i,j) = out(i,j) & - + in(i+0,j-2) * (-0.125) & - + in(i+0,j-1) * (-0.25) & - + in(i-2,j+0) * (-0.125) & - + in(i-1,j+0) * (-0.25) & - + in(i+1,j+0) * (0.25) & - + in(i+2,j+0) * (0.125) & - + in(i+0,j+1) * (0.25) & - + in(i+0,j+2) * (0.125) & + + 
in(i+0,j-2) * (-0.125d0) & + + in(i+0,j-1) * (-0.25d0) & + + in(i-2,j+0) * (-0.125d0) & + + in(i-1,j+0) * (-0.25d0) & + + in(i+1,j+0) * (0.25d0) & + + in(i+2,j+0) * (0.125d0) & + + in(i+0,j+1) * (0.25d0) & + + in(i+0,j+2) * (0.125d0) & +0.0 end do !$omp end simd @@ -60,18 +60,18 @@ subroutine star3(n, in, out) !$omp simd do j=3,n-3-1 out(i,j) = out(i,j) & - + in(i+0,j-3) * (-0.05555555555555555) & - + in(i+0,j-2) * (-0.08333333333333333) & - + in(i+0,j-1) * (-0.16666666666666666) & - + in(i-3,j+0) * (-0.05555555555555555) & - + in(i-2,j+0) * (-0.08333333333333333) & - + in(i-1,j+0) * (-0.16666666666666666) & - + in(i+1,j+0) * (0.16666666666666666) & - + in(i+2,j+0) * (0.08333333333333333) & - + in(i+3,j+0) * (0.05555555555555555) & - + in(i+0,j+1) * (0.16666666666666666) & - + in(i+0,j+2) * (0.08333333333333333) & - + in(i+0,j+3) * (0.05555555555555555) & + + in(i+0,j-3) * (-0.05555555555555555d0) & + + in(i+0,j-2) * (-0.08333333333333333d0) & + + in(i+0,j-1) * (-0.16666666666666666d0) & + + in(i-3,j+0) * (-0.05555555555555555d0) & + + in(i-2,j+0) * (-0.08333333333333333d0) & + + in(i-1,j+0) * (-0.16666666666666666d0) & + + in(i+1,j+0) * (0.16666666666666666d0) & + + in(i+2,j+0) * (0.08333333333333333d0) & + + in(i+3,j+0) * (0.05555555555555555d0) & + + in(i+0,j+1) * (0.16666666666666666d0) & + + in(i+0,j+2) * (0.08333333333333333d0) & + + in(i+0,j+3) * (0.05555555555555555d0) & +0.0 end do !$omp end simd @@ -91,22 +91,22 @@ subroutine star4(n, in, out) !$omp simd do j=4,n-4-1 out(i,j) = out(i,j) & - + in(i+0,j-4) * (-0.03125) & - + in(i+0,j-3) * (-0.041666666666666664) & - + in(i+0,j-2) * (-0.0625) & - + in(i+0,j-1) * (-0.125) & - + in(i-4,j+0) * (-0.03125) & - + in(i-3,j+0) * (-0.041666666666666664) & - + in(i-2,j+0) * (-0.0625) & - + in(i-1,j+0) * (-0.125) & - + in(i+1,j+0) * (0.125) & - + in(i+2,j+0) * (0.0625) & - + in(i+3,j+0) * (0.041666666666666664) & - + in(i+4,j+0) * (0.03125) & - + in(i+0,j+1) * (0.125) & - + in(i+0,j+2) * (0.0625) & - + in(i+0,j+3) * 
(0.041666666666666664) & - + in(i+0,j+4) * (0.03125) & + + in(i+0,j-4) * (-0.03125d0) & + + in(i+0,j-3) * (-0.041666666666666664d0) & + + in(i+0,j-2) * (-0.0625d0) & + + in(i+0,j-1) * (-0.125d0) & + + in(i-4,j+0) * (-0.03125d0) & + + in(i-3,j+0) * (-0.041666666666666664d0) & + + in(i-2,j+0) * (-0.0625d0) & + + in(i-1,j+0) * (-0.125d0) & + + in(i+1,j+0) * (0.125d0) & + + in(i+2,j+0) * (0.0625d0) & + + in(i+3,j+0) * (0.041666666666666664d0) & + + in(i+4,j+0) * (0.03125d0) & + + in(i+0,j+1) * (0.125d0) & + + in(i+0,j+2) * (0.0625d0) & + + in(i+0,j+3) * (0.041666666666666664d0) & + + in(i+0,j+4) * (0.03125d0) & +0.0 end do !$omp end simd @@ -126,26 +126,26 @@ subroutine star5(n, in, out) !$omp simd do j=5,n-5-1 out(i,j) = out(i,j) & - + in(i+0,j-5) * (-0.02) & - + in(i+0,j-4) * (-0.025) & - + in(i+0,j-3) * (-0.03333333333333333) & - + in(i+0,j-2) * (-0.05) & - + in(i+0,j-1) * (-0.1) & - + in(i-5,j+0) * (-0.02) & - + in(i-4,j+0) * (-0.025) & - + in(i-3,j+0) * (-0.03333333333333333) & - + in(i-2,j+0) * (-0.05) & - + in(i-1,j+0) * (-0.1) & - + in(i+1,j+0) * (0.1) & - + in(i+2,j+0) * (0.05) & - + in(i+3,j+0) * (0.03333333333333333) & - + in(i+4,j+0) * (0.025) & - + in(i+5,j+0) * (0.02) & - + in(i+0,j+1) * (0.1) & - + in(i+0,j+2) * (0.05) & - + in(i+0,j+3) * (0.03333333333333333) & - + in(i+0,j+4) * (0.025) & - + in(i+0,j+5) * (0.02) & + + in(i+0,j-5) * (-0.02d0) & + + in(i+0,j-4) * (-0.025d0) & + + in(i+0,j-3) * (-0.03333333333333333d0) & + + in(i+0,j-2) * (-0.05d0) & + + in(i+0,j-1) * (-0.1d0) & + + in(i-5,j+0) * (-0.02d0) & + + in(i-4,j+0) * (-0.025d0) & + + in(i-3,j+0) * (-0.03333333333333333d0) & + + in(i-2,j+0) * (-0.05d0) & + + in(i-1,j+0) * (-0.1d0) & + + in(i+1,j+0) * (0.1d0) & + + in(i+2,j+0) * (0.05d0) & + + in(i+3,j+0) * (0.03333333333333333d0) & + + in(i+4,j+0) * (0.025d0) & + + in(i+5,j+0) * (0.02d0) & + + in(i+0,j+1) * (0.1d0) & + + in(i+0,j+2) * (0.05d0) & + + in(i+0,j+3) * (0.03333333333333333d0) & + + in(i+0,j+4) * (0.025d0) & + + in(i+0,j+5) * (0.02d0) & 
+0.0 end do !$omp end simd @@ -165,30 +165,30 @@ subroutine star6(n, in, out) !$omp simd do j=6,n-6-1 out(i,j) = out(i,j) & - + in(i+0,j-6) * (-0.013888888888888888) & - + in(i+0,j-5) * (-0.016666666666666666) & - + in(i+0,j-4) * (-0.020833333333333332) & - + in(i+0,j-3) * (-0.027777777777777776) & - + in(i+0,j-2) * (-0.041666666666666664) & - + in(i+0,j-1) * (-0.08333333333333333) & - + in(i-6,j+0) * (-0.013888888888888888) & - + in(i-5,j+0) * (-0.016666666666666666) & - + in(i-4,j+0) * (-0.020833333333333332) & - + in(i-3,j+0) * (-0.027777777777777776) & - + in(i-2,j+0) * (-0.041666666666666664) & - + in(i-1,j+0) * (-0.08333333333333333) & - + in(i+1,j+0) * (0.08333333333333333) & - + in(i+2,j+0) * (0.041666666666666664) & - + in(i+3,j+0) * (0.027777777777777776) & - + in(i+4,j+0) * (0.020833333333333332) & - + in(i+5,j+0) * (0.016666666666666666) & - + in(i+6,j+0) * (0.013888888888888888) & - + in(i+0,j+1) * (0.08333333333333333) & - + in(i+0,j+2) * (0.041666666666666664) & - + in(i+0,j+3) * (0.027777777777777776) & - + in(i+0,j+4) * (0.020833333333333332) & - + in(i+0,j+5) * (0.016666666666666666) & - + in(i+0,j+6) * (0.013888888888888888) & + + in(i+0,j-6) * (-0.013888888888888888d0) & + + in(i+0,j-5) * (-0.016666666666666666d0) & + + in(i+0,j-4) * (-0.020833333333333332d0) & + + in(i+0,j-3) * (-0.027777777777777776d0) & + + in(i+0,j-2) * (-0.041666666666666664d0) & + + in(i+0,j-1) * (-0.08333333333333333d0) & + + in(i-6,j+0) * (-0.013888888888888888d0) & + + in(i-5,j+0) * (-0.016666666666666666d0) & + + in(i-4,j+0) * (-0.020833333333333332d0) & + + in(i-3,j+0) * (-0.027777777777777776d0) & + + in(i-2,j+0) * (-0.041666666666666664d0) & + + in(i-1,j+0) * (-0.08333333333333333d0) & + + in(i+1,j+0) * (0.08333333333333333d0) & + + in(i+2,j+0) * (0.041666666666666664d0) & + + in(i+3,j+0) * (0.027777777777777776d0) & + + in(i+4,j+0) * (0.020833333333333332d0) & + + in(i+5,j+0) * (0.016666666666666666d0) & + + in(i+6,j+0) * (0.013888888888888888d0) & + + in(i+0,j+1) 
* (0.08333333333333333d0) & + + in(i+0,j+2) * (0.041666666666666664d0) & + + in(i+0,j+3) * (0.027777777777777776d0) & + + in(i+0,j+4) * (0.020833333333333332d0) & + + in(i+0,j+5) * (0.016666666666666666d0) & + + in(i+0,j+6) * (0.013888888888888888d0) & +0.0 end do !$omp end simd @@ -208,34 +208,34 @@ subroutine star7(n, in, out) !$omp simd do j=7,n-7-1 out(i,j) = out(i,j) & - + in(i+0,j-7) * (-0.01020408163265306) & - + in(i+0,j-6) * (-0.011904761904761904) & - + in(i+0,j-5) * (-0.014285714285714285) & - + in(i+0,j-4) * (-0.017857142857142856) & - + in(i+0,j-3) * (-0.023809523809523808) & - + in(i+0,j-2) * (-0.03571428571428571) & - + in(i+0,j-1) * (-0.07142857142857142) & - + in(i-7,j+0) * (-0.01020408163265306) & - + in(i-6,j+0) * (-0.011904761904761904) & - + in(i-5,j+0) * (-0.014285714285714285) & - + in(i-4,j+0) * (-0.017857142857142856) & - + in(i-3,j+0) * (-0.023809523809523808) & - + in(i-2,j+0) * (-0.03571428571428571) & - + in(i-1,j+0) * (-0.07142857142857142) & - + in(i+1,j+0) * (0.07142857142857142) & - + in(i+2,j+0) * (0.03571428571428571) & - + in(i+3,j+0) * (0.023809523809523808) & - + in(i+4,j+0) * (0.017857142857142856) & - + in(i+5,j+0) * (0.014285714285714285) & - + in(i+6,j+0) * (0.011904761904761904) & - + in(i+7,j+0) * (0.01020408163265306) & - + in(i+0,j+1) * (0.07142857142857142) & - + in(i+0,j+2) * (0.03571428571428571) & - + in(i+0,j+3) * (0.023809523809523808) & - + in(i+0,j+4) * (0.017857142857142856) & - + in(i+0,j+5) * (0.014285714285714285) & - + in(i+0,j+6) * (0.011904761904761904) & - + in(i+0,j+7) * (0.01020408163265306) & + + in(i+0,j-7) * (-0.01020408163265306d0) & + + in(i+0,j-6) * (-0.011904761904761904d0) & + + in(i+0,j-5) * (-0.014285714285714285d0) & + + in(i+0,j-4) * (-0.017857142857142856d0) & + + in(i+0,j-3) * (-0.023809523809523808d0) & + + in(i+0,j-2) * (-0.03571428571428571d0) & + + in(i+0,j-1) * (-0.07142857142857142d0) & + + in(i-7,j+0) * (-0.01020408163265306d0) & + + in(i-6,j+0) * (-0.011904761904761904d0) & + + 
in(i-5,j+0) * (-0.014285714285714285d0) & + + in(i-4,j+0) * (-0.017857142857142856d0) & + + in(i-3,j+0) * (-0.023809523809523808d0) & + + in(i-2,j+0) * (-0.03571428571428571d0) & + + in(i-1,j+0) * (-0.07142857142857142d0) & + + in(i+1,j+0) * (0.07142857142857142d0) & + + in(i+2,j+0) * (0.03571428571428571d0) & + + in(i+3,j+0) * (0.023809523809523808d0) & + + in(i+4,j+0) * (0.017857142857142856d0) & + + in(i+5,j+0) * (0.014285714285714285d0) & + + in(i+6,j+0) * (0.011904761904761904d0) & + + in(i+7,j+0) * (0.01020408163265306d0) & + + in(i+0,j+1) * (0.07142857142857142d0) & + + in(i+0,j+2) * (0.03571428571428571d0) & + + in(i+0,j+3) * (0.023809523809523808d0) & + + in(i+0,j+4) * (0.017857142857142856d0) & + + in(i+0,j+5) * (0.014285714285714285d0) & + + in(i+0,j+6) * (0.011904761904761904d0) & + + in(i+0,j+7) * (0.01020408163265306d0) & +0.0 end do !$omp end simd @@ -255,38 +255,38 @@ subroutine star8(n, in, out) !$omp simd do j=8,n-8-1 out(i,j) = out(i,j) & - + in(i+0,j-8) * (-0.0078125) & - + in(i+0,j-7) * (-0.008928571428571428) & - + in(i+0,j-6) * (-0.010416666666666666) & - + in(i+0,j-5) * (-0.0125) & - + in(i+0,j-4) * (-0.015625) & - + in(i+0,j-3) * (-0.020833333333333332) & - + in(i+0,j-2) * (-0.03125) & - + in(i+0,j-1) * (-0.0625) & - + in(i-8,j+0) * (-0.0078125) & - + in(i-7,j+0) * (-0.008928571428571428) & - + in(i-6,j+0) * (-0.010416666666666666) & - + in(i-5,j+0) * (-0.0125) & - + in(i-4,j+0) * (-0.015625) & - + in(i-3,j+0) * (-0.020833333333333332) & - + in(i-2,j+0) * (-0.03125) & - + in(i-1,j+0) * (-0.0625) & - + in(i+1,j+0) * (0.0625) & - + in(i+2,j+0) * (0.03125) & - + in(i+3,j+0) * (0.020833333333333332) & - + in(i+4,j+0) * (0.015625) & - + in(i+5,j+0) * (0.0125) & - + in(i+6,j+0) * (0.010416666666666666) & - + in(i+7,j+0) * (0.008928571428571428) & - + in(i+8,j+0) * (0.0078125) & - + in(i+0,j+1) * (0.0625) & - + in(i+0,j+2) * (0.03125) & - + in(i+0,j+3) * (0.020833333333333332) & - + in(i+0,j+4) * (0.015625) & - + in(i+0,j+5) * (0.0125) & - + 
in(i+0,j+6) * (0.010416666666666666) & - + in(i+0,j+7) * (0.008928571428571428) & - + in(i+0,j+8) * (0.0078125) & + + in(i+0,j-8) * (-0.0078125d0) & + + in(i+0,j-7) * (-0.008928571428571428d0) & + + in(i+0,j-6) * (-0.010416666666666666d0) & + + in(i+0,j-5) * (-0.0125d0) & + + in(i+0,j-4) * (-0.015625d0) & + + in(i+0,j-3) * (-0.020833333333333332d0) & + + in(i+0,j-2) * (-0.03125d0) & + + in(i+0,j-1) * (-0.0625d0) & + + in(i-8,j+0) * (-0.0078125d0) & + + in(i-7,j+0) * (-0.008928571428571428d0) & + + in(i-6,j+0) * (-0.010416666666666666d0) & + + in(i-5,j+0) * (-0.0125d0) & + + in(i-4,j+0) * (-0.015625d0) & + + in(i-3,j+0) * (-0.020833333333333332d0) & + + in(i-2,j+0) * (-0.03125d0) & + + in(i-1,j+0) * (-0.0625d0) & + + in(i+1,j+0) * (0.0625d0) & + + in(i+2,j+0) * (0.03125d0) & + + in(i+3,j+0) * (0.020833333333333332d0) & + + in(i+4,j+0) * (0.015625d0) & + + in(i+5,j+0) * (0.0125d0) & + + in(i+6,j+0) * (0.010416666666666666d0) & + + in(i+7,j+0) * (0.008928571428571428d0) & + + in(i+8,j+0) * (0.0078125d0) & + + in(i+0,j+1) * (0.0625d0) & + + in(i+0,j+2) * (0.03125d0) & + + in(i+0,j+3) * (0.020833333333333332d0) & + + in(i+0,j+4) * (0.015625d0) & + + in(i+0,j+5) * (0.0125d0) & + + in(i+0,j+6) * (0.010416666666666666d0) & + + in(i+0,j+7) * (0.008928571428571428d0) & + + in(i+0,j+8) * (0.0078125d0) & +0.0 end do !$omp end simd @@ -306,42 +306,42 @@ subroutine star9(n, in, out) !$omp simd do j=9,n-9-1 out(i,j) = out(i,j) & - + in(i+0,j-9) * (-0.006172839506172839) & - + in(i+0,j-8) * (-0.006944444444444444) & - + in(i+0,j-7) * (-0.007936507936507936) & - + in(i+0,j-6) * (-0.009259259259259259) & - + in(i+0,j-5) * (-0.011111111111111112) & - + in(i+0,j-4) * (-0.013888888888888888) & - + in(i+0,j-3) * (-0.018518518518518517) & - + in(i+0,j-2) * (-0.027777777777777776) & - + in(i+0,j-1) * (-0.05555555555555555) & - + in(i-9,j+0) * (-0.006172839506172839) & - + in(i-8,j+0) * (-0.006944444444444444) & - + in(i-7,j+0) * (-0.007936507936507936) & - + in(i-6,j+0) * 
(-0.009259259259259259) & - + in(i-5,j+0) * (-0.011111111111111112) & - + in(i-4,j+0) * (-0.013888888888888888) & - + in(i-3,j+0) * (-0.018518518518518517) & - + in(i-2,j+0) * (-0.027777777777777776) & - + in(i-1,j+0) * (-0.05555555555555555) & - + in(i+1,j+0) * (0.05555555555555555) & - + in(i+2,j+0) * (0.027777777777777776) & - + in(i+3,j+0) * (0.018518518518518517) & - + in(i+4,j+0) * (0.013888888888888888) & - + in(i+5,j+0) * (0.011111111111111112) & - + in(i+6,j+0) * (0.009259259259259259) & - + in(i+7,j+0) * (0.007936507936507936) & - + in(i+8,j+0) * (0.006944444444444444) & - + in(i+9,j+0) * (0.006172839506172839) & - + in(i+0,j+1) * (0.05555555555555555) & - + in(i+0,j+2) * (0.027777777777777776) & - + in(i+0,j+3) * (0.018518518518518517) & - + in(i+0,j+4) * (0.013888888888888888) & - + in(i+0,j+5) * (0.011111111111111112) & - + in(i+0,j+6) * (0.009259259259259259) & - + in(i+0,j+7) * (0.007936507936507936) & - + in(i+0,j+8) * (0.006944444444444444) & - + in(i+0,j+9) * (0.006172839506172839) & + + in(i+0,j-9) * (-0.006172839506172839d0) & + + in(i+0,j-8) * (-0.006944444444444444d0) & + + in(i+0,j-7) * (-0.007936507936507936d0) & + + in(i+0,j-6) * (-0.009259259259259259d0) & + + in(i+0,j-5) * (-0.011111111111111112d0) & + + in(i+0,j-4) * (-0.013888888888888888d0) & + + in(i+0,j-3) * (-0.018518518518518517d0) & + + in(i+0,j-2) * (-0.027777777777777776d0) & + + in(i+0,j-1) * (-0.05555555555555555d0) & + + in(i-9,j+0) * (-0.006172839506172839d0) & + + in(i-8,j+0) * (-0.006944444444444444d0) & + + in(i-7,j+0) * (-0.007936507936507936d0) & + + in(i-6,j+0) * (-0.009259259259259259d0) & + + in(i-5,j+0) * (-0.011111111111111112d0) & + + in(i-4,j+0) * (-0.013888888888888888d0) & + + in(i-3,j+0) * (-0.018518518518518517d0) & + + in(i-2,j+0) * (-0.027777777777777776d0) & + + in(i-1,j+0) * (-0.05555555555555555d0) & + + in(i+1,j+0) * (0.05555555555555555d0) & + + in(i+2,j+0) * (0.027777777777777776d0) & + + in(i+3,j+0) * (0.018518518518518517d0) & + + in(i+4,j+0) * 
(0.013888888888888888d0) & + + in(i+5,j+0) * (0.011111111111111112d0) & + + in(i+6,j+0) * (0.009259259259259259d0) & + + in(i+7,j+0) * (0.007936507936507936d0) & + + in(i+8,j+0) * (0.006944444444444444d0) & + + in(i+9,j+0) * (0.006172839506172839d0) & + + in(i+0,j+1) * (0.05555555555555555d0) & + + in(i+0,j+2) * (0.027777777777777776d0) & + + in(i+0,j+3) * (0.018518518518518517d0) & + + in(i+0,j+4) * (0.013888888888888888d0) & + + in(i+0,j+5) * (0.011111111111111112d0) & + + in(i+0,j+6) * (0.009259259259259259d0) & + + in(i+0,j+7) * (0.007936507936507936d0) & + + in(i+0,j+8) * (0.006944444444444444d0) & + + in(i+0,j+9) * (0.006172839506172839d0) & +0.0 end do !$omp end simd @@ -361,10 +361,10 @@ subroutine grid1(n, in, out) !$omp simd do j=1,n-1-1 out(i,j) = out(i,j) & - + in(i-1,j-1) * (-0.25) & - + in(i+1,j-1) * (-0.25) & - + in(i-1,j+1) * (-0.25) & - + in(i+1,j+1) * (0.25) & + + in(i-1,j-1) * (-0.25d0) & + + in(i+1,j-1) * (-0.25d0) & + + in(i-1,j+1) * (-0.25d0) & + + in(i+1,j+1) * (0.25d0) & +0.0 end do !$omp end simd @@ -384,20 +384,20 @@ subroutine grid2(n, in, out) !$omp simd do j=2,n-2-1 out(i,j) = out(i,j) & - + in(i-2,j-2) * (-0.0625) & - + in(i+1,j-2) * (-0.020833333333333332) & - + in(i+2,j-2) * (-0.020833333333333332) & - + in(i-1,j-1) * (-0.125) & - + in(i+1,j-1) * (-0.125) & - + in(i+2,j-1) * (-0.125) & - + in(i-2,j+1) * (-0.020833333333333332) & - + in(i-1,j+1) * (-0.125) & - + in(i+1,j+1) * (0.125) & - + in(i+2,j+1) * (0.020833333333333332) & - + in(i-2,j+2) * (-0.020833333333333332) & - + in(i-1,j+2) * (-0.125) & - + in(i+1,j+2) * (0.020833333333333332) & - + in(i+2,j+2) * (0.0625) & + + in(i-2,j-2) * (-0.0625d0) & + + in(i+1,j-2) * (-0.020833333333333332d0) & + + in(i+2,j-2) * (-0.020833333333333332d0) & + + in(i-1,j-1) * (-0.125d0) & + + in(i+1,j-1) * (-0.125d0) & + + in(i+2,j-1) * (-0.125d0) & + + in(i-2,j+1) * (-0.020833333333333332d0) & + + in(i-1,j+1) * (-0.125d0) & + + in(i+1,j+1) * (0.125d0) & + + in(i+2,j+1) * (0.020833333333333332d0) & + 
+ in(i-2,j+2) * (-0.020833333333333332d0) & + + in(i-1,j+2) * (-0.125d0) & + + in(i+1,j+2) * (0.020833333333333332d0) & + + in(i+2,j+2) * (0.0625d0) & +0.0 end do !$omp end simd @@ -417,36 +417,36 @@ subroutine grid3(n, in, out) !$omp simd do j=3,n-3-1 out(i,j) = out(i,j) & - + in(i-3,j-3) * (-0.027777777777777776) & - + in(i+1,j-3) * (-0.005555555555555556) & - + in(i+2,j-3) * (-0.005555555555555556) & - + in(i+3,j-3) * (-0.005555555555555556) & - + in(i-2,j-2) * (-0.041666666666666664) & - + in(i+1,j-2) * (-0.013888888888888888) & - + in(i+2,j-2) * (-0.013888888888888888) & - + in(i+3,j-2) * (-0.013888888888888888) & - + in(i-1,j-1) * (-0.08333333333333333) & - + in(i+1,j-1) * (-0.08333333333333333) & - + in(i+2,j-1) * (-0.08333333333333333) & - + in(i+3,j-1) * (-0.08333333333333333) & - + in(i-3,j+1) * (-0.005555555555555556) & - + in(i-2,j+1) * (-0.013888888888888888) & - + in(i-1,j+1) * (-0.08333333333333333) & - + in(i+1,j+1) * (0.08333333333333333) & - + in(i+2,j+1) * (0.013888888888888888) & - + in(i+3,j+1) * (0.005555555555555556) & - + in(i-3,j+2) * (-0.005555555555555556) & - + in(i-2,j+2) * (-0.013888888888888888) & - + in(i-1,j+2) * (-0.08333333333333333) & - + in(i+1,j+2) * (0.013888888888888888) & - + in(i+2,j+2) * (0.041666666666666664) & - + in(i+3,j+2) * (0.005555555555555556) & - + in(i-3,j+3) * (-0.005555555555555556) & - + in(i-2,j+3) * (-0.013888888888888888) & - + in(i-1,j+3) * (-0.08333333333333333) & - + in(i+1,j+3) * (0.005555555555555556) & - + in(i+2,j+3) * (0.005555555555555556) & - + in(i+3,j+3) * (0.027777777777777776) & + + in(i-3,j-3) * (-0.027777777777777776d0) & + + in(i+1,j-3) * (-0.005555555555555556d0) & + + in(i+2,j-3) * (-0.005555555555555556d0) & + + in(i+3,j-3) * (-0.005555555555555556d0) & + + in(i-2,j-2) * (-0.041666666666666664d0) & + + in(i+1,j-2) * (-0.013888888888888888d0) & + + in(i+2,j-2) * (-0.013888888888888888d0) & + + in(i+3,j-2) * (-0.013888888888888888d0) & + + in(i-1,j-1) * (-0.08333333333333333d0) & + + 
in(i+1,j-1) * (-0.08333333333333333d0) & + + in(i+2,j-1) * (-0.08333333333333333d0) & + + in(i+3,j-1) * (-0.08333333333333333d0) & + + in(i-3,j+1) * (-0.005555555555555556d0) & + + in(i-2,j+1) * (-0.013888888888888888d0) & + + in(i-1,j+1) * (-0.08333333333333333d0) & + + in(i+1,j+1) * (0.08333333333333333d0) & + + in(i+2,j+1) * (0.013888888888888888d0) & + + in(i+3,j+1) * (0.005555555555555556d0) & + + in(i-3,j+2) * (-0.005555555555555556d0) & + + in(i-2,j+2) * (-0.013888888888888888d0) & + + in(i-1,j+2) * (-0.08333333333333333d0) & + + in(i+1,j+2) * (0.013888888888888888d0) & + + in(i+2,j+2) * (0.041666666666666664d0) & + + in(i+3,j+2) * (0.005555555555555556d0) & + + in(i-3,j+3) * (-0.005555555555555556d0) & + + in(i-2,j+3) * (-0.013888888888888888d0) & + + in(i-1,j+3) * (-0.08333333333333333d0) & + + in(i+1,j+3) * (0.005555555555555556d0) & + + in(i+2,j+3) * (0.005555555555555556d0) & + + in(i+3,j+3) * (0.027777777777777776d0) & +0.0 end do !$omp end simd @@ -466,58 +466,58 @@ subroutine grid4(n, in, out) !$omp simd do j=4,n-4-1 out(i,j) = out(i,j) & - + in(i-4,j-4) * (-0.015625) & - + in(i+1,j-4) * (-0.002232142857142857) & - + in(i+2,j-4) * (-0.002232142857142857) & - + in(i+3,j-4) * (-0.002232142857142857) & - + in(i+4,j-4) * (-0.002232142857142857) & - + in(i-3,j-3) * (-0.020833333333333332) & - + in(i+1,j-3) * (-0.004166666666666667) & - + in(i+2,j-3) * (-0.004166666666666667) & - + in(i+3,j-3) * (-0.004166666666666667) & - + in(i+4,j-3) * (-0.004166666666666667) & - + in(i-2,j-2) * (-0.03125) & - + in(i+1,j-2) * (-0.010416666666666666) & - + in(i+2,j-2) * (-0.010416666666666666) & - + in(i+3,j-2) * (-0.010416666666666666) & - + in(i+4,j-2) * (-0.010416666666666666) & - + in(i-1,j-1) * (-0.0625) & - + in(i+1,j-1) * (-0.0625) & - + in(i+2,j-1) * (-0.0625) & - + in(i+3,j-1) * (-0.0625) & - + in(i+4,j-1) * (-0.0625) & - + in(i-4,j+1) * (-0.002232142857142857) & - + in(i-3,j+1) * (-0.004166666666666667) & - + in(i-2,j+1) * (-0.010416666666666666) & - + 
in(i-1,j+1) * (-0.0625) & - + in(i+1,j+1) * (0.0625) & - + in(i+2,j+1) * (0.010416666666666666) & - + in(i+3,j+1) * (0.004166666666666667) & - + in(i+4,j+1) * (0.002232142857142857) & - + in(i-4,j+2) * (-0.002232142857142857) & - + in(i-3,j+2) * (-0.004166666666666667) & - + in(i-2,j+2) * (-0.010416666666666666) & - + in(i-1,j+2) * (-0.0625) & - + in(i+1,j+2) * (0.010416666666666666) & - + in(i+2,j+2) * (0.03125) & - + in(i+3,j+2) * (0.004166666666666667) & - + in(i+4,j+2) * (0.002232142857142857) & - + in(i-4,j+3) * (-0.002232142857142857) & - + in(i-3,j+3) * (-0.004166666666666667) & - + in(i-2,j+3) * (-0.010416666666666666) & - + in(i-1,j+3) * (-0.0625) & - + in(i+1,j+3) * (0.004166666666666667) & - + in(i+2,j+3) * (0.004166666666666667) & - + in(i+3,j+3) * (0.020833333333333332) & - + in(i+4,j+3) * (0.002232142857142857) & - + in(i-4,j+4) * (-0.002232142857142857) & - + in(i-3,j+4) * (-0.004166666666666667) & - + in(i-2,j+4) * (-0.010416666666666666) & - + in(i-1,j+4) * (-0.0625) & - + in(i+1,j+4) * (0.002232142857142857) & - + in(i+2,j+4) * (0.002232142857142857) & - + in(i+3,j+4) * (0.002232142857142857) & - + in(i+4,j+4) * (0.015625) & + + in(i-4,j-4) * (-0.015625d0) & + + in(i+1,j-4) * (-0.002232142857142857d0) & + + in(i+2,j-4) * (-0.002232142857142857d0) & + + in(i+3,j-4) * (-0.002232142857142857d0) & + + in(i+4,j-4) * (-0.002232142857142857d0) & + + in(i-3,j-3) * (-0.020833333333333332d0) & + + in(i+1,j-3) * (-0.004166666666666667d0) & + + in(i+2,j-3) * (-0.004166666666666667d0) & + + in(i+3,j-3) * (-0.004166666666666667d0) & + + in(i+4,j-3) * (-0.004166666666666667d0) & + + in(i-2,j-2) * (-0.03125d0) & + + in(i+1,j-2) * (-0.010416666666666666d0) & + + in(i+2,j-2) * (-0.010416666666666666d0) & + + in(i+3,j-2) * (-0.010416666666666666d0) & + + in(i+4,j-2) * (-0.010416666666666666d0) & + + in(i-1,j-1) * (-0.0625d0) & + + in(i+1,j-1) * (-0.0625d0) & + + in(i+2,j-1) * (-0.0625d0) & + + in(i+3,j-1) * (-0.0625d0) & + + in(i+4,j-1) * (-0.0625d0) & + + 
in(i-4,j+1) * (-0.002232142857142857d0) & + + in(i-3,j+1) * (-0.004166666666666667d0) & + + in(i-2,j+1) * (-0.010416666666666666d0) & + + in(i-1,j+1) * (-0.0625d0) & + + in(i+1,j+1) * (0.0625d0) & + + in(i+2,j+1) * (0.010416666666666666d0) & + + in(i+3,j+1) * (0.004166666666666667d0) & + + in(i+4,j+1) * (0.002232142857142857d0) & + + in(i-4,j+2) * (-0.002232142857142857d0) & + + in(i-3,j+2) * (-0.004166666666666667d0) & + + in(i-2,j+2) * (-0.010416666666666666d0) & + + in(i-1,j+2) * (-0.0625d0) & + + in(i+1,j+2) * (0.010416666666666666d0) & + + in(i+2,j+2) * (0.03125d0) & + + in(i+3,j+2) * (0.004166666666666667d0) & + + in(i+4,j+2) * (0.002232142857142857d0) & + + in(i-4,j+3) * (-0.002232142857142857d0) & + + in(i-3,j+3) * (-0.004166666666666667d0) & + + in(i-2,j+3) * (-0.010416666666666666d0) & + + in(i-1,j+3) * (-0.0625d0) & + + in(i+1,j+3) * (0.004166666666666667d0) & + + in(i+2,j+3) * (0.004166666666666667d0) & + + in(i+3,j+3) * (0.020833333333333332d0) & + + in(i+4,j+3) * (0.002232142857142857d0) & + + in(i-4,j+4) * (-0.002232142857142857d0) & + + in(i-3,j+4) * (-0.004166666666666667d0) & + + in(i-2,j+4) * (-0.010416666666666666d0) & + + in(i-1,j+4) * (-0.0625d0) & + + in(i+1,j+4) * (0.002232142857142857d0) & + + in(i+2,j+4) * (0.002232142857142857d0) & + + in(i+3,j+4) * (0.002232142857142857d0) & + + in(i+4,j+4) * (0.015625d0) & +0.0 end do !$omp end simd @@ -537,86 +537,86 @@ subroutine grid5(n, in, out) !$omp simd do j=5,n-5-1 out(i,j) = out(i,j) & - + in(i-5,j-5) * (-0.01) & - + in(i+1,j-5) * (-0.0011111111111111111) & - + in(i+2,j-5) * (-0.0011111111111111111) & - + in(i+3,j-5) * (-0.0011111111111111111) & - + in(i+4,j-5) * (-0.0011111111111111111) & - + in(i+5,j-5) * (-0.0011111111111111111) & - + in(i-4,j-4) * (-0.0125) & - + in(i+1,j-4) * (-0.0017857142857142857) & - + in(i+2,j-4) * (-0.0017857142857142857) & - + in(i+3,j-4) * (-0.0017857142857142857) & - + in(i+4,j-4) * (-0.0017857142857142857) & - + in(i+5,j-4) * (-0.0017857142857142857) & - + 
in(i-3,j-3) * (-0.016666666666666666) & - + in(i+1,j-3) * (-0.0033333333333333335) & - + in(i+2,j-3) * (-0.0033333333333333335) & - + in(i+3,j-3) * (-0.0033333333333333335) & - + in(i+4,j-3) * (-0.0033333333333333335) & - + in(i+5,j-3) * (-0.0033333333333333335) & - + in(i-2,j-2) * (-0.025) & - + in(i+1,j-2) * (-0.008333333333333333) & - + in(i+2,j-2) * (-0.008333333333333333) & - + in(i+3,j-2) * (-0.008333333333333333) & - + in(i+4,j-2) * (-0.008333333333333333) & - + in(i+5,j-2) * (-0.008333333333333333) & - + in(i-1,j-1) * (-0.05) & - + in(i+1,j-1) * (-0.05) & - + in(i+2,j-1) * (-0.05) & - + in(i+3,j-1) * (-0.05) & - + in(i+4,j-1) * (-0.05) & - + in(i+5,j-1) * (-0.05) & - + in(i-5,j+1) * (-0.0011111111111111111) & - + in(i-4,j+1) * (-0.0017857142857142857) & - + in(i-3,j+1) * (-0.0033333333333333335) & - + in(i-2,j+1) * (-0.008333333333333333) & - + in(i-1,j+1) * (-0.05) & - + in(i+1,j+1) * (0.05) & - + in(i+2,j+1) * (0.008333333333333333) & - + in(i+3,j+1) * (0.0033333333333333335) & - + in(i+4,j+1) * (0.0017857142857142857) & - + in(i+5,j+1) * (0.0011111111111111111) & - + in(i-5,j+2) * (-0.0011111111111111111) & - + in(i-4,j+2) * (-0.0017857142857142857) & - + in(i-3,j+2) * (-0.0033333333333333335) & - + in(i-2,j+2) * (-0.008333333333333333) & - + in(i-1,j+2) * (-0.05) & - + in(i+1,j+2) * (0.008333333333333333) & - + in(i+2,j+2) * (0.025) & - + in(i+3,j+2) * (0.0033333333333333335) & - + in(i+4,j+2) * (0.0017857142857142857) & - + in(i+5,j+2) * (0.0011111111111111111) & - + in(i-5,j+3) * (-0.0011111111111111111) & - + in(i-4,j+3) * (-0.0017857142857142857) & - + in(i-3,j+3) * (-0.0033333333333333335) & - + in(i-2,j+3) * (-0.008333333333333333) & - + in(i-1,j+3) * (-0.05) & - + in(i+1,j+3) * (0.0033333333333333335) & - + in(i+2,j+3) * (0.0033333333333333335) & - + in(i+3,j+3) * (0.016666666666666666) & - + in(i+4,j+3) * (0.0017857142857142857) & - + in(i+5,j+3) * (0.0011111111111111111) & - + in(i-5,j+4) * (-0.0011111111111111111) & - + in(i-4,j+4) * 
(-0.0017857142857142857) & - + in(i-3,j+4) * (-0.0033333333333333335) & - + in(i-2,j+4) * (-0.008333333333333333) & - + in(i-1,j+4) * (-0.05) & - + in(i+1,j+4) * (0.0017857142857142857) & - + in(i+2,j+4) * (0.0017857142857142857) & - + in(i+3,j+4) * (0.0017857142857142857) & - + in(i+4,j+4) * (0.0125) & - + in(i+5,j+4) * (0.0011111111111111111) & - + in(i-5,j+5) * (-0.0011111111111111111) & - + in(i-4,j+5) * (-0.0017857142857142857) & - + in(i-3,j+5) * (-0.0033333333333333335) & - + in(i-2,j+5) * (-0.008333333333333333) & - + in(i-1,j+5) * (-0.05) & - + in(i+1,j+5) * (0.0011111111111111111) & - + in(i+2,j+5) * (0.0011111111111111111) & - + in(i+3,j+5) * (0.0011111111111111111) & - + in(i+4,j+5) * (0.0011111111111111111) & - + in(i+5,j+5) * (0.01) & + + in(i-5,j-5) * (-0.01d0) & + + in(i+1,j-5) * (-0.0011111111111111111d0) & + + in(i+2,j-5) * (-0.0011111111111111111d0) & + + in(i+3,j-5) * (-0.0011111111111111111d0) & + + in(i+4,j-5) * (-0.0011111111111111111d0) & + + in(i+5,j-5) * (-0.0011111111111111111d0) & + + in(i-4,j-4) * (-0.0125d0) & + + in(i+1,j-4) * (-0.0017857142857142857d0) & + + in(i+2,j-4) * (-0.0017857142857142857d0) & + + in(i+3,j-4) * (-0.0017857142857142857d0) & + + in(i+4,j-4) * (-0.0017857142857142857d0) & + + in(i+5,j-4) * (-0.0017857142857142857d0) & + + in(i-3,j-3) * (-0.016666666666666666d0) & + + in(i+1,j-3) * (-0.0033333333333333335d0) & + + in(i+2,j-3) * (-0.0033333333333333335d0) & + + in(i+3,j-3) * (-0.0033333333333333335d0) & + + in(i+4,j-3) * (-0.0033333333333333335d0) & + + in(i+5,j-3) * (-0.0033333333333333335d0) & + + in(i-2,j-2) * (-0.025d0) & + + in(i+1,j-2) * (-0.008333333333333333d0) & + + in(i+2,j-2) * (-0.008333333333333333d0) & + + in(i+3,j-2) * (-0.008333333333333333d0) & + + in(i+4,j-2) * (-0.008333333333333333d0) & + + in(i+5,j-2) * (-0.008333333333333333d0) & + + in(i-1,j-1) * (-0.05d0) & + + in(i+1,j-1) * (-0.05d0) & + + in(i+2,j-1) * (-0.05d0) & + + in(i+3,j-1) * (-0.05d0) & + + in(i+4,j-1) * (-0.05d0) & + + in(i+5,j-1) 
* (-0.05d0) & + + in(i-5,j+1) * (-0.0011111111111111111d0) & + + in(i-4,j+1) * (-0.0017857142857142857d0) & + + in(i-3,j+1) * (-0.0033333333333333335d0) & + + in(i-2,j+1) * (-0.008333333333333333d0) & + + in(i-1,j+1) * (-0.05d0) & + + in(i+1,j+1) * (0.05d0) & + + in(i+2,j+1) * (0.008333333333333333d0) & + + in(i+3,j+1) * (0.0033333333333333335d0) & + + in(i+4,j+1) * (0.0017857142857142857d0) & + + in(i+5,j+1) * (0.0011111111111111111d0) & + + in(i-5,j+2) * (-0.0011111111111111111d0) & + + in(i-4,j+2) * (-0.0017857142857142857d0) & + + in(i-3,j+2) * (-0.0033333333333333335d0) & + + in(i-2,j+2) * (-0.008333333333333333d0) & + + in(i-1,j+2) * (-0.05d0) & + + in(i+1,j+2) * (0.008333333333333333d0) & + + in(i+2,j+2) * (0.025d0) & + + in(i+3,j+2) * (0.0033333333333333335d0) & + + in(i+4,j+2) * (0.0017857142857142857d0) & + + in(i+5,j+2) * (0.0011111111111111111d0) & + + in(i-5,j+3) * (-0.0011111111111111111d0) & + + in(i-4,j+3) * (-0.0017857142857142857d0) & + + in(i-3,j+3) * (-0.0033333333333333335d0) & + + in(i-2,j+3) * (-0.008333333333333333d0) & + + in(i-1,j+3) * (-0.05d0) & + + in(i+1,j+3) * (0.0033333333333333335d0) & + + in(i+2,j+3) * (0.0033333333333333335d0) & + + in(i+3,j+3) * (0.016666666666666666d0) & + + in(i+4,j+3) * (0.0017857142857142857d0) & + + in(i+5,j+3) * (0.0011111111111111111d0) & + + in(i-5,j+4) * (-0.0011111111111111111d0) & + + in(i-4,j+4) * (-0.0017857142857142857d0) & + + in(i-3,j+4) * (-0.0033333333333333335d0) & + + in(i-2,j+4) * (-0.008333333333333333d0) & + + in(i-1,j+4) * (-0.05d0) & + + in(i+1,j+4) * (0.0017857142857142857d0) & + + in(i+2,j+4) * (0.0017857142857142857d0) & + + in(i+3,j+4) * (0.0017857142857142857d0) & + + in(i+4,j+4) * (0.0125d0) & + + in(i+5,j+4) * (0.0011111111111111111d0) & + + in(i-5,j+5) * (-0.0011111111111111111d0) & + + in(i-4,j+5) * (-0.0017857142857142857d0) & + + in(i-3,j+5) * (-0.0033333333333333335d0) & + + in(i-2,j+5) * (-0.008333333333333333d0) & + + in(i-1,j+5) * (-0.05d0) & + + in(i+1,j+5) * 
(0.0011111111111111111d0) & + + in(i+2,j+5) * (0.0011111111111111111d0) & + + in(i+3,j+5) * (0.0011111111111111111d0) & + + in(i+4,j+5) * (0.0011111111111111111d0) & + + in(i+5,j+5) * (0.01d0) & +0.0 end do !$omp end simd @@ -636,120 +636,120 @@ subroutine grid6(n, in, out) !$omp simd do j=6,n-6-1 out(i,j) = out(i,j) & - + in(i-6,j-6) * (-0.006944444444444444) & - + in(i+1,j-6) * (-0.0006313131313131314) & - + in(i+2,j-6) * (-0.0006313131313131314) & - + in(i+3,j-6) * (-0.0006313131313131314) & - + in(i+4,j-6) * (-0.0006313131313131314) & - + in(i+5,j-6) * (-0.0006313131313131314) & - + in(i+6,j-6) * (-0.0006313131313131314) & - + in(i-5,j-5) * (-0.008333333333333333) & - + in(i+1,j-5) * (-0.000925925925925926) & - + in(i+2,j-5) * (-0.000925925925925926) & - + in(i+3,j-5) * (-0.000925925925925926) & - + in(i+4,j-5) * (-0.000925925925925926) & - + in(i+5,j-5) * (-0.000925925925925926) & - + in(i+6,j-5) * (-0.000925925925925926) & - + in(i-4,j-4) * (-0.010416666666666666) & - + in(i+1,j-4) * (-0.001488095238095238) & - + in(i+2,j-4) * (-0.001488095238095238) & - + in(i+3,j-4) * (-0.001488095238095238) & - + in(i+4,j-4) * (-0.001488095238095238) & - + in(i+5,j-4) * (-0.001488095238095238) & - + in(i+6,j-4) * (-0.001488095238095238) & - + in(i-3,j-3) * (-0.013888888888888888) & - + in(i+1,j-3) * (-0.002777777777777778) & - + in(i+2,j-3) * (-0.002777777777777778) & - + in(i+3,j-3) * (-0.002777777777777778) & - + in(i+4,j-3) * (-0.002777777777777778) & - + in(i+5,j-3) * (-0.002777777777777778) & - + in(i+6,j-3) * (-0.002777777777777778) & - + in(i-2,j-2) * (-0.020833333333333332) & - + in(i+1,j-2) * (-0.006944444444444444) & - + in(i+2,j-2) * (-0.006944444444444444) & - + in(i+3,j-2) * (-0.006944444444444444) & - + in(i+4,j-2) * (-0.006944444444444444) & - + in(i+5,j-2) * (-0.006944444444444444) & - + in(i+6,j-2) * (-0.006944444444444444) & - + in(i-1,j-1) * (-0.041666666666666664) & - + in(i+1,j-1) * (-0.041666666666666664) & - + in(i+2,j-1) * (-0.041666666666666664) & 
- + in(i+3,j-1) * (-0.041666666666666664) & - + in(i+4,j-1) * (-0.041666666666666664) & - + in(i+5,j-1) * (-0.041666666666666664) & - + in(i+6,j-1) * (-0.041666666666666664) & - + in(i-6,j+1) * (-0.0006313131313131314) & - + in(i-5,j+1) * (-0.000925925925925926) & - + in(i-4,j+1) * (-0.001488095238095238) & - + in(i-3,j+1) * (-0.002777777777777778) & - + in(i-2,j+1) * (-0.006944444444444444) & - + in(i-1,j+1) * (-0.041666666666666664) & - + in(i+1,j+1) * (0.041666666666666664) & - + in(i+2,j+1) * (0.006944444444444444) & - + in(i+3,j+1) * (0.002777777777777778) & - + in(i+4,j+1) * (0.001488095238095238) & - + in(i+5,j+1) * (0.000925925925925926) & - + in(i+6,j+1) * (0.0006313131313131314) & - + in(i-6,j+2) * (-0.0006313131313131314) & - + in(i-5,j+2) * (-0.000925925925925926) & - + in(i-4,j+2) * (-0.001488095238095238) & - + in(i-3,j+2) * (-0.002777777777777778) & - + in(i-2,j+2) * (-0.006944444444444444) & - + in(i-1,j+2) * (-0.041666666666666664) & - + in(i+1,j+2) * (0.006944444444444444) & - + in(i+2,j+2) * (0.020833333333333332) & - + in(i+3,j+2) * (0.002777777777777778) & - + in(i+4,j+2) * (0.001488095238095238) & - + in(i+5,j+2) * (0.000925925925925926) & - + in(i+6,j+2) * (0.0006313131313131314) & - + in(i-6,j+3) * (-0.0006313131313131314) & - + in(i-5,j+3) * (-0.000925925925925926) & - + in(i-4,j+3) * (-0.001488095238095238) & - + in(i-3,j+3) * (-0.002777777777777778) & - + in(i-2,j+3) * (-0.006944444444444444) & - + in(i-1,j+3) * (-0.041666666666666664) & - + in(i+1,j+3) * (0.002777777777777778) & - + in(i+2,j+3) * (0.002777777777777778) & - + in(i+3,j+3) * (0.013888888888888888) & - + in(i+4,j+3) * (0.001488095238095238) & - + in(i+5,j+3) * (0.000925925925925926) & - + in(i+6,j+3) * (0.0006313131313131314) & - + in(i-6,j+4) * (-0.0006313131313131314) & - + in(i-5,j+4) * (-0.000925925925925926) & - + in(i-4,j+4) * (-0.001488095238095238) & - + in(i-3,j+4) * (-0.002777777777777778) & - + in(i-2,j+4) * (-0.006944444444444444) & - + in(i-1,j+4) * 
(-0.041666666666666664) & - + in(i+1,j+4) * (0.001488095238095238) & - + in(i+2,j+4) * (0.001488095238095238) & - + in(i+3,j+4) * (0.001488095238095238) & - + in(i+4,j+4) * (0.010416666666666666) & - + in(i+5,j+4) * (0.000925925925925926) & - + in(i+6,j+4) * (0.0006313131313131314) & - + in(i-6,j+5) * (-0.0006313131313131314) & - + in(i-5,j+5) * (-0.000925925925925926) & - + in(i-4,j+5) * (-0.001488095238095238) & - + in(i-3,j+5) * (-0.002777777777777778) & - + in(i-2,j+5) * (-0.006944444444444444) & - + in(i-1,j+5) * (-0.041666666666666664) & - + in(i+1,j+5) * (0.000925925925925926) & - + in(i+2,j+5) * (0.000925925925925926) & - + in(i+3,j+5) * (0.000925925925925926) & - + in(i+4,j+5) * (0.000925925925925926) & - + in(i+5,j+5) * (0.008333333333333333) & - + in(i+6,j+5) * (0.0006313131313131314) & - + in(i-6,j+6) * (-0.0006313131313131314) & - + in(i-5,j+6) * (-0.000925925925925926) & - + in(i-4,j+6) * (-0.001488095238095238) & - + in(i-3,j+6) * (-0.002777777777777778) & - + in(i-2,j+6) * (-0.006944444444444444) & - + in(i-1,j+6) * (-0.041666666666666664) & - + in(i+1,j+6) * (0.0006313131313131314) & - + in(i+2,j+6) * (0.0006313131313131314) & - + in(i+3,j+6) * (0.0006313131313131314) & - + in(i+4,j+6) * (0.0006313131313131314) & - + in(i+5,j+6) * (0.0006313131313131314) & - + in(i+6,j+6) * (0.006944444444444444) & + + in(i-6,j-6) * (-0.006944444444444444d0) & + + in(i+1,j-6) * (-0.0006313131313131314d0) & + + in(i+2,j-6) * (-0.0006313131313131314d0) & + + in(i+3,j-6) * (-0.0006313131313131314d0) & + + in(i+4,j-6) * (-0.0006313131313131314d0) & + + in(i+5,j-6) * (-0.0006313131313131314d0) & + + in(i+6,j-6) * (-0.0006313131313131314d0) & + + in(i-5,j-5) * (-0.008333333333333333d0) & + + in(i+1,j-5) * (-0.000925925925925926d0) & + + in(i+2,j-5) * (-0.000925925925925926d0) & + + in(i+3,j-5) * (-0.000925925925925926d0) & + + in(i+4,j-5) * (-0.000925925925925926d0) & + + in(i+5,j-5) * (-0.000925925925925926d0) & + + in(i+6,j-5) * (-0.000925925925925926d0) & + + 
in(i-4,j-4) * (-0.010416666666666666d0) & + + in(i+1,j-4) * (-0.001488095238095238d0) & + + in(i+2,j-4) * (-0.001488095238095238d0) & + + in(i+3,j-4) * (-0.001488095238095238d0) & + + in(i+4,j-4) * (-0.001488095238095238d0) & + + in(i+5,j-4) * (-0.001488095238095238d0) & + + in(i+6,j-4) * (-0.001488095238095238d0) & + + in(i-3,j-3) * (-0.013888888888888888d0) & + + in(i+1,j-3) * (-0.002777777777777778d0) & + + in(i+2,j-3) * (-0.002777777777777778d0) & + + in(i+3,j-3) * (-0.002777777777777778d0) & + + in(i+4,j-3) * (-0.002777777777777778d0) & + + in(i+5,j-3) * (-0.002777777777777778d0) & + + in(i+6,j-3) * (-0.002777777777777778d0) & + + in(i-2,j-2) * (-0.020833333333333332d0) & + + in(i+1,j-2) * (-0.006944444444444444d0) & + + in(i+2,j-2) * (-0.006944444444444444d0) & + + in(i+3,j-2) * (-0.006944444444444444d0) & + + in(i+4,j-2) * (-0.006944444444444444d0) & + + in(i+5,j-2) * (-0.006944444444444444d0) & + + in(i+6,j-2) * (-0.006944444444444444d0) & + + in(i-1,j-1) * (-0.041666666666666664d0) & + + in(i+1,j-1) * (-0.041666666666666664d0) & + + in(i+2,j-1) * (-0.041666666666666664d0) & + + in(i+3,j-1) * (-0.041666666666666664d0) & + + in(i+4,j-1) * (-0.041666666666666664d0) & + + in(i+5,j-1) * (-0.041666666666666664d0) & + + in(i+6,j-1) * (-0.041666666666666664d0) & + + in(i-6,j+1) * (-0.0006313131313131314d0) & + + in(i-5,j+1) * (-0.000925925925925926d0) & + + in(i-4,j+1) * (-0.001488095238095238d0) & + + in(i-3,j+1) * (-0.002777777777777778d0) & + + in(i-2,j+1) * (-0.006944444444444444d0) & + + in(i-1,j+1) * (-0.041666666666666664d0) & + + in(i+1,j+1) * (0.041666666666666664d0) & + + in(i+2,j+1) * (0.006944444444444444d0) & + + in(i+3,j+1) * (0.002777777777777778d0) & + + in(i+4,j+1) * (0.001488095238095238d0) & + + in(i+5,j+1) * (0.000925925925925926d0) & + + in(i+6,j+1) * (0.0006313131313131314d0) & + + in(i-6,j+2) * (-0.0006313131313131314d0) & + + in(i-5,j+2) * (-0.000925925925925926d0) & + + in(i-4,j+2) * (-0.001488095238095238d0) & + + in(i-3,j+2) * 
(-0.002777777777777778d0) & + + in(i-2,j+2) * (-0.006944444444444444d0) & + + in(i-1,j+2) * (-0.041666666666666664d0) & + + in(i+1,j+2) * (0.006944444444444444d0) & + + in(i+2,j+2) * (0.020833333333333332d0) & + + in(i+3,j+2) * (0.002777777777777778d0) & + + in(i+4,j+2) * (0.001488095238095238d0) & + + in(i+5,j+2) * (0.000925925925925926d0) & + + in(i+6,j+2) * (0.0006313131313131314d0) & + + in(i-6,j+3) * (-0.0006313131313131314d0) & + + in(i-5,j+3) * (-0.000925925925925926d0) & + + in(i-4,j+3) * (-0.001488095238095238d0) & + + in(i-3,j+3) * (-0.002777777777777778d0) & + + in(i-2,j+3) * (-0.006944444444444444d0) & + + in(i-1,j+3) * (-0.041666666666666664d0) & + + in(i+1,j+3) * (0.002777777777777778d0) & + + in(i+2,j+3) * (0.002777777777777778d0) & + + in(i+3,j+3) * (0.013888888888888888d0) & + + in(i+4,j+3) * (0.001488095238095238d0) & + + in(i+5,j+3) * (0.000925925925925926d0) & + + in(i+6,j+3) * (0.0006313131313131314d0) & + + in(i-6,j+4) * (-0.0006313131313131314d0) & + + in(i-5,j+4) * (-0.000925925925925926d0) & + + in(i-4,j+4) * (-0.001488095238095238d0) & + + in(i-3,j+4) * (-0.002777777777777778d0) & + + in(i-2,j+4) * (-0.006944444444444444d0) & + + in(i-1,j+4) * (-0.041666666666666664d0) & + + in(i+1,j+4) * (0.001488095238095238d0) & + + in(i+2,j+4) * (0.001488095238095238d0) & + + in(i+3,j+4) * (0.001488095238095238d0) & + + in(i+4,j+4) * (0.010416666666666666d0) & + + in(i+5,j+4) * (0.000925925925925926d0) & + + in(i+6,j+4) * (0.0006313131313131314d0) & + + in(i-6,j+5) * (-0.0006313131313131314d0) & + + in(i-5,j+5) * (-0.000925925925925926d0) & + + in(i-4,j+5) * (-0.001488095238095238d0) & + + in(i-3,j+5) * (-0.002777777777777778d0) & + + in(i-2,j+5) * (-0.006944444444444444d0) & + + in(i-1,j+5) * (-0.041666666666666664d0) & + + in(i+1,j+5) * (0.000925925925925926d0) & + + in(i+2,j+5) * (0.000925925925925926d0) & + + in(i+3,j+5) * (0.000925925925925926d0) & + + in(i+4,j+5) * (0.000925925925925926d0) & + + in(i+5,j+5) * (0.008333333333333333d0) & + + 
in(i+6,j+5) * (0.0006313131313131314d0) & + + in(i-6,j+6) * (-0.0006313131313131314d0) & + + in(i-5,j+6) * (-0.000925925925925926d0) & + + in(i-4,j+6) * (-0.001488095238095238d0) & + + in(i-3,j+6) * (-0.002777777777777778d0) & + + in(i-2,j+6) * (-0.006944444444444444d0) & + + in(i-1,j+6) * (-0.041666666666666664d0) & + + in(i+1,j+6) * (0.0006313131313131314d0) & + + in(i+2,j+6) * (0.0006313131313131314d0) & + + in(i+3,j+6) * (0.0006313131313131314d0) & + + in(i+4,j+6) * (0.0006313131313131314d0) & + + in(i+5,j+6) * (0.0006313131313131314d0) & + + in(i+6,j+6) * (0.006944444444444444d0) & +0.0 end do !$omp end simd @@ -769,160 +769,160 @@ subroutine grid7(n, in, out) !$omp simd do j=7,n-7-1 out(i,j) = out(i,j) & - + in(i-7,j-7) * (-0.00510204081632653) & - + in(i+1,j-7) * (-0.0003924646781789639) & - + in(i+2,j-7) * (-0.0003924646781789639) & - + in(i+3,j-7) * (-0.0003924646781789639) & - + in(i+4,j-7) * (-0.0003924646781789639) & - + in(i+5,j-7) * (-0.0003924646781789639) & - + in(i+6,j-7) * (-0.0003924646781789639) & - + in(i+7,j-7) * (-0.0003924646781789639) & - + in(i-6,j-6) * (-0.005952380952380952) & - + in(i+1,j-6) * (-0.0005411255411255411) & - + in(i+2,j-6) * (-0.0005411255411255411) & - + in(i+3,j-6) * (-0.0005411255411255411) & - + in(i+4,j-6) * (-0.0005411255411255411) & - + in(i+5,j-6) * (-0.0005411255411255411) & - + in(i+6,j-6) * (-0.0005411255411255411) & - + in(i+7,j-6) * (-0.0005411255411255411) & - + in(i-5,j-5) * (-0.007142857142857143) & - + in(i+1,j-5) * (-0.0007936507936507937) & - + in(i+2,j-5) * (-0.0007936507936507937) & - + in(i+3,j-5) * (-0.0007936507936507937) & - + in(i+4,j-5) * (-0.0007936507936507937) & - + in(i+5,j-5) * (-0.0007936507936507937) & - + in(i+6,j-5) * (-0.0007936507936507937) & - + in(i+7,j-5) * (-0.0007936507936507937) & - + in(i-4,j-4) * (-0.008928571428571428) & - + in(i+1,j-4) * (-0.0012755102040816326) & - + in(i+2,j-4) * (-0.0012755102040816326) & - + in(i+3,j-4) * (-0.0012755102040816326) & - + in(i+4,j-4) * 
(-0.0012755102040816326) & - + in(i+5,j-4) * (-0.0012755102040816326) & - + in(i+6,j-4) * (-0.0012755102040816326) & - + in(i+7,j-4) * (-0.0012755102040816326) & - + in(i-3,j-3) * (-0.011904761904761904) & - + in(i+1,j-3) * (-0.002380952380952381) & - + in(i+2,j-3) * (-0.002380952380952381) & - + in(i+3,j-3) * (-0.002380952380952381) & - + in(i+4,j-3) * (-0.002380952380952381) & - + in(i+5,j-3) * (-0.002380952380952381) & - + in(i+6,j-3) * (-0.002380952380952381) & - + in(i+7,j-3) * (-0.002380952380952381) & - + in(i-2,j-2) * (-0.017857142857142856) & - + in(i+1,j-2) * (-0.005952380952380952) & - + in(i+2,j-2) * (-0.005952380952380952) & - + in(i+3,j-2) * (-0.005952380952380952) & - + in(i+4,j-2) * (-0.005952380952380952) & - + in(i+5,j-2) * (-0.005952380952380952) & - + in(i+6,j-2) * (-0.005952380952380952) & - + in(i+7,j-2) * (-0.005952380952380952) & - + in(i-1,j-1) * (-0.03571428571428571) & - + in(i+1,j-1) * (-0.03571428571428571) & - + in(i+2,j-1) * (-0.03571428571428571) & - + in(i+3,j-1) * (-0.03571428571428571) & - + in(i+4,j-1) * (-0.03571428571428571) & - + in(i+5,j-1) * (-0.03571428571428571) & - + in(i+6,j-1) * (-0.03571428571428571) & - + in(i+7,j-1) * (-0.03571428571428571) & - + in(i-7,j+1) * (-0.0003924646781789639) & - + in(i-6,j+1) * (-0.0005411255411255411) & - + in(i-5,j+1) * (-0.0007936507936507937) & - + in(i-4,j+1) * (-0.0012755102040816326) & - + in(i-3,j+1) * (-0.002380952380952381) & - + in(i-2,j+1) * (-0.005952380952380952) & - + in(i-1,j+1) * (-0.03571428571428571) & - + in(i+1,j+1) * (0.03571428571428571) & - + in(i+2,j+1) * (0.005952380952380952) & - + in(i+3,j+1) * (0.002380952380952381) & - + in(i+4,j+1) * (0.0012755102040816326) & - + in(i+5,j+1) * (0.0007936507936507937) & - + in(i+6,j+1) * (0.0005411255411255411) & - + in(i+7,j+1) * (0.0003924646781789639) & - + in(i-7,j+2) * (-0.0003924646781789639) & - + in(i-6,j+2) * (-0.0005411255411255411) & - + in(i-5,j+2) * (-0.0007936507936507937) & - + in(i-4,j+2) * 
(-0.0012755102040816326) & - + in(i-3,j+2) * (-0.002380952380952381) & - + in(i-2,j+2) * (-0.005952380952380952) & - + in(i-1,j+2) * (-0.03571428571428571) & - + in(i+1,j+2) * (0.005952380952380952) & - + in(i+2,j+2) * (0.017857142857142856) & - + in(i+3,j+2) * (0.002380952380952381) & - + in(i+4,j+2) * (0.0012755102040816326) & - + in(i+5,j+2) * (0.0007936507936507937) & - + in(i+6,j+2) * (0.0005411255411255411) & - + in(i+7,j+2) * (0.0003924646781789639) & - + in(i-7,j+3) * (-0.0003924646781789639) & - + in(i-6,j+3) * (-0.0005411255411255411) & - + in(i-5,j+3) * (-0.0007936507936507937) & - + in(i-4,j+3) * (-0.0012755102040816326) & - + in(i-3,j+3) * (-0.002380952380952381) & - + in(i-2,j+3) * (-0.005952380952380952) & - + in(i-1,j+3) * (-0.03571428571428571) & - + in(i+1,j+3) * (0.002380952380952381) & - + in(i+2,j+3) * (0.002380952380952381) & - + in(i+3,j+3) * (0.011904761904761904) & - + in(i+4,j+3) * (0.0012755102040816326) & - + in(i+5,j+3) * (0.0007936507936507937) & - + in(i+6,j+3) * (0.0005411255411255411) & - + in(i+7,j+3) * (0.0003924646781789639) & - + in(i-7,j+4) * (-0.0003924646781789639) & - + in(i-6,j+4) * (-0.0005411255411255411) & - + in(i-5,j+4) * (-0.0007936507936507937) & - + in(i-4,j+4) * (-0.0012755102040816326) & - + in(i-3,j+4) * (-0.002380952380952381) & - + in(i-2,j+4) * (-0.005952380952380952) & - + in(i-1,j+4) * (-0.03571428571428571) & - + in(i+1,j+4) * (0.0012755102040816326) & - + in(i+2,j+4) * (0.0012755102040816326) & - + in(i+3,j+4) * (0.0012755102040816326) & - + in(i+4,j+4) * (0.008928571428571428) & - + in(i+5,j+4) * (0.0007936507936507937) & - + in(i+6,j+4) * (0.0005411255411255411) & - + in(i+7,j+4) * (0.0003924646781789639) & - + in(i-7,j+5) * (-0.0003924646781789639) & - + in(i-6,j+5) * (-0.0005411255411255411) & - + in(i-5,j+5) * (-0.0007936507936507937) & - + in(i-4,j+5) * (-0.0012755102040816326) & - + in(i-3,j+5) * (-0.002380952380952381) & - + in(i-2,j+5) * (-0.005952380952380952) & - + in(i-1,j+5) * 
(-0.03571428571428571) & - + in(i+1,j+5) * (0.0007936507936507937) & - + in(i+2,j+5) * (0.0007936507936507937) & - + in(i+3,j+5) * (0.0007936507936507937) & - + in(i+4,j+5) * (0.0007936507936507937) & - + in(i+5,j+5) * (0.007142857142857143) & - + in(i+6,j+5) * (0.0005411255411255411) & - + in(i+7,j+5) * (0.0003924646781789639) & - + in(i-7,j+6) * (-0.0003924646781789639) & - + in(i-6,j+6) * (-0.0005411255411255411) & - + in(i-5,j+6) * (-0.0007936507936507937) & - + in(i-4,j+6) * (-0.0012755102040816326) & - + in(i-3,j+6) * (-0.002380952380952381) & - + in(i-2,j+6) * (-0.005952380952380952) & - + in(i-1,j+6) * (-0.03571428571428571) & - + in(i+1,j+6) * (0.0005411255411255411) & - + in(i+2,j+6) * (0.0005411255411255411) & - + in(i+3,j+6) * (0.0005411255411255411) & - + in(i+4,j+6) * (0.0005411255411255411) & - + in(i+5,j+6) * (0.0005411255411255411) & - + in(i+6,j+6) * (0.005952380952380952) & - + in(i+7,j+6) * (0.0003924646781789639) & - + in(i-7,j+7) * (-0.0003924646781789639) & - + in(i-6,j+7) * (-0.0005411255411255411) & - + in(i-5,j+7) * (-0.0007936507936507937) & - + in(i-4,j+7) * (-0.0012755102040816326) & - + in(i-3,j+7) * (-0.002380952380952381) & - + in(i-2,j+7) * (-0.005952380952380952) & - + in(i-1,j+7) * (-0.03571428571428571) & - + in(i+1,j+7) * (0.0003924646781789639) & - + in(i+2,j+7) * (0.0003924646781789639) & - + in(i+3,j+7) * (0.0003924646781789639) & - + in(i+4,j+7) * (0.0003924646781789639) & - + in(i+5,j+7) * (0.0003924646781789639) & - + in(i+6,j+7) * (0.0003924646781789639) & - + in(i+7,j+7) * (0.00510204081632653) & + + in(i-7,j-7) * (-0.00510204081632653d0) & + + in(i+1,j-7) * (-0.0003924646781789639d0) & + + in(i+2,j-7) * (-0.0003924646781789639d0) & + + in(i+3,j-7) * (-0.0003924646781789639d0) & + + in(i+4,j-7) * (-0.0003924646781789639d0) & + + in(i+5,j-7) * (-0.0003924646781789639d0) & + + in(i+6,j-7) * (-0.0003924646781789639d0) & + + in(i+7,j-7) * (-0.0003924646781789639d0) & + + in(i-6,j-6) * (-0.005952380952380952d0) & + + 
in(i+1,j-6) * (-0.0005411255411255411d0) & + + in(i+2,j-6) * (-0.0005411255411255411d0) & + + in(i+3,j-6) * (-0.0005411255411255411d0) & + + in(i+4,j-6) * (-0.0005411255411255411d0) & + + in(i+5,j-6) * (-0.0005411255411255411d0) & + + in(i+6,j-6) * (-0.0005411255411255411d0) & + + in(i+7,j-6) * (-0.0005411255411255411d0) & + + in(i-5,j-5) * (-0.007142857142857143d0) & + + in(i+1,j-5) * (-0.0007936507936507937d0) & + + in(i+2,j-5) * (-0.0007936507936507937d0) & + + in(i+3,j-5) * (-0.0007936507936507937d0) & + + in(i+4,j-5) * (-0.0007936507936507937d0) & + + in(i+5,j-5) * (-0.0007936507936507937d0) & + + in(i+6,j-5) * (-0.0007936507936507937d0) & + + in(i+7,j-5) * (-0.0007936507936507937d0) & + + in(i-4,j-4) * (-0.008928571428571428d0) & + + in(i+1,j-4) * (-0.0012755102040816326d0) & + + in(i+2,j-4) * (-0.0012755102040816326d0) & + + in(i+3,j-4) * (-0.0012755102040816326d0) & + + in(i+4,j-4) * (-0.0012755102040816326d0) & + + in(i+5,j-4) * (-0.0012755102040816326d0) & + + in(i+6,j-4) * (-0.0012755102040816326d0) & + + in(i+7,j-4) * (-0.0012755102040816326d0) & + + in(i-3,j-3) * (-0.011904761904761904d0) & + + in(i+1,j-3) * (-0.002380952380952381d0) & + + in(i+2,j-3) * (-0.002380952380952381d0) & + + in(i+3,j-3) * (-0.002380952380952381d0) & + + in(i+4,j-3) * (-0.002380952380952381d0) & + + in(i+5,j-3) * (-0.002380952380952381d0) & + + in(i+6,j-3) * (-0.002380952380952381d0) & + + in(i+7,j-3) * (-0.002380952380952381d0) & + + in(i-2,j-2) * (-0.017857142857142856d0) & + + in(i+1,j-2) * (-0.005952380952380952d0) & + + in(i+2,j-2) * (-0.005952380952380952d0) & + + in(i+3,j-2) * (-0.005952380952380952d0) & + + in(i+4,j-2) * (-0.005952380952380952d0) & + + in(i+5,j-2) * (-0.005952380952380952d0) & + + in(i+6,j-2) * (-0.005952380952380952d0) & + + in(i+7,j-2) * (-0.005952380952380952d0) & + + in(i-1,j-1) * (-0.03571428571428571d0) & + + in(i+1,j-1) * (-0.03571428571428571d0) & + + in(i+2,j-1) * (-0.03571428571428571d0) & + + in(i+3,j-1) * (-0.03571428571428571d0) & + + 
in(i+4,j-1) * (-0.03571428571428571d0) & + + in(i+5,j-1) * (-0.03571428571428571d0) & + + in(i+6,j-1) * (-0.03571428571428571d0) & + + in(i+7,j-1) * (-0.03571428571428571d0) & + + in(i-7,j+1) * (-0.0003924646781789639d0) & + + in(i-6,j+1) * (-0.0005411255411255411d0) & + + in(i-5,j+1) * (-0.0007936507936507937d0) & + + in(i-4,j+1) * (-0.0012755102040816326d0) & + + in(i-3,j+1) * (-0.002380952380952381d0) & + + in(i-2,j+1) * (-0.005952380952380952d0) & + + in(i-1,j+1) * (-0.03571428571428571d0) & + + in(i+1,j+1) * (0.03571428571428571d0) & + + in(i+2,j+1) * (0.005952380952380952d0) & + + in(i+3,j+1) * (0.002380952380952381d0) & + + in(i+4,j+1) * (0.0012755102040816326d0) & + + in(i+5,j+1) * (0.0007936507936507937d0) & + + in(i+6,j+1) * (0.0005411255411255411d0) & + + in(i+7,j+1) * (0.0003924646781789639d0) & + + in(i-7,j+2) * (-0.0003924646781789639d0) & + + in(i-6,j+2) * (-0.0005411255411255411d0) & + + in(i-5,j+2) * (-0.0007936507936507937d0) & + + in(i-4,j+2) * (-0.0012755102040816326d0) & + + in(i-3,j+2) * (-0.002380952380952381d0) & + + in(i-2,j+2) * (-0.005952380952380952d0) & + + in(i-1,j+2) * (-0.03571428571428571d0) & + + in(i+1,j+2) * (0.005952380952380952d0) & + + in(i+2,j+2) * (0.017857142857142856d0) & + + in(i+3,j+2) * (0.002380952380952381d0) & + + in(i+4,j+2) * (0.0012755102040816326d0) & + + in(i+5,j+2) * (0.0007936507936507937d0) & + + in(i+6,j+2) * (0.0005411255411255411d0) & + + in(i+7,j+2) * (0.0003924646781789639d0) & + + in(i-7,j+3) * (-0.0003924646781789639d0) & + + in(i-6,j+3) * (-0.0005411255411255411d0) & + + in(i-5,j+3) * (-0.0007936507936507937d0) & + + in(i-4,j+3) * (-0.0012755102040816326d0) & + + in(i-3,j+3) * (-0.002380952380952381d0) & + + in(i-2,j+3) * (-0.005952380952380952d0) & + + in(i-1,j+3) * (-0.03571428571428571d0) & + + in(i+1,j+3) * (0.002380952380952381d0) & + + in(i+2,j+3) * (0.002380952380952381d0) & + + in(i+3,j+3) * (0.011904761904761904d0) & + + in(i+4,j+3) * (0.0012755102040816326d0) & + + in(i+5,j+3) * 
(0.0007936507936507937d0) & + + in(i+6,j+3) * (0.0005411255411255411d0) & + + in(i+7,j+3) * (0.0003924646781789639d0) & + + in(i-7,j+4) * (-0.0003924646781789639d0) & + + in(i-6,j+4) * (-0.0005411255411255411d0) & + + in(i-5,j+4) * (-0.0007936507936507937d0) & + + in(i-4,j+4) * (-0.0012755102040816326d0) & + + in(i-3,j+4) * (-0.002380952380952381d0) & + + in(i-2,j+4) * (-0.005952380952380952d0) & + + in(i-1,j+4) * (-0.03571428571428571d0) & + + in(i+1,j+4) * (0.0012755102040816326d0) & + + in(i+2,j+4) * (0.0012755102040816326d0) & + + in(i+3,j+4) * (0.0012755102040816326d0) & + + in(i+4,j+4) * (0.008928571428571428d0) & + + in(i+5,j+4) * (0.0007936507936507937d0) & + + in(i+6,j+4) * (0.0005411255411255411d0) & + + in(i+7,j+4) * (0.0003924646781789639d0) & + + in(i-7,j+5) * (-0.0003924646781789639d0) & + + in(i-6,j+5) * (-0.0005411255411255411d0) & + + in(i-5,j+5) * (-0.0007936507936507937d0) & + + in(i-4,j+5) * (-0.0012755102040816326d0) & + + in(i-3,j+5) * (-0.002380952380952381d0) & + + in(i-2,j+5) * (-0.005952380952380952d0) & + + in(i-1,j+5) * (-0.03571428571428571d0) & + + in(i+1,j+5) * (0.0007936507936507937d0) & + + in(i+2,j+5) * (0.0007936507936507937d0) & + + in(i+3,j+5) * (0.0007936507936507937d0) & + + in(i+4,j+5) * (0.0007936507936507937d0) & + + in(i+5,j+5) * (0.007142857142857143d0) & + + in(i+6,j+5) * (0.0005411255411255411d0) & + + in(i+7,j+5) * (0.0003924646781789639d0) & + + in(i-7,j+6) * (-0.0003924646781789639d0) & + + in(i-6,j+6) * (-0.0005411255411255411d0) & + + in(i-5,j+6) * (-0.0007936507936507937d0) & + + in(i-4,j+6) * (-0.0012755102040816326d0) & + + in(i-3,j+6) * (-0.002380952380952381d0) & + + in(i-2,j+6) * (-0.005952380952380952d0) & + + in(i-1,j+6) * (-0.03571428571428571d0) & + + in(i+1,j+6) * (0.0005411255411255411d0) & + + in(i+2,j+6) * (0.0005411255411255411d0) & + + in(i+3,j+6) * (0.0005411255411255411d0) & + + in(i+4,j+6) * (0.0005411255411255411d0) & + + in(i+5,j+6) * (0.0005411255411255411d0) & + + in(i+6,j+6) * 
(0.005952380952380952d0) & + + in(i+7,j+6) * (0.0003924646781789639d0) & + + in(i-7,j+7) * (-0.0003924646781789639d0) & + + in(i-6,j+7) * (-0.0005411255411255411d0) & + + in(i-5,j+7) * (-0.0007936507936507937d0) & + + in(i-4,j+7) * (-0.0012755102040816326d0) & + + in(i-3,j+7) * (-0.002380952380952381d0) & + + in(i-2,j+7) * (-0.005952380952380952d0) & + + in(i-1,j+7) * (-0.03571428571428571d0) & + + in(i+1,j+7) * (0.0003924646781789639d0) & + + in(i+2,j+7) * (0.0003924646781789639d0) & + + in(i+3,j+7) * (0.0003924646781789639d0) & + + in(i+4,j+7) * (0.0003924646781789639d0) & + + in(i+5,j+7) * (0.0003924646781789639d0) & + + in(i+6,j+7) * (0.0003924646781789639d0) & + + in(i+7,j+7) * (0.00510204081632653d0) & +0.0 end do !$omp end simd @@ -942,206 +942,206 @@ subroutine grid8(n, in, out) !$omp simd do j=8,n-8-1 out(i,j) = out(i,j) & - + in(i-8,j-8) * (-0.00390625) & - + in(i+1,j-8) * (-0.00026041666666666666) & - + in(i+2,j-8) * (-0.00026041666666666666) & - + in(i+3,j-8) * (-0.00026041666666666666) & - + in(i+4,j-8) * (-0.00026041666666666666) & - + in(i+5,j-8) * (-0.00026041666666666666) & - + in(i+6,j-8) * (-0.00026041666666666666) & - + in(i+7,j-8) * (-0.00026041666666666666) & - + in(i+8,j-8) * (-0.00026041666666666666) & - + in(i-7,j-7) * (-0.004464285714285714) & - + in(i+1,j-7) * (-0.00034340659340659343) & - + in(i+2,j-7) * (-0.00034340659340659343) & - + in(i+3,j-7) * (-0.00034340659340659343) & - + in(i+4,j-7) * (-0.00034340659340659343) & - + in(i+5,j-7) * (-0.00034340659340659343) & - + in(i+6,j-7) * (-0.00034340659340659343) & - + in(i+7,j-7) * (-0.00034340659340659343) & - + in(i+8,j-7) * (-0.00034340659340659343) & - + in(i-6,j-6) * (-0.005208333333333333) & - + in(i+1,j-6) * (-0.0004734848484848485) & - + in(i+2,j-6) * (-0.0004734848484848485) & - + in(i+3,j-6) * (-0.0004734848484848485) & - + in(i+4,j-6) * (-0.0004734848484848485) & - + in(i+5,j-6) * (-0.0004734848484848485) & - + in(i+6,j-6) * (-0.0004734848484848485) & - + in(i+7,j-6) * 
(-0.0004734848484848485) & - + in(i+8,j-6) * (-0.0004734848484848485) & - + in(i-5,j-5) * (-0.00625) & - + in(i+1,j-5) * (-0.0006944444444444445) & - + in(i+2,j-5) * (-0.0006944444444444445) & - + in(i+3,j-5) * (-0.0006944444444444445) & - + in(i+4,j-5) * (-0.0006944444444444445) & - + in(i+5,j-5) * (-0.0006944444444444445) & - + in(i+6,j-5) * (-0.0006944444444444445) & - + in(i+7,j-5) * (-0.0006944444444444445) & - + in(i+8,j-5) * (-0.0006944444444444445) & - + in(i-4,j-4) * (-0.0078125) & - + in(i+1,j-4) * (-0.0011160714285714285) & - + in(i+2,j-4) * (-0.0011160714285714285) & - + in(i+3,j-4) * (-0.0011160714285714285) & - + in(i+4,j-4) * (-0.0011160714285714285) & - + in(i+5,j-4) * (-0.0011160714285714285) & - + in(i+6,j-4) * (-0.0011160714285714285) & - + in(i+7,j-4) * (-0.0011160714285714285) & - + in(i+8,j-4) * (-0.0011160714285714285) & - + in(i-3,j-3) * (-0.010416666666666666) & - + in(i+1,j-3) * (-0.0020833333333333333) & - + in(i+2,j-3) * (-0.0020833333333333333) & - + in(i+3,j-3) * (-0.0020833333333333333) & - + in(i+4,j-3) * (-0.0020833333333333333) & - + in(i+5,j-3) * (-0.0020833333333333333) & - + in(i+6,j-3) * (-0.0020833333333333333) & - + in(i+7,j-3) * (-0.0020833333333333333) & - + in(i+8,j-3) * (-0.0020833333333333333) & - + in(i-2,j-2) * (-0.015625) & - + in(i+1,j-2) * (-0.005208333333333333) & - + in(i+2,j-2) * (-0.005208333333333333) & - + in(i+3,j-2) * (-0.005208333333333333) & - + in(i+4,j-2) * (-0.005208333333333333) & - + in(i+5,j-2) * (-0.005208333333333333) & - + in(i+6,j-2) * (-0.005208333333333333) & - + in(i+7,j-2) * (-0.005208333333333333) & - + in(i+8,j-2) * (-0.005208333333333333) & - + in(i-1,j-1) * (-0.03125) & - + in(i+1,j-1) * (-0.03125) & - + in(i+2,j-1) * (-0.03125) & - + in(i+3,j-1) * (-0.03125) & - + in(i+4,j-1) * (-0.03125) & - + in(i+5,j-1) * (-0.03125) & - + in(i+6,j-1) * (-0.03125) & - + in(i+7,j-1) * (-0.03125) & - + in(i+8,j-1) * (-0.03125) & - + in(i-8,j+1) * (-0.00026041666666666666) & - + in(i-7,j+1) * 
(-0.00034340659340659343) & - + in(i-6,j+1) * (-0.0004734848484848485) & - + in(i-5,j+1) * (-0.0006944444444444445) & - + in(i-4,j+1) * (-0.0011160714285714285) & - + in(i-3,j+1) * (-0.0020833333333333333) & - + in(i-2,j+1) * (-0.005208333333333333) & - + in(i-1,j+1) * (-0.03125) & - + in(i+1,j+1) * (0.03125) & - + in(i+2,j+1) * (0.005208333333333333) & - + in(i+3,j+1) * (0.0020833333333333333) & - + in(i+4,j+1) * (0.0011160714285714285) & - + in(i+5,j+1) * (0.0006944444444444445) & - + in(i+6,j+1) * (0.0004734848484848485) & - + in(i+7,j+1) * (0.00034340659340659343) & - + in(i+8,j+1) * (0.00026041666666666666) & - + in(i-8,j+2) * (-0.00026041666666666666) & - + in(i-7,j+2) * (-0.00034340659340659343) & - + in(i-6,j+2) * (-0.0004734848484848485) & - + in(i-5,j+2) * (-0.0006944444444444445) & - + in(i-4,j+2) * (-0.0011160714285714285) & - + in(i-3,j+2) * (-0.0020833333333333333) & - + in(i-2,j+2) * (-0.005208333333333333) & - + in(i-1,j+2) * (-0.03125) & - + in(i+1,j+2) * (0.005208333333333333) & - + in(i+2,j+2) * (0.015625) & - + in(i+3,j+2) * (0.0020833333333333333) & - + in(i+4,j+2) * (0.0011160714285714285) & - + in(i+5,j+2) * (0.0006944444444444445) & - + in(i+6,j+2) * (0.0004734848484848485) & - + in(i+7,j+2) * (0.00034340659340659343) & - + in(i+8,j+2) * (0.00026041666666666666) & - + in(i-8,j+3) * (-0.00026041666666666666) & - + in(i-7,j+3) * (-0.00034340659340659343) & - + in(i-6,j+3) * (-0.0004734848484848485) & - + in(i-5,j+3) * (-0.0006944444444444445) & - + in(i-4,j+3) * (-0.0011160714285714285) & - + in(i-3,j+3) * (-0.0020833333333333333) & - + in(i-2,j+3) * (-0.005208333333333333) & - + in(i-1,j+3) * (-0.03125) & - + in(i+1,j+3) * (0.0020833333333333333) & - + in(i+2,j+3) * (0.0020833333333333333) & - + in(i+3,j+3) * (0.010416666666666666) & - + in(i+4,j+3) * (0.0011160714285714285) & - + in(i+5,j+3) * (0.0006944444444444445) & - + in(i+6,j+3) * (0.0004734848484848485) & - + in(i+7,j+3) * (0.00034340659340659343) & - + in(i+8,j+3) * 
(0.00026041666666666666) & - + in(i-8,j+4) * (-0.00026041666666666666) & - + in(i-7,j+4) * (-0.00034340659340659343) & - + in(i-6,j+4) * (-0.0004734848484848485) & - + in(i-5,j+4) * (-0.0006944444444444445) & - + in(i-4,j+4) * (-0.0011160714285714285) & - + in(i-3,j+4) * (-0.0020833333333333333) & - + in(i-2,j+4) * (-0.005208333333333333) & - + in(i-1,j+4) * (-0.03125) & - + in(i+1,j+4) * (0.0011160714285714285) & - + in(i+2,j+4) * (0.0011160714285714285) & - + in(i+3,j+4) * (0.0011160714285714285) & - + in(i+4,j+4) * (0.0078125) & - + in(i+5,j+4) * (0.0006944444444444445) & - + in(i+6,j+4) * (0.0004734848484848485) & - + in(i+7,j+4) * (0.00034340659340659343) & - + in(i+8,j+4) * (0.00026041666666666666) & - + in(i-8,j+5) * (-0.00026041666666666666) & - + in(i-7,j+5) * (-0.00034340659340659343) & - + in(i-6,j+5) * (-0.0004734848484848485) & - + in(i-5,j+5) * (-0.0006944444444444445) & - + in(i-4,j+5) * (-0.0011160714285714285) & - + in(i-3,j+5) * (-0.0020833333333333333) & - + in(i-2,j+5) * (-0.005208333333333333) & - + in(i-1,j+5) * (-0.03125) & - + in(i+1,j+5) * (0.0006944444444444445) & - + in(i+2,j+5) * (0.0006944444444444445) & - + in(i+3,j+5) * (0.0006944444444444445) & - + in(i+4,j+5) * (0.0006944444444444445) & - + in(i+5,j+5) * (0.00625) & - + in(i+6,j+5) * (0.0004734848484848485) & - + in(i+7,j+5) * (0.00034340659340659343) & - + in(i+8,j+5) * (0.00026041666666666666) & - + in(i-8,j+6) * (-0.00026041666666666666) & - + in(i-7,j+6) * (-0.00034340659340659343) & - + in(i-6,j+6) * (-0.0004734848484848485) & - + in(i-5,j+6) * (-0.0006944444444444445) & - + in(i-4,j+6) * (-0.0011160714285714285) & - + in(i-3,j+6) * (-0.0020833333333333333) & - + in(i-2,j+6) * (-0.005208333333333333) & - + in(i-1,j+6) * (-0.03125) & - + in(i+1,j+6) * (0.0004734848484848485) & - + in(i+2,j+6) * (0.0004734848484848485) & - + in(i+3,j+6) * (0.0004734848484848485) & - + in(i+4,j+6) * (0.0004734848484848485) & - + in(i+5,j+6) * (0.0004734848484848485) & - + in(i+6,j+6) * 
(0.005208333333333333) & - + in(i+7,j+6) * (0.00034340659340659343) & - + in(i+8,j+6) * (0.00026041666666666666) & - + in(i-8,j+7) * (-0.00026041666666666666) & - + in(i-7,j+7) * (-0.00034340659340659343) & - + in(i-6,j+7) * (-0.0004734848484848485) & - + in(i-5,j+7) * (-0.0006944444444444445) & - + in(i-4,j+7) * (-0.0011160714285714285) & - + in(i-3,j+7) * (-0.0020833333333333333) & - + in(i-2,j+7) * (-0.005208333333333333) & - + in(i-1,j+7) * (-0.03125) & - + in(i+1,j+7) * (0.00034340659340659343) & - + in(i+2,j+7) * (0.00034340659340659343) & - + in(i+3,j+7) * (0.00034340659340659343) & - + in(i+4,j+7) * (0.00034340659340659343) & - + in(i+5,j+7) * (0.00034340659340659343) & - + in(i+6,j+7) * (0.00034340659340659343) & - + in(i+7,j+7) * (0.004464285714285714) & - + in(i+8,j+7) * (0.00026041666666666666) & - + in(i-8,j+8) * (-0.00026041666666666666) & - + in(i-7,j+8) * (-0.00034340659340659343) & - + in(i-6,j+8) * (-0.0004734848484848485) & - + in(i-5,j+8) * (-0.0006944444444444445) & - + in(i-4,j+8) * (-0.0011160714285714285) & - + in(i-3,j+8) * (-0.0020833333333333333) & - + in(i-2,j+8) * (-0.005208333333333333) & - + in(i-1,j+8) * (-0.03125) & - + in(i+1,j+8) * (0.00026041666666666666) & - + in(i+2,j+8) * (0.00026041666666666666) & - + in(i+3,j+8) * (0.00026041666666666666) & - + in(i+4,j+8) * (0.00026041666666666666) & - + in(i+5,j+8) * (0.00026041666666666666) & - + in(i+6,j+8) * (0.00026041666666666666) & - + in(i+7,j+8) * (0.00026041666666666666) & - + in(i+8,j+8) * (0.00390625) & + + in(i-8,j-8) * (-0.00390625d0) & + + in(i+1,j-8) * (-0.00026041666666666666d0) & + + in(i+2,j-8) * (-0.00026041666666666666d0) & + + in(i+3,j-8) * (-0.00026041666666666666d0) & + + in(i+4,j-8) * (-0.00026041666666666666d0) & + + in(i+5,j-8) * (-0.00026041666666666666d0) & + + in(i+6,j-8) * (-0.00026041666666666666d0) & + + in(i+7,j-8) * (-0.00026041666666666666d0) & + + in(i+8,j-8) * (-0.00026041666666666666d0) & + + in(i-7,j-7) * (-0.004464285714285714d0) & + + in(i+1,j-7) * 
(-0.00034340659340659343d0) & + + in(i+2,j-7) * (-0.00034340659340659343d0) & + + in(i+3,j-7) * (-0.00034340659340659343d0) & + + in(i+4,j-7) * (-0.00034340659340659343d0) & + + in(i+5,j-7) * (-0.00034340659340659343d0) & + + in(i+6,j-7) * (-0.00034340659340659343d0) & + + in(i+7,j-7) * (-0.00034340659340659343d0) & + + in(i+8,j-7) * (-0.00034340659340659343d0) & + + in(i-6,j-6) * (-0.005208333333333333d0) & + + in(i+1,j-6) * (-0.0004734848484848485d0) & + + in(i+2,j-6) * (-0.0004734848484848485d0) & + + in(i+3,j-6) * (-0.0004734848484848485d0) & + + in(i+4,j-6) * (-0.0004734848484848485d0) & + + in(i+5,j-6) * (-0.0004734848484848485d0) & + + in(i+6,j-6) * (-0.0004734848484848485d0) & + + in(i+7,j-6) * (-0.0004734848484848485d0) & + + in(i+8,j-6) * (-0.0004734848484848485d0) & + + in(i-5,j-5) * (-0.00625d0) & + + in(i+1,j-5) * (-0.0006944444444444445d0) & + + in(i+2,j-5) * (-0.0006944444444444445d0) & + + in(i+3,j-5) * (-0.0006944444444444445d0) & + + in(i+4,j-5) * (-0.0006944444444444445d0) & + + in(i+5,j-5) * (-0.0006944444444444445d0) & + + in(i+6,j-5) * (-0.0006944444444444445d0) & + + in(i+7,j-5) * (-0.0006944444444444445d0) & + + in(i+8,j-5) * (-0.0006944444444444445d0) & + + in(i-4,j-4) * (-0.0078125d0) & + + in(i+1,j-4) * (-0.0011160714285714285d0) & + + in(i+2,j-4) * (-0.0011160714285714285d0) & + + in(i+3,j-4) * (-0.0011160714285714285d0) & + + in(i+4,j-4) * (-0.0011160714285714285d0) & + + in(i+5,j-4) * (-0.0011160714285714285d0) & + + in(i+6,j-4) * (-0.0011160714285714285d0) & + + in(i+7,j-4) * (-0.0011160714285714285d0) & + + in(i+8,j-4) * (-0.0011160714285714285d0) & + + in(i-3,j-3) * (-0.010416666666666666d0) & + + in(i+1,j-3) * (-0.0020833333333333333d0) & + + in(i+2,j-3) * (-0.0020833333333333333d0) & + + in(i+3,j-3) * (-0.0020833333333333333d0) & + + in(i+4,j-3) * (-0.0020833333333333333d0) & + + in(i+5,j-3) * (-0.0020833333333333333d0) & + + in(i+6,j-3) * (-0.0020833333333333333d0) & + + in(i+7,j-3) * (-0.0020833333333333333d0) & + + in(i+8,j-3) 
* (-0.0020833333333333333d0) & + + in(i-2,j-2) * (-0.015625d0) & + + in(i+1,j-2) * (-0.005208333333333333d0) & + + in(i+2,j-2) * (-0.005208333333333333d0) & + + in(i+3,j-2) * (-0.005208333333333333d0) & + + in(i+4,j-2) * (-0.005208333333333333d0) & + + in(i+5,j-2) * (-0.005208333333333333d0) & + + in(i+6,j-2) * (-0.005208333333333333d0) & + + in(i+7,j-2) * (-0.005208333333333333d0) & + + in(i+8,j-2) * (-0.005208333333333333d0) & + + in(i-1,j-1) * (-0.03125d0) & + + in(i+1,j-1) * (-0.03125d0) & + + in(i+2,j-1) * (-0.03125d0) & + + in(i+3,j-1) * (-0.03125d0) & + + in(i+4,j-1) * (-0.03125d0) & + + in(i+5,j-1) * (-0.03125d0) & + + in(i+6,j-1) * (-0.03125d0) & + + in(i+7,j-1) * (-0.03125d0) & + + in(i+8,j-1) * (-0.03125d0) & + + in(i-8,j+1) * (-0.00026041666666666666d0) & + + in(i-7,j+1) * (-0.00034340659340659343d0) & + + in(i-6,j+1) * (-0.0004734848484848485d0) & + + in(i-5,j+1) * (-0.0006944444444444445d0) & + + in(i-4,j+1) * (-0.0011160714285714285d0) & + + in(i-3,j+1) * (-0.0020833333333333333d0) & + + in(i-2,j+1) * (-0.005208333333333333d0) & + + in(i-1,j+1) * (-0.03125d0) & + + in(i+1,j+1) * (0.03125d0) & + + in(i+2,j+1) * (0.005208333333333333d0) & + + in(i+3,j+1) * (0.0020833333333333333d0) & + + in(i+4,j+1) * (0.0011160714285714285d0) & + + in(i+5,j+1) * (0.0006944444444444445d0) & + + in(i+6,j+1) * (0.0004734848484848485d0) & + + in(i+7,j+1) * (0.00034340659340659343d0) & + + in(i+8,j+1) * (0.00026041666666666666d0) & + + in(i-8,j+2) * (-0.00026041666666666666d0) & + + in(i-7,j+2) * (-0.00034340659340659343d0) & + + in(i-6,j+2) * (-0.0004734848484848485d0) & + + in(i-5,j+2) * (-0.0006944444444444445d0) & + + in(i-4,j+2) * (-0.0011160714285714285d0) & + + in(i-3,j+2) * (-0.0020833333333333333d0) & + + in(i-2,j+2) * (-0.005208333333333333d0) & + + in(i-1,j+2) * (-0.03125d0) & + + in(i+1,j+2) * (0.005208333333333333d0) & + + in(i+2,j+2) * (0.015625d0) & + + in(i+3,j+2) * (0.0020833333333333333d0) & + + in(i+4,j+2) * (0.0011160714285714285d0) & + + in(i+5,j+2) * 
(0.0006944444444444445d0) & + + in(i+6,j+2) * (0.0004734848484848485d0) & + + in(i+7,j+2) * (0.00034340659340659343d0) & + + in(i+8,j+2) * (0.00026041666666666666d0) & + + in(i-8,j+3) * (-0.00026041666666666666d0) & + + in(i-7,j+3) * (-0.00034340659340659343d0) & + + in(i-6,j+3) * (-0.0004734848484848485d0) & + + in(i-5,j+3) * (-0.0006944444444444445d0) & + + in(i-4,j+3) * (-0.0011160714285714285d0) & + + in(i-3,j+3) * (-0.0020833333333333333d0) & + + in(i-2,j+3) * (-0.005208333333333333d0) & + + in(i-1,j+3) * (-0.03125d0) & + + in(i+1,j+3) * (0.0020833333333333333d0) & + + in(i+2,j+3) * (0.0020833333333333333d0) & + + in(i+3,j+3) * (0.010416666666666666d0) & + + in(i+4,j+3) * (0.0011160714285714285d0) & + + in(i+5,j+3) * (0.0006944444444444445d0) & + + in(i+6,j+3) * (0.0004734848484848485d0) & + + in(i+7,j+3) * (0.00034340659340659343d0) & + + in(i+8,j+3) * (0.00026041666666666666d0) & + + in(i-8,j+4) * (-0.00026041666666666666d0) & + + in(i-7,j+4) * (-0.00034340659340659343d0) & + + in(i-6,j+4) * (-0.0004734848484848485d0) & + + in(i-5,j+4) * (-0.0006944444444444445d0) & + + in(i-4,j+4) * (-0.0011160714285714285d0) & + + in(i-3,j+4) * (-0.0020833333333333333d0) & + + in(i-2,j+4) * (-0.005208333333333333d0) & + + in(i-1,j+4) * (-0.03125d0) & + + in(i+1,j+4) * (0.0011160714285714285d0) & + + in(i+2,j+4) * (0.0011160714285714285d0) & + + in(i+3,j+4) * (0.0011160714285714285d0) & + + in(i+4,j+4) * (0.0078125d0) & + + in(i+5,j+4) * (0.0006944444444444445d0) & + + in(i+6,j+4) * (0.0004734848484848485d0) & + + in(i+7,j+4) * (0.00034340659340659343d0) & + + in(i+8,j+4) * (0.00026041666666666666d0) & + + in(i-8,j+5) * (-0.00026041666666666666d0) & + + in(i-7,j+5) * (-0.00034340659340659343d0) & + + in(i-6,j+5) * (-0.0004734848484848485d0) & + + in(i-5,j+5) * (-0.0006944444444444445d0) & + + in(i-4,j+5) * (-0.0011160714285714285d0) & + + in(i-3,j+5) * (-0.0020833333333333333d0) & + + in(i-2,j+5) * (-0.005208333333333333d0) & + + in(i-1,j+5) * (-0.03125d0) & + + in(i+1,j+5) 
* (0.0006944444444444445d0) & + + in(i+2,j+5) * (0.0006944444444444445d0) & + + in(i+3,j+5) * (0.0006944444444444445d0) & + + in(i+4,j+5) * (0.0006944444444444445d0) & + + in(i+5,j+5) * (0.00625d0) & + + in(i+6,j+5) * (0.0004734848484848485d0) & + + in(i+7,j+5) * (0.00034340659340659343d0) & + + in(i+8,j+5) * (0.00026041666666666666d0) & + + in(i-8,j+6) * (-0.00026041666666666666d0) & + + in(i-7,j+6) * (-0.00034340659340659343d0) & + + in(i-6,j+6) * (-0.0004734848484848485d0) & + + in(i-5,j+6) * (-0.0006944444444444445d0) & + + in(i-4,j+6) * (-0.0011160714285714285d0) & + + in(i-3,j+6) * (-0.0020833333333333333d0) & + + in(i-2,j+6) * (-0.005208333333333333d0) & + + in(i-1,j+6) * (-0.03125d0) & + + in(i+1,j+6) * (0.0004734848484848485d0) & + + in(i+2,j+6) * (0.0004734848484848485d0) & + + in(i+3,j+6) * (0.0004734848484848485d0) & + + in(i+4,j+6) * (0.0004734848484848485d0) & + + in(i+5,j+6) * (0.0004734848484848485d0) & + + in(i+6,j+6) * (0.005208333333333333d0) & + + in(i+7,j+6) * (0.00034340659340659343d0) & + + in(i+8,j+6) * (0.00026041666666666666d0) & + + in(i-8,j+7) * (-0.00026041666666666666d0) & + + in(i-7,j+7) * (-0.00034340659340659343d0) & + + in(i-6,j+7) * (-0.0004734848484848485d0) & + + in(i-5,j+7) * (-0.0006944444444444445d0) & + + in(i-4,j+7) * (-0.0011160714285714285d0) & + + in(i-3,j+7) * (-0.0020833333333333333d0) & + + in(i-2,j+7) * (-0.005208333333333333d0) & + + in(i-1,j+7) * (-0.03125d0) & + + in(i+1,j+7) * (0.00034340659340659343d0) & + + in(i+2,j+7) * (0.00034340659340659343d0) & + + in(i+3,j+7) * (0.00034340659340659343d0) & + + in(i+4,j+7) * (0.00034340659340659343d0) & + + in(i+5,j+7) * (0.00034340659340659343d0) & + + in(i+6,j+7) * (0.00034340659340659343d0) & + + in(i+7,j+7) * (0.004464285714285714d0) & + + in(i+8,j+7) * (0.00026041666666666666d0) & + + in(i-8,j+8) * (-0.00026041666666666666d0) & + + in(i-7,j+8) * (-0.00034340659340659343d0) & + + in(i-6,j+8) * (-0.0004734848484848485d0) & + + in(i-5,j+8) * (-0.0006944444444444445d0) & 
+ + in(i-4,j+8) * (-0.0011160714285714285d0) & + + in(i-3,j+8) * (-0.0020833333333333333d0) & + + in(i-2,j+8) * (-0.005208333333333333d0) & + + in(i-1,j+8) * (-0.03125d0) & + + in(i+1,j+8) * (0.00026041666666666666d0) & + + in(i+2,j+8) * (0.00026041666666666666d0) & + + in(i+3,j+8) * (0.00026041666666666666d0) & + + in(i+4,j+8) * (0.00026041666666666666d0) & + + in(i+5,j+8) * (0.00026041666666666666d0) & + + in(i+6,j+8) * (0.00026041666666666666d0) & + + in(i+7,j+8) * (0.00026041666666666666d0) & + + in(i+8,j+8) * (0.00390625d0) & +0.0 end do !$omp end simd @@ -1161,258 +1161,258 @@ subroutine grid9(n, in, out) !$omp simd do j=9,n-9-1 out(i,j) = out(i,j) & - + in(i-9,j-9) * (-0.0030864197530864196) & - + in(i+1,j-9) * (-0.00018155410312273057) & - + in(i+2,j-9) * (-0.00018155410312273057) & - + in(i+3,j-9) * (-0.00018155410312273057) & - + in(i+4,j-9) * (-0.00018155410312273057) & - + in(i+5,j-9) * (-0.00018155410312273057) & - + in(i+6,j-9) * (-0.00018155410312273057) & - + in(i+7,j-9) * (-0.00018155410312273057) & - + in(i+8,j-9) * (-0.00018155410312273057) & - + in(i+9,j-9) * (-0.00018155410312273057) & - + in(i-8,j-8) * (-0.003472222222222222) & - + in(i+1,j-8) * (-0.0002314814814814815) & - + in(i+2,j-8) * (-0.0002314814814814815) & - + in(i+3,j-8) * (-0.0002314814814814815) & - + in(i+4,j-8) * (-0.0002314814814814815) & - + in(i+5,j-8) * (-0.0002314814814814815) & - + in(i+6,j-8) * (-0.0002314814814814815) & - + in(i+7,j-8) * (-0.0002314814814814815) & - + in(i+8,j-8) * (-0.0002314814814814815) & - + in(i+9,j-8) * (-0.0002314814814814815) & - + in(i-7,j-7) * (-0.003968253968253968) & - + in(i+1,j-7) * (-0.00030525030525030525) & - + in(i+2,j-7) * (-0.00030525030525030525) & - + in(i+3,j-7) * (-0.00030525030525030525) & - + in(i+4,j-7) * (-0.00030525030525030525) & - + in(i+5,j-7) * (-0.00030525030525030525) & - + in(i+6,j-7) * (-0.00030525030525030525) & - + in(i+7,j-7) * (-0.00030525030525030525) & - + in(i+8,j-7) * (-0.00030525030525030525) & - + 
in(i+9,j-7) * (-0.00030525030525030525) & - + in(i-6,j-6) * (-0.004629629629629629) & - + in(i+1,j-6) * (-0.00042087542087542086) & - + in(i+2,j-6) * (-0.00042087542087542086) & - + in(i+3,j-6) * (-0.00042087542087542086) & - + in(i+4,j-6) * (-0.00042087542087542086) & - + in(i+5,j-6) * (-0.00042087542087542086) & - + in(i+6,j-6) * (-0.00042087542087542086) & - + in(i+7,j-6) * (-0.00042087542087542086) & - + in(i+8,j-6) * (-0.00042087542087542086) & - + in(i+9,j-6) * (-0.00042087542087542086) & - + in(i-5,j-5) * (-0.005555555555555556) & - + in(i+1,j-5) * (-0.0006172839506172839) & - + in(i+2,j-5) * (-0.0006172839506172839) & - + in(i+3,j-5) * (-0.0006172839506172839) & - + in(i+4,j-5) * (-0.0006172839506172839) & - + in(i+5,j-5) * (-0.0006172839506172839) & - + in(i+6,j-5) * (-0.0006172839506172839) & - + in(i+7,j-5) * (-0.0006172839506172839) & - + in(i+8,j-5) * (-0.0006172839506172839) & - + in(i+9,j-5) * (-0.0006172839506172839) & - + in(i-4,j-4) * (-0.006944444444444444) & - + in(i+1,j-4) * (-0.000992063492063492) & - + in(i+2,j-4) * (-0.000992063492063492) & - + in(i+3,j-4) * (-0.000992063492063492) & - + in(i+4,j-4) * (-0.000992063492063492) & - + in(i+5,j-4) * (-0.000992063492063492) & - + in(i+6,j-4) * (-0.000992063492063492) & - + in(i+7,j-4) * (-0.000992063492063492) & - + in(i+8,j-4) * (-0.000992063492063492) & - + in(i+9,j-4) * (-0.000992063492063492) & - + in(i-3,j-3) * (-0.009259259259259259) & - + in(i+1,j-3) * (-0.001851851851851852) & - + in(i+2,j-3) * (-0.001851851851851852) & - + in(i+3,j-3) * (-0.001851851851851852) & - + in(i+4,j-3) * (-0.001851851851851852) & - + in(i+5,j-3) * (-0.001851851851851852) & - + in(i+6,j-3) * (-0.001851851851851852) & - + in(i+7,j-3) * (-0.001851851851851852) & - + in(i+8,j-3) * (-0.001851851851851852) & - + in(i+9,j-3) * (-0.001851851851851852) & - + in(i-2,j-2) * (-0.013888888888888888) & - + in(i+1,j-2) * (-0.004629629629629629) & - + in(i+2,j-2) * (-0.004629629629629629) & - + in(i+3,j-2) * 
(-0.004629629629629629) & - + in(i+4,j-2) * (-0.004629629629629629) & - + in(i+5,j-2) * (-0.004629629629629629) & - + in(i+6,j-2) * (-0.004629629629629629) & - + in(i+7,j-2) * (-0.004629629629629629) & - + in(i+8,j-2) * (-0.004629629629629629) & - + in(i+9,j-2) * (-0.004629629629629629) & - + in(i-1,j-1) * (-0.027777777777777776) & - + in(i+1,j-1) * (-0.027777777777777776) & - + in(i+2,j-1) * (-0.027777777777777776) & - + in(i+3,j-1) * (-0.027777777777777776) & - + in(i+4,j-1) * (-0.027777777777777776) & - + in(i+5,j-1) * (-0.027777777777777776) & - + in(i+6,j-1) * (-0.027777777777777776) & - + in(i+7,j-1) * (-0.027777777777777776) & - + in(i+8,j-1) * (-0.027777777777777776) & - + in(i+9,j-1) * (-0.027777777777777776) & - + in(i-9,j+1) * (-0.00018155410312273057) & - + in(i-8,j+1) * (-0.0002314814814814815) & - + in(i-7,j+1) * (-0.00030525030525030525) & - + in(i-6,j+1) * (-0.00042087542087542086) & - + in(i-5,j+1) * (-0.0006172839506172839) & - + in(i-4,j+1) * (-0.000992063492063492) & - + in(i-3,j+1) * (-0.001851851851851852) & - + in(i-2,j+1) * (-0.004629629629629629) & - + in(i-1,j+1) * (-0.027777777777777776) & - + in(i+1,j+1) * (0.027777777777777776) & - + in(i+2,j+1) * (0.004629629629629629) & - + in(i+3,j+1) * (0.001851851851851852) & - + in(i+4,j+1) * (0.000992063492063492) & - + in(i+5,j+1) * (0.0006172839506172839) & - + in(i+6,j+1) * (0.00042087542087542086) & - + in(i+7,j+1) * (0.00030525030525030525) & - + in(i+8,j+1) * (0.0002314814814814815) & - + in(i+9,j+1) * (0.00018155410312273057) & - + in(i-9,j+2) * (-0.00018155410312273057) & - + in(i-8,j+2) * (-0.0002314814814814815) & - + in(i-7,j+2) * (-0.00030525030525030525) & - + in(i-6,j+2) * (-0.00042087542087542086) & - + in(i-5,j+2) * (-0.0006172839506172839) & - + in(i-4,j+2) * (-0.000992063492063492) & - + in(i-3,j+2) * (-0.001851851851851852) & - + in(i-2,j+2) * (-0.004629629629629629) & - + in(i-1,j+2) * (-0.027777777777777776) & - + in(i+1,j+2) * (0.004629629629629629) & - + in(i+2,j+2) * 
(0.013888888888888888) & - + in(i+3,j+2) * (0.001851851851851852) & - + in(i+4,j+2) * (0.000992063492063492) & - + in(i+5,j+2) * (0.0006172839506172839) & - + in(i+6,j+2) * (0.00042087542087542086) & - + in(i+7,j+2) * (0.00030525030525030525) & - + in(i+8,j+2) * (0.0002314814814814815) & - + in(i+9,j+2) * (0.00018155410312273057) & - + in(i-9,j+3) * (-0.00018155410312273057) & - + in(i-8,j+3) * (-0.0002314814814814815) & - + in(i-7,j+3) * (-0.00030525030525030525) & - + in(i-6,j+3) * (-0.00042087542087542086) & - + in(i-5,j+3) * (-0.0006172839506172839) & - + in(i-4,j+3) * (-0.000992063492063492) & - + in(i-3,j+3) * (-0.001851851851851852) & - + in(i-2,j+3) * (-0.004629629629629629) & - + in(i-1,j+3) * (-0.027777777777777776) & - + in(i+1,j+3) * (0.001851851851851852) & - + in(i+2,j+3) * (0.001851851851851852) & - + in(i+3,j+3) * (0.009259259259259259) & - + in(i+4,j+3) * (0.000992063492063492) & - + in(i+5,j+3) * (0.0006172839506172839) & - + in(i+6,j+3) * (0.00042087542087542086) & - + in(i+7,j+3) * (0.00030525030525030525) & - + in(i+8,j+3) * (0.0002314814814814815) & - + in(i+9,j+3) * (0.00018155410312273057) & - + in(i-9,j+4) * (-0.00018155410312273057) & - + in(i-8,j+4) * (-0.0002314814814814815) & - + in(i-7,j+4) * (-0.00030525030525030525) & - + in(i-6,j+4) * (-0.00042087542087542086) & - + in(i-5,j+4) * (-0.0006172839506172839) & - + in(i-4,j+4) * (-0.000992063492063492) & - + in(i-3,j+4) * (-0.001851851851851852) & - + in(i-2,j+4) * (-0.004629629629629629) & - + in(i-1,j+4) * (-0.027777777777777776) & - + in(i+1,j+4) * (0.000992063492063492) & - + in(i+2,j+4) * (0.000992063492063492) & - + in(i+3,j+4) * (0.000992063492063492) & - + in(i+4,j+4) * (0.006944444444444444) & - + in(i+5,j+4) * (0.0006172839506172839) & - + in(i+6,j+4) * (0.00042087542087542086) & - + in(i+7,j+4) * (0.00030525030525030525) & - + in(i+8,j+4) * (0.0002314814814814815) & - + in(i+9,j+4) * (0.00018155410312273057) & - + in(i-9,j+5) * (-0.00018155410312273057) & - + in(i-8,j+5) * 
(-0.0002314814814814815) & - + in(i-7,j+5) * (-0.00030525030525030525) & - + in(i-6,j+5) * (-0.00042087542087542086) & - + in(i-5,j+5) * (-0.0006172839506172839) & - + in(i-4,j+5) * (-0.000992063492063492) & - + in(i-3,j+5) * (-0.001851851851851852) & - + in(i-2,j+5) * (-0.004629629629629629) & - + in(i-1,j+5) * (-0.027777777777777776) & - + in(i+1,j+5) * (0.0006172839506172839) & - + in(i+2,j+5) * (0.0006172839506172839) & - + in(i+3,j+5) * (0.0006172839506172839) & - + in(i+4,j+5) * (0.0006172839506172839) & - + in(i+5,j+5) * (0.005555555555555556) & - + in(i+6,j+5) * (0.00042087542087542086) & - + in(i+7,j+5) * (0.00030525030525030525) & - + in(i+8,j+5) * (0.0002314814814814815) & - + in(i+9,j+5) * (0.00018155410312273057) & - + in(i-9,j+6) * (-0.00018155410312273057) & - + in(i-8,j+6) * (-0.0002314814814814815) & - + in(i-7,j+6) * (-0.00030525030525030525) & - + in(i-6,j+6) * (-0.00042087542087542086) & - + in(i-5,j+6) * (-0.0006172839506172839) & - + in(i-4,j+6) * (-0.000992063492063492) & - + in(i-3,j+6) * (-0.001851851851851852) & - + in(i-2,j+6) * (-0.004629629629629629) & - + in(i-1,j+6) * (-0.027777777777777776) & - + in(i+1,j+6) * (0.00042087542087542086) & - + in(i+2,j+6) * (0.00042087542087542086) & - + in(i+3,j+6) * (0.00042087542087542086) & - + in(i+4,j+6) * (0.00042087542087542086) & - + in(i+5,j+6) * (0.00042087542087542086) & - + in(i+6,j+6) * (0.004629629629629629) & - + in(i+7,j+6) * (0.00030525030525030525) & - + in(i+8,j+6) * (0.0002314814814814815) & - + in(i+9,j+6) * (0.00018155410312273057) & - + in(i-9,j+7) * (-0.00018155410312273057) & - + in(i-8,j+7) * (-0.0002314814814814815) & - + in(i-7,j+7) * (-0.00030525030525030525) & - + in(i-6,j+7) * (-0.00042087542087542086) & - + in(i-5,j+7) * (-0.0006172839506172839) & - + in(i-4,j+7) * (-0.000992063492063492) & - + in(i-3,j+7) * (-0.001851851851851852) & - + in(i-2,j+7) * (-0.004629629629629629) & - + in(i-1,j+7) * (-0.027777777777777776) & - + in(i+1,j+7) * (0.00030525030525030525) & - + 
in(i+2,j+7) * (0.00030525030525030525) & - + in(i+3,j+7) * (0.00030525030525030525) & - + in(i+4,j+7) * (0.00030525030525030525) & - + in(i+5,j+7) * (0.00030525030525030525) & - + in(i+6,j+7) * (0.00030525030525030525) & - + in(i+7,j+7) * (0.003968253968253968) & - + in(i+8,j+7) * (0.0002314814814814815) & - + in(i+9,j+7) * (0.00018155410312273057) & - + in(i-9,j+8) * (-0.00018155410312273057) & - + in(i-8,j+8) * (-0.0002314814814814815) & - + in(i-7,j+8) * (-0.00030525030525030525) & - + in(i-6,j+8) * (-0.00042087542087542086) & - + in(i-5,j+8) * (-0.0006172839506172839) & - + in(i-4,j+8) * (-0.000992063492063492) & - + in(i-3,j+8) * (-0.001851851851851852) & - + in(i-2,j+8) * (-0.004629629629629629) & - + in(i-1,j+8) * (-0.027777777777777776) & - + in(i+1,j+8) * (0.0002314814814814815) & - + in(i+2,j+8) * (0.0002314814814814815) & - + in(i+3,j+8) * (0.0002314814814814815) & - + in(i+4,j+8) * (0.0002314814814814815) & - + in(i+5,j+8) * (0.0002314814814814815) & - + in(i+6,j+8) * (0.0002314814814814815) & - + in(i+7,j+8) * (0.0002314814814814815) & - + in(i+8,j+8) * (0.003472222222222222) & - + in(i+9,j+8) * (0.00018155410312273057) & - + in(i-9,j+9) * (-0.00018155410312273057) & - + in(i-8,j+9) * (-0.0002314814814814815) & - + in(i-7,j+9) * (-0.00030525030525030525) & - + in(i-6,j+9) * (-0.00042087542087542086) & - + in(i-5,j+9) * (-0.0006172839506172839) & - + in(i-4,j+9) * (-0.000992063492063492) & - + in(i-3,j+9) * (-0.001851851851851852) & - + in(i-2,j+9) * (-0.004629629629629629) & - + in(i-1,j+9) * (-0.027777777777777776) & - + in(i+1,j+9) * (0.00018155410312273057) & - + in(i+2,j+9) * (0.00018155410312273057) & - + in(i+3,j+9) * (0.00018155410312273057) & - + in(i+4,j+9) * (0.00018155410312273057) & - + in(i+5,j+9) * (0.00018155410312273057) & - + in(i+6,j+9) * (0.00018155410312273057) & - + in(i+7,j+9) * (0.00018155410312273057) & - + in(i+8,j+9) * (0.00018155410312273057) & - + in(i+9,j+9) * (0.0030864197530864196) & + + in(i-9,j-9) * 
(-0.0030864197530864196d0) & + + in(i+1,j-9) * (-0.00018155410312273057d0) & + + in(i+2,j-9) * (-0.00018155410312273057d0) & + + in(i+3,j-9) * (-0.00018155410312273057d0) & + + in(i+4,j-9) * (-0.00018155410312273057d0) & + + in(i+5,j-9) * (-0.00018155410312273057d0) & + + in(i+6,j-9) * (-0.00018155410312273057d0) & + + in(i+7,j-9) * (-0.00018155410312273057d0) & + + in(i+8,j-9) * (-0.00018155410312273057d0) & + + in(i+9,j-9) * (-0.00018155410312273057d0) & + + in(i-8,j-8) * (-0.003472222222222222d0) & + + in(i+1,j-8) * (-0.0002314814814814815d0) & + + in(i+2,j-8) * (-0.0002314814814814815d0) & + + in(i+3,j-8) * (-0.0002314814814814815d0) & + + in(i+4,j-8) * (-0.0002314814814814815d0) & + + in(i+5,j-8) * (-0.0002314814814814815d0) & + + in(i+6,j-8) * (-0.0002314814814814815d0) & + + in(i+7,j-8) * (-0.0002314814814814815d0) & + + in(i+8,j-8) * (-0.0002314814814814815d0) & + + in(i+9,j-8) * (-0.0002314814814814815d0) & + + in(i-7,j-7) * (-0.003968253968253968d0) & + + in(i+1,j-7) * (-0.00030525030525030525d0) & + + in(i+2,j-7) * (-0.00030525030525030525d0) & + + in(i+3,j-7) * (-0.00030525030525030525d0) & + + in(i+4,j-7) * (-0.00030525030525030525d0) & + + in(i+5,j-7) * (-0.00030525030525030525d0) & + + in(i+6,j-7) * (-0.00030525030525030525d0) & + + in(i+7,j-7) * (-0.00030525030525030525d0) & + + in(i+8,j-7) * (-0.00030525030525030525d0) & + + in(i+9,j-7) * (-0.00030525030525030525d0) & + + in(i-6,j-6) * (-0.004629629629629629d0) & + + in(i+1,j-6) * (-0.00042087542087542086d0) & + + in(i+2,j-6) * (-0.00042087542087542086d0) & + + in(i+3,j-6) * (-0.00042087542087542086d0) & + + in(i+4,j-6) * (-0.00042087542087542086d0) & + + in(i+5,j-6) * (-0.00042087542087542086d0) & + + in(i+6,j-6) * (-0.00042087542087542086d0) & + + in(i+7,j-6) * (-0.00042087542087542086d0) & + + in(i+8,j-6) * (-0.00042087542087542086d0) & + + in(i+9,j-6) * (-0.00042087542087542086d0) & + + in(i-5,j-5) * (-0.005555555555555556d0) & + + in(i+1,j-5) * (-0.0006172839506172839d0) & + + in(i+2,j-5) * 
(-0.0006172839506172839d0) & + + in(i+3,j-5) * (-0.0006172839506172839d0) & + + in(i+4,j-5) * (-0.0006172839506172839d0) & + + in(i+5,j-5) * (-0.0006172839506172839d0) & + + in(i+6,j-5) * (-0.0006172839506172839d0) & + + in(i+7,j-5) * (-0.0006172839506172839d0) & + + in(i+8,j-5) * (-0.0006172839506172839d0) & + + in(i+9,j-5) * (-0.0006172839506172839d0) & + + in(i-4,j-4) * (-0.006944444444444444d0) & + + in(i+1,j-4) * (-0.000992063492063492d0) & + + in(i+2,j-4) * (-0.000992063492063492d0) & + + in(i+3,j-4) * (-0.000992063492063492d0) & + + in(i+4,j-4) * (-0.000992063492063492d0) & + + in(i+5,j-4) * (-0.000992063492063492d0) & + + in(i+6,j-4) * (-0.000992063492063492d0) & + + in(i+7,j-4) * (-0.000992063492063492d0) & + + in(i+8,j-4) * (-0.000992063492063492d0) & + + in(i+9,j-4) * (-0.000992063492063492d0) & + + in(i-3,j-3) * (-0.009259259259259259d0) & + + in(i+1,j-3) * (-0.001851851851851852d0) & + + in(i+2,j-3) * (-0.001851851851851852d0) & + + in(i+3,j-3) * (-0.001851851851851852d0) & + + in(i+4,j-3) * (-0.001851851851851852d0) & + + in(i+5,j-3) * (-0.001851851851851852d0) & + + in(i+6,j-3) * (-0.001851851851851852d0) & + + in(i+7,j-3) * (-0.001851851851851852d0) & + + in(i+8,j-3) * (-0.001851851851851852d0) & + + in(i+9,j-3) * (-0.001851851851851852d0) & + + in(i-2,j-2) * (-0.013888888888888888d0) & + + in(i+1,j-2) * (-0.004629629629629629d0) & + + in(i+2,j-2) * (-0.004629629629629629d0) & + + in(i+3,j-2) * (-0.004629629629629629d0) & + + in(i+4,j-2) * (-0.004629629629629629d0) & + + in(i+5,j-2) * (-0.004629629629629629d0) & + + in(i+6,j-2) * (-0.004629629629629629d0) & + + in(i+7,j-2) * (-0.004629629629629629d0) & + + in(i+8,j-2) * (-0.004629629629629629d0) & + + in(i+9,j-2) * (-0.004629629629629629d0) & + + in(i-1,j-1) * (-0.027777777777777776d0) & + + in(i+1,j-1) * (-0.027777777777777776d0) & + + in(i+2,j-1) * (-0.027777777777777776d0) & + + in(i+3,j-1) * (-0.027777777777777776d0) & + + in(i+4,j-1) * (-0.027777777777777776d0) & + + in(i+5,j-1) * 
(-0.027777777777777776d0) & + + in(i+6,j-1) * (-0.027777777777777776d0) & + + in(i+7,j-1) * (-0.027777777777777776d0) & + + in(i+8,j-1) * (-0.027777777777777776d0) & + + in(i+9,j-1) * (-0.027777777777777776d0) & + + in(i-9,j+1) * (-0.00018155410312273057d0) & + + in(i-8,j+1) * (-0.0002314814814814815d0) & + + in(i-7,j+1) * (-0.00030525030525030525d0) & + + in(i-6,j+1) * (-0.00042087542087542086d0) & + + in(i-5,j+1) * (-0.0006172839506172839d0) & + + in(i-4,j+1) * (-0.000992063492063492d0) & + + in(i-3,j+1) * (-0.001851851851851852d0) & + + in(i-2,j+1) * (-0.004629629629629629d0) & + + in(i-1,j+1) * (-0.027777777777777776d0) & + + in(i+1,j+1) * (0.027777777777777776d0) & + + in(i+2,j+1) * (0.004629629629629629d0) & + + in(i+3,j+1) * (0.001851851851851852d0) & + + in(i+4,j+1) * (0.000992063492063492d0) & + + in(i+5,j+1) * (0.0006172839506172839d0) & + + in(i+6,j+1) * (0.00042087542087542086d0) & + + in(i+7,j+1) * (0.00030525030525030525d0) & + + in(i+8,j+1) * (0.0002314814814814815d0) & + + in(i+9,j+1) * (0.00018155410312273057d0) & + + in(i-9,j+2) * (-0.00018155410312273057d0) & + + in(i-8,j+2) * (-0.0002314814814814815d0) & + + in(i-7,j+2) * (-0.00030525030525030525d0) & + + in(i-6,j+2) * (-0.00042087542087542086d0) & + + in(i-5,j+2) * (-0.0006172839506172839d0) & + + in(i-4,j+2) * (-0.000992063492063492d0) & + + in(i-3,j+2) * (-0.001851851851851852d0) & + + in(i-2,j+2) * (-0.004629629629629629d0) & + + in(i-1,j+2) * (-0.027777777777777776d0) & + + in(i+1,j+2) * (0.004629629629629629d0) & + + in(i+2,j+2) * (0.013888888888888888d0) & + + in(i+3,j+2) * (0.001851851851851852d0) & + + in(i+4,j+2) * (0.000992063492063492d0) & + + in(i+5,j+2) * (0.0006172839506172839d0) & + + in(i+6,j+2) * (0.00042087542087542086d0) & + + in(i+7,j+2) * (0.00030525030525030525d0) & + + in(i+8,j+2) * (0.0002314814814814815d0) & + + in(i+9,j+2) * (0.00018155410312273057d0) & + + in(i-9,j+3) * (-0.00018155410312273057d0) & + + in(i-8,j+3) * (-0.0002314814814814815d0) & + + in(i-7,j+3) * 
(-0.00030525030525030525d0) & + + in(i-6,j+3) * (-0.00042087542087542086d0) & + + in(i-5,j+3) * (-0.0006172839506172839d0) & + + in(i-4,j+3) * (-0.000992063492063492d0) & + + in(i-3,j+3) * (-0.001851851851851852d0) & + + in(i-2,j+3) * (-0.004629629629629629d0) & + + in(i-1,j+3) * (-0.027777777777777776d0) & + + in(i+1,j+3) * (0.001851851851851852d0) & + + in(i+2,j+3) * (0.001851851851851852d0) & + + in(i+3,j+3) * (0.009259259259259259d0) & + + in(i+4,j+3) * (0.000992063492063492d0) & + + in(i+5,j+3) * (0.0006172839506172839d0) & + + in(i+6,j+3) * (0.00042087542087542086d0) & + + in(i+7,j+3) * (0.00030525030525030525d0) & + + in(i+8,j+3) * (0.0002314814814814815d0) & + + in(i+9,j+3) * (0.00018155410312273057d0) & + + in(i-9,j+4) * (-0.00018155410312273057d0) & + + in(i-8,j+4) * (-0.0002314814814814815d0) & + + in(i-7,j+4) * (-0.00030525030525030525d0) & + + in(i-6,j+4) * (-0.00042087542087542086d0) & + + in(i-5,j+4) * (-0.0006172839506172839d0) & + + in(i-4,j+4) * (-0.000992063492063492d0) & + + in(i-3,j+4) * (-0.001851851851851852d0) & + + in(i-2,j+4) * (-0.004629629629629629d0) & + + in(i-1,j+4) * (-0.027777777777777776d0) & + + in(i+1,j+4) * (0.000992063492063492d0) & + + in(i+2,j+4) * (0.000992063492063492d0) & + + in(i+3,j+4) * (0.000992063492063492d0) & + + in(i+4,j+4) * (0.006944444444444444d0) & + + in(i+5,j+4) * (0.0006172839506172839d0) & + + in(i+6,j+4) * (0.00042087542087542086d0) & + + in(i+7,j+4) * (0.00030525030525030525d0) & + + in(i+8,j+4) * (0.0002314814814814815d0) & + + in(i+9,j+4) * (0.00018155410312273057d0) & + + in(i-9,j+5) * (-0.00018155410312273057d0) & + + in(i-8,j+5) * (-0.0002314814814814815d0) & + + in(i-7,j+5) * (-0.00030525030525030525d0) & + + in(i-6,j+5) * (-0.00042087542087542086d0) & + + in(i-5,j+5) * (-0.0006172839506172839d0) & + + in(i-4,j+5) * (-0.000992063492063492d0) & + + in(i-3,j+5) * (-0.001851851851851852d0) & + + in(i-2,j+5) * (-0.004629629629629629d0) & + + in(i-1,j+5) * (-0.027777777777777776d0) & + + in(i+1,j+5) * 
(0.0006172839506172839d0) & + + in(i+2,j+5) * (0.0006172839506172839d0) & + + in(i+3,j+5) * (0.0006172839506172839d0) & + + in(i+4,j+5) * (0.0006172839506172839d0) & + + in(i+5,j+5) * (0.005555555555555556d0) & + + in(i+6,j+5) * (0.00042087542087542086d0) & + + in(i+7,j+5) * (0.00030525030525030525d0) & + + in(i+8,j+5) * (0.0002314814814814815d0) & + + in(i+9,j+5) * (0.00018155410312273057d0) & + + in(i-9,j+6) * (-0.00018155410312273057d0) & + + in(i-8,j+6) * (-0.0002314814814814815d0) & + + in(i-7,j+6) * (-0.00030525030525030525d0) & + + in(i-6,j+6) * (-0.00042087542087542086d0) & + + in(i-5,j+6) * (-0.0006172839506172839d0) & + + in(i-4,j+6) * (-0.000992063492063492d0) & + + in(i-3,j+6) * (-0.001851851851851852d0) & + + in(i-2,j+6) * (-0.004629629629629629d0) & + + in(i-1,j+6) * (-0.027777777777777776d0) & + + in(i+1,j+6) * (0.00042087542087542086d0) & + + in(i+2,j+6) * (0.00042087542087542086d0) & + + in(i+3,j+6) * (0.00042087542087542086d0) & + + in(i+4,j+6) * (0.00042087542087542086d0) & + + in(i+5,j+6) * (0.00042087542087542086d0) & + + in(i+6,j+6) * (0.004629629629629629d0) & + + in(i+7,j+6) * (0.00030525030525030525d0) & + + in(i+8,j+6) * (0.0002314814814814815d0) & + + in(i+9,j+6) * (0.00018155410312273057d0) & + + in(i-9,j+7) * (-0.00018155410312273057d0) & + + in(i-8,j+7) * (-0.0002314814814814815d0) & + + in(i-7,j+7) * (-0.00030525030525030525d0) & + + in(i-6,j+7) * (-0.00042087542087542086d0) & + + in(i-5,j+7) * (-0.0006172839506172839d0) & + + in(i-4,j+7) * (-0.000992063492063492d0) & + + in(i-3,j+7) * (-0.001851851851851852d0) & + + in(i-2,j+7) * (-0.004629629629629629d0) & + + in(i-1,j+7) * (-0.027777777777777776d0) & + + in(i+1,j+7) * (0.00030525030525030525d0) & + + in(i+2,j+7) * (0.00030525030525030525d0) & + + in(i+3,j+7) * (0.00030525030525030525d0) & + + in(i+4,j+7) * (0.00030525030525030525d0) & + + in(i+5,j+7) * (0.00030525030525030525d0) & + + in(i+6,j+7) * (0.00030525030525030525d0) & + + in(i+7,j+7) * (0.003968253968253968d0) & + + 
in(i+8,j+7) * (0.0002314814814814815d0) & + + in(i+9,j+7) * (0.00018155410312273057d0) & + + in(i-9,j+8) * (-0.00018155410312273057d0) & + + in(i-8,j+8) * (-0.0002314814814814815d0) & + + in(i-7,j+8) * (-0.00030525030525030525d0) & + + in(i-6,j+8) * (-0.00042087542087542086d0) & + + in(i-5,j+8) * (-0.0006172839506172839d0) & + + in(i-4,j+8) * (-0.000992063492063492d0) & + + in(i-3,j+8) * (-0.001851851851851852d0) & + + in(i-2,j+8) * (-0.004629629629629629d0) & + + in(i-1,j+8) * (-0.027777777777777776d0) & + + in(i+1,j+8) * (0.0002314814814814815d0) & + + in(i+2,j+8) * (0.0002314814814814815d0) & + + in(i+3,j+8) * (0.0002314814814814815d0) & + + in(i+4,j+8) * (0.0002314814814814815d0) & + + in(i+5,j+8) * (0.0002314814814814815d0) & + + in(i+6,j+8) * (0.0002314814814814815d0) & + + in(i+7,j+8) * (0.0002314814814814815d0) & + + in(i+8,j+8) * (0.003472222222222222d0) & + + in(i+9,j+8) * (0.00018155410312273057d0) & + + in(i-9,j+9) * (-0.00018155410312273057d0) & + + in(i-8,j+9) * (-0.0002314814814814815d0) & + + in(i-7,j+9) * (-0.00030525030525030525d0) & + + in(i-6,j+9) * (-0.00042087542087542086d0) & + + in(i-5,j+9) * (-0.0006172839506172839d0) & + + in(i-4,j+9) * (-0.000992063492063492d0) & + + in(i-3,j+9) * (-0.001851851851851852d0) & + + in(i-2,j+9) * (-0.004629629629629629d0) & + + in(i-1,j+9) * (-0.027777777777777776d0) & + + in(i+1,j+9) * (0.00018155410312273057d0) & + + in(i+2,j+9) * (0.00018155410312273057d0) & + + in(i+3,j+9) * (0.00018155410312273057d0) & + + in(i+4,j+9) * (0.00018155410312273057d0) & + + in(i+5,j+9) * (0.00018155410312273057d0) & + + in(i+6,j+9) * (0.00018155410312273057d0) & + + in(i+7,j+9) * (0.00018155410312273057d0) & + + in(i+8,j+9) * (0.00018155410312273057d0) & + + in(i+9,j+9) * (0.0030864197530864196d0) & +0.0 end do !$omp end simd From fcec426bd6eedbd501a059d7de931295ef7f68f1 Mon Sep 17 00:00:00 2001 From: Elliott Slaughter Date: Wed, 25 Oct 2017 13:20:18 -0700 Subject: [PATCH 182/245] Fix for mapper interface update. 
--- LEGION/Stencil/stencil.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/LEGION/Stencil/stencil.cc b/LEGION/Stencil/stencil.cc index bd09ba174..99b2a0666 100644 --- a/LEGION/Stencil/stencil.cc +++ b/LEGION/Stencil/stencil.cc @@ -96,7 +96,8 @@ class StencilMapper : public DefaultMapper const MapMustEpochInput& input, MapMustEpochOutput& output); virtual Memory default_policy_select_target_memory(MapperContext ctx, - Processor target_proc); + Processor target_proc, + const RegionRequirement &req); private: //std::vector& procs_list; std::vector& sysmems_list; @@ -119,7 +120,8 @@ StencilMapper::StencilMapper(MapperRuntime *rt, Machine machine, Processor local } Memory StencilMapper::default_policy_select_target_memory(MapperContext ctx, - Processor target_proc) + Processor target_proc, + const RegionRequirement &req) { return proc_sysmems[target_proc]; } From 80d3a11c9eeaf2b43e1d17b2e559eb52a4b8501b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 21 Mar 2018 13:07:32 -0700 Subject: [PATCH 183/245] do to SYCL what we have for OpenCL --- Cxx11/nstream-sycl.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index bebfb5932..2a8c83548 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -123,6 +123,10 @@ void run(cl::sycl::queue & q, int iterations, size_t length) std::cout << e.what() << std::endl; return; } + catch (const char * e) { + std::cout << e << std::endl; + return; + } ////////////////////////////////////////////////////////////////////// /// Analyze and output results @@ -262,9 +266,15 @@ int main(int argc, char * argv[]) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; + return 1; } catch (std::exception e) { std::cout << e.what() << std::endl; + return 1; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; } return 0; From af7f70a937195898dd51661a2e689aeb2b5d7307 Mon Sep 17 00:00:00 2001 From: Jeff Hammond 
Date: Thu, 22 Mar 2018 06:14:10 -0700 Subject: [PATCH 184/245] fix name mangling issue - thanks Rod@CodePlay! --- Cxx11/nstream-sycl.cc | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 2a8c83548..8969fdf3a 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -117,6 +117,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +<<<<<<< HEAD return; } catch (std::exception e) { @@ -125,6 +126,8 @@ void run(cl::sycl::queue & q, int iterations, size_t length) } catch (const char * e) { std::cout << e << std::endl; +======= +>>>>>>> fix name mangling issue - thanks Rod@CodePlay! return; } @@ -207,14 +210,22 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { +<<<<<<< HEAD if (1) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL auto device = host.get_device(); +======= + cl::sycl::queue cpu(cl::sycl::cpu_selector{}); + if (1) { + auto device = cpu.get_device(); + auto platform = device.get_platform(); +>>>>>>> fix name mangling issue - thanks Rod@CodePlay! std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +<<<<<<< HEAD #endif run(host, iterations, length); @@ -226,9 +237,22 @@ int main(int argc, char * argv[]) cl::sycl::queue cpu(cl::sycl::cpu_selector{}); #ifndef TRISYCL auto device = cpu.get_device(); +======= + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + + run(cpu, iterations, length); + run(cpu, iterations, length); + } + + cl::sycl::queue gpu(cl::sycl::gpu_selector{}); + if (1) { + auto device = gpu.get_device(); + auto platform = device.get_platform(); +>>>>>>> fix name mangling issue - thanks Rod@CodePlay! 
std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +<<<<<<< HEAD bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); #else bool has_spir = true; // ? @@ -262,10 +286,17 @@ int main(int argc, char * argv[]) run(gpu, iterations, length); #endif } +======= + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + + run(gpu, iterations, length); + run(gpu, iterations, length); +>>>>>>> fix name mangling issue - thanks Rod@CodePlay! } } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +<<<<<<< HEAD return 1; } catch (std::exception e) { @@ -274,6 +305,8 @@ int main(int argc, char * argv[]) } catch (const char * e) { std::cout << e << std::endl; +======= +>>>>>>> fix name mangling issue - thanks Rod@CodePlay! return 1; } From 0aa743c4eb560b8bc4f715b1b7b50488654fa658 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 22 Mar 2018 16:32:20 -0700 Subject: [PATCH 185/245] hard-code SYCL to CPU execution only due to GPU issues the bandwidth reported is consistent for elements, not bytes, which means that something is wrong. 64b data should not lead to BW that is 2x 32b data... --- Cxx11/nstream-sycl.cc | 45 +++---------------------------------------- 1 file changed, 3 insertions(+), 42 deletions(-) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 8969fdf3a..0142d7913 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -117,7 +117,6 @@ void run(cl::sycl::queue & q, int iterations, size_t length) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; -<<<<<<< HEAD return; } catch (std::exception e) { @@ -126,8 +125,6 @@ void run(cl::sycl::queue & q, int iterations, size_t length) } catch (const char * e) { std::cout << e << std::endl; -======= ->>>>>>> fix name mangling issue - thanks Rod@CodePlay! 
return; } @@ -210,24 +207,15 @@ int main(int argc, char * argv[]) ////////////////////////////////////////////////////////////////////// try { -<<<<<<< HEAD if (1) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL auto device = host.get_device(); -======= - cl::sycl::queue cpu(cl::sycl::cpu_selector{}); - if (1) { - auto device = cpu.get_device(); auto platform = device.get_platform(); ->>>>>>> fix name mangling issue - thanks Rod@CodePlay! std::cout << "SYCL Device: " << device.get_info() << std::endl; - auto platform = device.get_platform(); std::cout << "SYCL Platform: " << platform.get_info() << std::endl; -<<<<<<< HEAD #endif - run(host, iterations, length); run(host, iterations, length); } @@ -237,30 +225,12 @@ int main(int argc, char * argv[]) cl::sycl::queue cpu(cl::sycl::cpu_selector{}); #ifndef TRISYCL auto device = cpu.get_device(); -======= - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; - - run(cpu, iterations, length); - run(cpu, iterations, length); - } - - cl::sycl::queue gpu(cl::sycl::gpu_selector{}); - if (1) { - auto device = gpu.get_device(); auto platform = device.get_platform(); ->>>>>>> fix name mangling issue - thanks Rod@CodePlay! std::cout << "SYCL Device: " << device.get_info() << std::endl; - auto platform = device.get_platform(); std::cout << "SYCL Platform: " << platform.get_info() << std::endl; -<<<<<<< HEAD - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); -#else - bool has_spir = true; // ? 
#endif - if (has_spir) { - run(cpu, iterations, length); - run(cpu, iterations, length); - } + run(cpu, iterations, length); + run(cpu, iterations, length); } // NVIDIA GPU requires ptx64 target and does not work very well @@ -268,8 +238,8 @@ int main(int argc, char * argv[]) cl::sycl::queue gpu(cl::sycl::gpu_selector{}); #ifndef TRISYCL auto device = gpu.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); #else @@ -286,17 +256,10 @@ int main(int argc, char * argv[]) run(gpu, iterations, length); #endif } -======= - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; - - run(gpu, iterations, length); - run(gpu, iterations, length); ->>>>>>> fix name mangling issue - thanks Rod@CodePlay! } } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; -<<<<<<< HEAD return 1; } catch (std::exception e) { @@ -305,8 +268,6 @@ int main(int argc, char * argv[]) } catch (const char * e) { std::cout << e << std::endl; -======= ->>>>>>> fix name mangling issue - thanks Rod@CodePlay! 
return 1; } From 29d05b6c8be06751eec6d954b05f0d65d252d38f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 22 May 2018 08:23:07 -0700 Subject: [PATCH 186/245] add host, catch std exception --- Cxx11/nstream-sycl.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 0142d7913..d3ddbeab6 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -270,6 +270,9 @@ int main(int argc, char * argv[]) std::cout << e << std::endl; return 1; } + catch (std::exception e) { + std::cout << e.what() << std::endl; + } return 0; } From b9534313fa77417139f690a8a456683d581881c6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 31 May 2018 11:20:54 -0700 Subject: [PATCH 187/245] c++1z instead of c++17 --- travis/build-run-prk.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 962ecc1f4..1f40f02a8 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -668,9 +668,9 @@ case "$PRK_TARGET" in SYCLDIR=${TRAVIS_ROOT}/triSYCL if [ "${CC}" = "clang" ] ; then # SYCL will compile without OpenMP - echo "SYCLCXX=${PRK_CXX} -pthread -std=c++17" >> common/make.defs + echo "SYCLCXX=${PRK_CXX} -pthread -std=c++1z" >> common/make.defs else - echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++17" >> common/make.defs + echo "SYCLCXX=${PRK_CXX} -fopenmp -std=c++1z" >> common/make.defs fi echo "SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/include" >> common/make.defs ${MAKE} -C $PRK_TARGET_PATH p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl From 33d5286576e1a0eff3a3e180f7ecdcf362200f98 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 3 Mar 2019 11:00:04 -0800 Subject: [PATCH 188/245] add list platforms --- Cxx11/nstream-opencl.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc index 40b76d4cc..4ef40bd64 100644 --- a/Cxx11/nstream-opencl.cc +++ b/Cxx11/nstream-opencl.cc @@ -190,6 +190,8 @@ 
int main(int argc, char* argv[]) /// Setup OpenCL environment ////////////////////////////////////////////////////////////////////// + prk::opencl::listPlatforms(); + cl_int err = CL_SUCCESS; cl::Context cpu(CL_DEVICE_TYPE_CPU, NULL, NULL, NULL, &err); From 678ef8c652072d4aae41c553d0a457872daf9ad0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 3 Mar 2019 11:00:41 -0800 Subject: [PATCH 189/245] add OpenCL info and SYCL exception parsing --- Cxx11/nstream-sycl.cc | 55 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index d3ddbeab6..26025943c 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -66,6 +66,11 @@ #include "prk_util.h" +#if 0 +#include "prk_opencl.h" +#define USE_OPENCL 1 +#endif + // need to declare kernel class as template // to prevent name mangling conflict below template class nstream; @@ -89,9 +94,9 @@ void run(cl::sycl::queue & q, int iterations, size_t length) try { - cl::sycl::buffer d_A { h_A.data(), h_A.size() }; - cl::sycl::buffer d_B { h_B.data(), h_B.size() }; - cl::sycl::buffer d_C { h_C.data(), h_C.size() }; + cl::sycl::buffer d_A { h_A.data(), cl::sycl::range<1>(h_A.size()) }; + cl::sycl::buffer d_B { h_B.data(), cl::sycl::range<1>(h_B.size()) }; + cl::sycl::buffer d_C { h_C.data(), cl::sycl::range<1>(h_C.size()) }; for (int iter = 0; iter<=iterations; ++iter) { @@ -117,6 +122,11 @@ void run(cl::sycl::queue & q, int iterations, size_t length) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; return; } catch (std::exception e) { @@ -206,8 +216,11 @@ int main(int argc, char * argv[]) /// Setup SYCL environment 
////////////////////////////////////////////////////////////////////// - try { +#ifdef USE_OPENCL + prk::opencl::listPlatforms(); +#endif + try { if (1) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL @@ -219,7 +232,20 @@ int main(int argc, char * argv[]) run(host, iterations, length); run(host, iterations, length); } + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + } + try { // CPU requires spir64 target if (1) { cl::sycl::queue cpu(cl::sycl::cpu_selector{}); @@ -232,7 +258,20 @@ int main(int argc, char * argv[]) run(cpu, iterations, length); run(cpu, iterations, length); } + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + } + try { // NVIDIA GPU requires ptx64 target and does not work very well if (1) { cl::sycl::queue gpu(cl::sycl::gpu_selector{}); @@ -260,6 +299,11 @@ int main(int argc, char * argv[]) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; return 1; } catch (std::exception e) { @@ -270,9 +314,6 @@ int main(int argc, char * argv[]) std::cout << e << std::endl; return 1; } - catch (std::exception e) { 
- std::cout << e.what() << std::endl; - } return 0; } From 05f30bdff6bd6f5ddbf787abca6935fb96061f95 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 May 2019 13:10:29 -0700 Subject: [PATCH 190/245] SYCL with explicit date movement --- Cxx11/Makefile | 2 +- Cxx11/nstream-sycl-explicit.cc | 337 +++++++++++++++++++++++++++++++ Cxx11/transpose-sycl-explicit.cc | 299 +++++++++++++++++++++++++++ 3 files changed, 637 insertions(+), 1 deletion(-) create mode 100644 Cxx11/nstream-sycl-explicit.cc create mode 100644 Cxx11/transpose-sycl-explicit.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index b166d65d4..d1c945ea6 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -105,7 +105,7 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl -sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl +sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl transpose-sycl-explicit nstream-sycl-explicit tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \ p2p-hyperplane-vector-tbb p2p-tasks-tbb diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc new file mode 100644 index 000000000..2f177db35 --- /dev/null +++ b/Cxx11/nstream-sycl-explicit.cc @@ -0,0 +1,337 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. 
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). 
+/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "CL/sycl.hpp" +#include "prk_util.h" + +#define PREBUILD_KERNEL 1 + +#if 0 +#include "prk_opencl.h" +#define USE_OPENCL 1 +#endif + +// need to declare kernel class as template +// to prevent name mangling conflict below +template class nstream; + +template +void run(cl::sycl::queue & q, int iterations, size_t length) +{ + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time(0); + + const T scalar(3); + + std::vector h_A(length,0); + + try { + +#if PREBUILD_KERNEL + cl::sycl::program kernel(q.get_context()); + kernel.build_with_kernel_type>(); +#endif + + cl::sycl::buffer d_A { cl::sycl::range<1>{length} }; + cl::sycl::buffer d_B { cl::sycl::range<1>{length} }; + cl::sycl::buffer d_C { cl::sycl::range<1>{length} }; + + q.submit([&](cl::sycl::handler& h) { + cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + h.fill(A,(T)0); + }); + q.submit([&](cl::sycl::handler& h) { + cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + h.fill(B,(T)2); + }); + q.submit([&](cl::sycl::handler& h) { + cl::sycl::accessor C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + h.fill(C,(T)2); + }); + q.wait(); + + for (int iter = 0; iter<=iterations; ++iter) { + + if (iter==1) nstream_time = prk::wtime(); + + q.submit([&](cl::sycl::handler& h) { + + cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(length), 
cl::sycl::id<1>(0)); + cl::sycl::accessor C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + + h.parallel_for>( +#if PREBUILD_KERNEL + kernel.get_kernel>(), +#endif + cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { + A[i] += B[i] + scalar * C[i]; + }); + }); + q.wait(); + } + + // Stop timer before buffer+accessor destructors fire, + // since that will move data, and we do not time that + // for other device-oriented programming models. + nstream_time = prk::wtime() - nstream_time; + + q.submit([&](cl::sycl::handler& h) { + cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + h.copy(A,h_A.data()); + }); + q.wait(); + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; + return; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + return; + } + catch (const char * e) { + std::cout << e << std::endl; + return; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + T ar(0); + T br(2); + T cr(2); + for (int i=0; i<=iterations; ++i) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(T); + std::cout << 8*sizeof(T) << "B " + << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } 
+} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + +#ifdef USE_OPENCL + prk::opencl::listPlatforms(); +#endif + + try { + if (length<100000) { + cl::sycl::queue host(cl::sycl::host_selector{}); +#ifndef TRISYCL + auto device = host.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +#endif + run(host, iterations, length); + run(host, iterations, length); + } else { + std::cout << "Skipping host device since it is too slow for large problems" << std::endl; + } + + // CPU requires spir64 target + if (1) { + cl::sycl::queue cpu(cl::sycl::cpu_selector{}); +#ifndef TRISYCL + auto device = cpu.get_device(); + auto platform = device.get_platform(); 
+ std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); +#else + bool has_spir = true; // ? +#endif + if (has_spir) { + run(cpu, iterations, length); + run(cpu, iterations, length); + } + } + + // NVIDIA GPU requires ptx64 target and does not work very well + if (1) { + cl::sycl::queue gpu(cl::sycl::gpu_selector{}); +#ifndef TRISYCL + auto device = gpu.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); + bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); +#else + bool has_spir = true; // ? + bool has_fp64 = true; +#endif + if (!has_fp64) { + std::cout << "SYCL GPU device lacks FP64 support." << std::endl; + } + if (has_spir) { + run(gpu, iterations, length); + if (has_fp64) { + run(gpu, iterations, length); + } + } else { + std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; +#ifdef __COMPUTECPP__ + std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; + run(gpu, iterations, length); + if (has_fp64) { + run(gpu, iterations, length); + } +#endif + } + } + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; + return 1; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + return 1; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + return 0; +} + + diff --git a/Cxx11/transpose-sycl-explicit.cc b/Cxx11/transpose-sycl-explicit.cc new file mode 100644 index 000000000..8b09da622 --- /dev/null +++ b/Cxx11/transpose-sycl-explicit.cc @@ -0,0 +1,299 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "CL/sycl.hpp" +#include "prk_util.h" + +#define PREBUILD_KERNEL 1 + +// need to declare kernel class as template +// to prevent name mangling conflict below +template class iota; +template class transpose; + +template +void run(cl::sycl::queue & q, int iterations, size_t order) +{ + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + double trans_time(0); + + std::vector h_B(order*order,(T)0); + + try { + +#if PREBUILD_KERNEL + cl::sycl::program kernel(q.get_context()); + kernel.build_with_kernel_type>(); +#endif + +#if USE_2D_INDEXING + cl::sycl::buffer d_A( cl::sycl::range<2>{order,order} ); + cl::sycl::buffer d_B( cl::sycl::range<2>{order,order} ); +#else + cl::sycl::buffer d_A { cl::sycl::range<1>{order*order} }; + cl::sycl::buffer d_B { cl::sycl::range<1>{order*order} }; +#endif + + q.submit([&](cl::sycl::handler& h) { +#if USE_2D_INDEXING + cl::sycl::accessor A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); + h.parallel_for>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> i) { + A[i] = i[0] * order + i[1]; + }); +#else + cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); + h.parallel_for>(cl::sycl::range<1>{order*order}, [=] (cl::sycl::item<1> i) { + A[i] = i[0]; + }); +#endif + }); + q.submit([&](cl::sycl::handler& h) { +#if USE_2D_INDEXING + cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); +#else + cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); +#endif + h.fill(B,(T)0); + }); + q.wait(); + + for (int iter = 0; iter<=iterations; ++iter) { + + if (iter==1) trans_time = prk::wtime(); + + q.submit([&](cl::sycl::handler& h) { + +#if USE_2D_INDEXING + cl::sycl::accessor A(d_A, h, 
cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); + cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); +#else + cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); + cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); +#endif + + h.parallel_for>( +#if PREBUILD_KERNEL + kernel.get_kernel>(), +#endif + cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { +#if USE_2D_INDEXING + cl::sycl::id<2> ij{it[0],it[1]}; + cl::sycl::id<2> ji{it[1],it[0]}; + B[ij] += A[ji]; + A[ji] += (T)1; +#else + B[it[0] * order + it[1]] += A[it[1] * order + it[0]]; + A[it[1] * order + it[0]] += (T)1; +#endif + }); + }); + q.wait(); + } + + // Stop timer before buffer+accessor destructors fire, + // since that will move data, and we do not time that + // for other device-oriented programming models. + trans_time = prk::wtime() - trans_time; + + q.submit([&](cl::sycl::handler& h) { +#if USE_2D_INDEXING + cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); +#else + cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); +#endif + h.copy(B,h_B.data()); + }); + q.wait(); + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + return; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + return; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // TODO: replace with std::generate, std::accumulate, or similar + const T addit = (iterations+1.) 
* (iterations/2.); + double abserr(0); + for (size_t i=0; i(ij)*(1.+iterations)+addit; + abserr += std::fabs(h_B[ji] - reference); + } + } + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const double epsilon(1.0e-8); + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + double avgtime = trans_time/iterations; + double bytes = (size_t)order * (size_t)order * sizeof(T); + std::cout << 8*sizeof(T) << "B " + << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + } +} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t order; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + // number of times to do the transpose + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // order of a the matrix + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + + try { + + if (1) { + 
cl::sycl::queue host(cl::sycl::host_selector{}); +#ifndef TRISYCL + auto device = host.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +#endif + + run(host, iterations, order); + run(host, iterations, order); + } + + // CPU requires spir64 target + if (1) { + cl::sycl::queue cpu(cl::sycl::cpu_selector{}); +#ifndef TRISYCL + auto device = cpu.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; +#endif + + run(cpu, iterations, order); + run(cpu, iterations, order); + } + + // NVIDIA GPU requires ptx64 target and does not work very well + if (0) { + cl::sycl::queue gpu(cl::sycl::gpu_selector{}); +#ifndef TRISYCL + auto device = gpu.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; +#endif + + run(gpu, iterations, order); + run(gpu, iterations, order); + } + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + } + + return 0; +} + + From 11c06ca00e7a7d6af14e7ce04b0543e365f22dbf Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 May 2019 13:12:13 -0700 Subject: [PATCH 191/245] reconcile structure with explicit copy versions --- Cxx11/nstream-sycl.cc | 72 ++++++++++++++++++++--------------------- Cxx11/transpose-sycl.cc | 22 +++++++++---- 2 files changed, 50 insertions(+), 44 deletions(-) diff --git a/Cxx11/nstream-sycl.cc 
b/Cxx11/nstream-sycl.cc index 26025943c..277f9435e 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -63,9 +63,10 @@ ////////////////////////////////////////////////////////////////////// #include "CL/sycl.hpp" - #include "prk_util.h" +#define PREBUILD_KERNEL 1 + #if 0 #include "prk_opencl.h" #define USE_OPENCL 1 @@ -84,16 +85,19 @@ void run(cl::sycl::queue & q, int iterations, size_t length) double nstream_time(0); + const T scalar(3); + std::vector h_A(length,0); std::vector h_B(length,2); std::vector h_C(length,2); - auto range = prk::range(static_cast(0), length); - - const T scalar(3); - try { +#if PREBUILD_KERNEL + cl::sycl::program kernel(q.get_context()); + kernel.build_with_kernel_type>(); +#endif + cl::sycl::buffer d_A { h_A.data(), cl::sycl::range<1>(h_A.size()) }; cl::sycl::buffer d_B { h_B.data(), cl::sycl::range<1>(h_B.size()) }; cl::sycl::buffer d_C { h_C.data(), cl::sycl::range<1>(h_C.size()) }; @@ -108,7 +112,11 @@ void run(cl::sycl::queue & q, int iterations, size_t length) auto B = d_B.template get_access(h); auto C = d_C.template get_access(h); - h.parallel_for>(cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { + h.parallel_for>( +#if PREBUILD_KERNEL + kernel.get_kernel>(), +#endif + cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { A[i] += B[i] + scalar * C[i]; }); }); @@ -221,7 +229,7 @@ int main(int argc, char * argv[]) #endif try { - if (1) { + if (length<100000) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL auto device = host.get_device(); @@ -231,21 +239,10 @@ int main(int argc, char * argv[]) #endif run(host, iterations, length); run(host, iterations, length); + } else { + std::cout << "Skipping host device since it is too slow for large problems" << std::endl; } - } - catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - 
std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; - } - catch (std::exception e) { - std::cout << e.what() << std::endl; - } - try { // CPU requires spir64 target if (1) { cl::sycl::queue cpu(cl::sycl::cpu_selector{}); @@ -254,24 +251,16 @@ int main(int argc, char * argv[]) auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); +#else + bool has_spir = true; // ? #endif - run(cpu, iterations, length); - run(cpu, iterations, length); + if (has_spir) { + run(cpu, iterations, length); + run(cpu, iterations, length); + } } - } - catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; - } - catch (std::exception e) { - std::cout << e.what() << std::endl; - } - try { // NVIDIA GPU requires ptx64 target and does not work very well if (1) { cl::sycl::queue gpu(cl::sycl::gpu_selector{}); @@ -281,18 +270,27 @@ int main(int argc, char * argv[]) std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); + bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); #else bool has_spir = true; // ? + bool has_fp64 = true; #endif + if (!has_fp64) { + std::cout << "SYCL GPU device lacks FP64 support." << std::endl; + } if (has_spir) { run(gpu, iterations, length); - run(gpu, iterations, length); + if (has_fp64) { + run(gpu, iterations, length); + } } else { std::cout << "SYCL GPU device lacks SPIR-V support." 
<< std::endl; #ifdef __COMPUTECPP__ std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl; run(gpu, iterations, length); - run(gpu, iterations, length); + if (has_fp64) { + run(gpu, iterations, length); + } #endif } } diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index e7b1d94d2..a0fa97d00 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -50,9 +50,10 @@ ////////////////////////////////////////////////////////////////////// #include "CL/sycl.hpp" - #include "prk_util.h" +#define PREBUILD_KERNEL 1 + // need to declare kernel class as template // to prevent name mangling conflict below template class transpose; @@ -67,13 +68,18 @@ void run(cl::sycl::queue & q, int iterations, size_t order) double trans_time(0); std::vector h_A(order*order); - std::vector h_B(order*order,static_cast(0)); + std::vector h_B(order*order,(T)0); // fill A with the sequence 0 to order^2-1 as doubles std::iota(h_A.begin(), h_A.end(), static_cast(0)); try { +#if PREBUILD_KERNEL + cl::sycl::program kernel(q.get_context()); + kernel.build_with_kernel_type>(); +#endif + #if USE_2D_INDEXING cl::sycl::buffer d_A( h_A.data(), cl::sycl::range<2>{order,order} ); cl::sycl::buffer d_B( h_B.data(), cl::sycl::range<2>{order,order} ); @@ -92,16 +98,19 @@ void run(cl::sycl::queue & q, int iterations, size_t order) auto A = d_A.template get_access(h); auto B = d_B.template get_access(h); - // transpose - h.parallel_for>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { + h.parallel_for>( +#if PREBUILD_KERNEL + kernel.get_kernel>(), +#endif + cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { #if USE_2D_INDEXING cl::sycl::id<2> ij{it[0],it[1]}; cl::sycl::id<2> ji{it[1],it[0]}; B[ij] += A[ji]; - A[ji] += static_cast(1); + A[ji] += (T)1; #else B[it[0] * order + it[1]] += A[it[1] * order + it[0]]; - A[it[1] * order + it[0]] += static_cast(1); + A[it[1] * order + it[0]] += (T)1; #endif }); }); @@ -238,7 +247,6 @@ int main(int 
argc, char * argv[]) std::cout << "SYCL Platform: " << platform.get_info() << std::endl; //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; #endif - run(gpu, iterations, order); run(gpu, iterations, order); } From 8f431b1e68381b485c63c03a5457962659a227ea Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 May 2019 13:18:52 -0700 Subject: [PATCH 192/245] fixing things --- Cxx11/Makefile | 2 +- .../{nstream-sycl-explicit.cc => nstream-explicit-sycl.cc} | 2 -- Cxx11/nstream-sycl.cc | 2 -- Cxx11/prk_util.h | 7 +++++++ ...anspose-sycl-explicit.cc => transpose-explicit-sycl.cc} | 2 -- Cxx11/transpose-sycl.cc | 2 -- 6 files changed, 8 insertions(+), 9 deletions(-) rename Cxx11/{nstream-sycl-explicit.cc => nstream-explicit-sycl.cc} (99%) rename Cxx11/{transpose-sycl-explicit.cc => transpose-explicit-sycl.cc} (99%) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index d1c945ea6..f96e63744 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -105,7 +105,7 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl -sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl transpose-sycl-explicit nstream-sycl-explicit +sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl transpose-explicit-sycl nstream-explicit-sycl tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \ p2p-hyperplane-vector-tbb p2p-tasks-tbb diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-explicit-sycl.cc similarity index 99% rename from Cxx11/nstream-sycl-explicit.cc rename to Cxx11/nstream-explicit-sycl.cc index 2f177db35..6367bf660 100644 --- a/Cxx11/nstream-sycl-explicit.cc +++ b/Cxx11/nstream-explicit-sycl.cc @@ -65,8 +65,6 @@ #include "CL/sycl.hpp" #include "prk_util.h" -#define PREBUILD_KERNEL 1 - #if 0 #include "prk_opencl.h" #define USE_OPENCL 1 diff --git 
a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 277f9435e..b0fd07be1 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -65,8 +65,6 @@ #include "CL/sycl.hpp" #include "prk_util.h" -#define PREBUILD_KERNEL 1 - #if 0 #include "prk_opencl.h" #define USE_OPENCL 1 diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index d2caae1b7..c969af7fd 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -82,6 +82,13 @@ #define PRK_UNUSED #endif +// for SYCL +#ifdef TRISYCL +#define PREBUILD_KERNEL 0 +#else +#define PREBUILD_KERNEL 1 +#endif + namespace prk { int get_alignment(void) diff --git a/Cxx11/transpose-sycl-explicit.cc b/Cxx11/transpose-explicit-sycl.cc similarity index 99% rename from Cxx11/transpose-sycl-explicit.cc rename to Cxx11/transpose-explicit-sycl.cc index 8b09da622..c34497d97 100644 --- a/Cxx11/transpose-sycl-explicit.cc +++ b/Cxx11/transpose-explicit-sycl.cc @@ -52,8 +52,6 @@ #include "CL/sycl.hpp" #include "prk_util.h" -#define PREBUILD_KERNEL 1 - // need to declare kernel class as template // to prevent name mangling conflict below template class iota; diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index a0fa97d00..b853ccf7b 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -52,8 +52,6 @@ #include "CL/sycl.hpp" #include "prk_util.h" -#define PREBUILD_KERNEL 1 - // need to declare kernel class as template // to prevent name mangling conflict below template class transpose; From a0d8b21fbca31c78f1ef2a8f06a8de612e82dd6c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 May 2019 13:56:50 -0700 Subject: [PATCH 193/245] enable optimizations --- common/make.defs.llvm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 224edb8d9..092da96b8 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -61,7 +61,7 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib SYCLDIR=/opt/sycl/latest SYCLCXX=${SYCLDIR}/bin/compute++ SYCLFLAG=-DUSE_SYCL -sycl-driver 
-I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp -SYCLFLAG+=-std=c++14 +SYCLFLAG+=-std=c++14 -O3 # This makes a huge difference in e.g. nstream... #SYCLFLAG+=-no-serial-memop # CentOS7 and Ubuntu14 built for this From 4d7092e9008f33488ea7fe9074cb28e4c1a0657e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 May 2019 14:01:46 -0700 Subject: [PATCH 194/245] make.defs.gcc update (#397) --- common/make.defs.gcc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index 50b7a572a..b0566487a 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -62,6 +62,7 @@ SYCLFLAG=-I$(SYCLDIR)/include #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx #SYCLCXX=${CXX} ${OPENMPFLAG} #SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +METALFLAG=-framework MetalPerformanceShaders # # OCCA # @@ -103,6 +104,19 @@ SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL #SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} SYCLFLAG+=${RANGEFLAG} # +# SYCL flags +# +# triSYCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... +SYCLDIR=./triSYCL +SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG} +SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} +# ProGTX +# https://github.com/ProGTX/sycl-gtx +#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx +#SYCLCXX=${CXX} ${OPENMPFLAG} +#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +# # CBLAS for C++ DGEMM # BLASFLAG=-DACCELERATE -framework Accelerate From bc7e338cdf670851ed7a970a2d12d847711f98f4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 May 2019 14:09:00 -0700 Subject: [PATCH 195/245] add SYCL nstream and transpose with explicit data movement (#395) * triSYCL needs C++17 * fix Julia syntax issue "1./" is a syntax error now. 
change to "1.0/" * do to SYCL what we have for OpenCL * fix name mangling issue - thanks Rod@CodePlay! * run 32b for all devices unconditionally * label result with precision * hard-code SYCL to CPU execution only due to GPU issues the bandwidth reported is consistent for elements, not bytes, which means that something is wrong. 64b data should not lead to BW that is 2x 32b data... * add host, catch std exception * c++1z instead of c++17 * fix use of ranges in SYCL * correct sycl ranges fix * better example flags * do to SYCL what we have for OpenCL * fix name mangling issue - thanks Rod@CodePlay! * run 32b for all devices unconditionally * label result with precision * hard-code SYCL to CPU execution only due to GPU issues the bandwidth reported is consistent for elements, not bytes, which means that something is wrong. 64b data should not lead to BW that is 2x 32b data... * add host, catch std exception * c++1z instead of c++17 * fix use of ranges in SYCL * correct sycl ranges fix * better example flags * do not incorrectly declare non-read-only buffers as read-only * Sycl multi device and exceptions (#347) * triSYCL needs C++17 * fix Julia syntax issue "1./" is a syntax error now. change to "1.0/" * do to SYCL what we have for OpenCL * fix name mangling issue - thanks Rod@CodePlay! * run 32b for all devices unconditionally * label result with precision * hard-code SYCL to CPU execution only due to GPU issues the bandwidth reported is consistent for elements, not bytes, which means that something is wrong. 64b data should not lead to BW that is 2x 32b data... 
* add host, catch std exception * c++1z instead of c++17 * fix use of ranges in SYCL * correct sycl ranges fix * better example flags * fix nstream correctness by initializing host vectors * make transpose-sycl multi-device etc * templatize stencil sycl kernel over type * SYCL stencil now templated * try to detect working configs better * forward-declare kernel names in SYCL stencil * fix float template for 2D case * declare kernel name templates closer to usage * OpenCL: add No Device errors (#373) * add No Device errors * errno needs to be included unconditionally * remove Rust from parent makefile to unbreak case when cargo missing * avoid overflow * Cxx11 nstream-kokkos: add missing fences There are fences missing hence you wont' measure what you think on asynchronous backends such as CUDA or HPX. This also fixes using the actual name of the exec space instead of typeid. Example for CUDA on V100: Original: Parallel Research Kernels version 2.16 C++11/Kokkos STREAM triad: A = B + scalar * C Number of iterations = 1 Vector length = 100000000 Offset = 0 Kokkos execution space: N6Kokkos4CudaE Solution validates Rate (MB/s): 422188 Avg time (s): 0.00757957 With fences (and name fix): Parallel Research Kernels version 2.16 C++11/Kokkos STREAM triad: A = B + scalar * C Number of iterations = 1 Vector length = 100000000 Offset = 0 Kokkos execution space: Cuda Solution validates Rate (MB/s): 842600 Avg time (s): 0.00379777 * fix how BLAS linked in Fortran * range-based TBB parallel_for * show but do not enable non-range-based for in RAJA * not yet working prk::vector * fix prk::vector * eliminate rule conflict * switch from std::vector to prk::vector * use prk::vector instead of std::vector * use prk::vector instead of std::vector * use prk::vector instead of std::vector * use prk::vector instead of std::vector * use prk::vector instead of std::vector * better=simpler use of STL * add variant for prk::vector * try to implement prk::vector - works for some impls * add 
versions that use prk::vector rather than STL * add versions that use prk::vector rather than STL * ignore more stuff * cleanup stencil codegen for vector classes * clean new targets (prk::vector sequential) * silence GCC warning * silence GCC warning * add new impls * reorder loops * fix issues with Thrust when not using NVCC * update examples for Thrust changes * switch Thrust to use PRK range wrapper * work around Clang FE issue * add hyperplane OpenMP to C1z * silent compiler warning * prk::vector impl seems to be working * silence compiler warning * clean example for Intel toolchain * use .data() instead of &([0]) and dynamic schedule loop in DGEMM CBLAS * Flang is mostly Fortran 2008 complete now * add kokkos::fence where appropriate * Update make.defs.llvm default to CodePlay disable OCCA * pointless reordering of string * add PGI support for IVDEP * return value qualified is ignored * TBB does not support PGI * update PGI example flags * fix errors * add hyperplane to make and travis * try to use explicit data movement * fix nstream-sycl but performance still terrible * add optimization flag to example make.defs * add kernel prebuild option and check for fp64 support * improve SYCL transpose 1D and 2D both wrong for order>1295 * partial merge from master * reconcile nstream sycl * merging * reconcile transpose sycl --- Cxx11/nstream-sycl-explicit.cc | 337 +++++++++++++++++++++++++++++++ Cxx11/nstream-sycl.cc | 2 + Cxx11/transpose-sycl-explicit.cc | 299 +++++++++++++++++++++++++++ Cxx11/transpose-sycl.cc | 2 + 4 files changed, 640 insertions(+) create mode 100644 Cxx11/nstream-sycl-explicit.cc create mode 100644 Cxx11/transpose-sycl-explicit.cc diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc new file mode 100644 index 000000000..2f177db35 --- /dev/null +++ b/Cxx11/nstream-sycl-explicit.cc @@ -0,0 +1,337 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without 
+/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. 
+/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "CL/sycl.hpp" +#include "prk_util.h" + +#define PREBUILD_KERNEL 1 + +#if 0 +#include "prk_opencl.h" +#define USE_OPENCL 1 +#endif + +// need to declare kernel class as template +// to prevent name mangling conflict below +template class nstream; + +template +void run(cl::sycl::queue & q, int iterations, size_t length) +{ + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time(0); + + const T scalar(3); + + std::vector h_A(length,0); + + try { + +#if PREBUILD_KERNEL + cl::sycl::program kernel(q.get_context()); + kernel.build_with_kernel_type>(); +#endif + + cl::sycl::buffer d_A { cl::sycl::range<1>{length} }; + cl::sycl::buffer d_B { cl::sycl::range<1>{length} }; + cl::sycl::buffer d_C { cl::sycl::range<1>{length} }; + + q.submit([&](cl::sycl::handler& h) { + cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + h.fill(A,(T)0); + }); + 
q.submit([&](cl::sycl::handler& h) { + cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + h.fill(B,(T)2); + }); + q.submit([&](cl::sycl::handler& h) { + cl::sycl::accessor C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + h.fill(C,(T)2); + }); + q.wait(); + + for (int iter = 0; iter<=iterations; ++iter) { + + if (iter==1) nstream_time = prk::wtime(); + + q.submit([&](cl::sycl::handler& h) { + + cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + cl::sycl::accessor C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + + h.parallel_for>( +#if PREBUILD_KERNEL + kernel.get_kernel>(), +#endif + cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { + A[i] += B[i] + scalar * C[i]; + }); + }); + q.wait(); + } + + // Stop timer before buffer+accessor destructors fire, + // since that will move data, and we do not time that + // for other device-oriented programming models. 
+ nstream_time = prk::wtime() - nstream_time; + + q.submit([&](cl::sycl::handler& h) { + cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + h.copy(A,h_A.data()); + }); + q.wait(); + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; + return; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + return; + } + catch (const char * e) { + std::cout << e << std::endl; + return; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + T ar(0); + T br(2); + T cr(2); + for (int i=0; i<=iterations; ++i) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(T); + std::cout << 8*sizeof(T) << "B " + << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } +} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + 
throw "Usage: <# iterations> "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + +#ifdef USE_OPENCL + prk::opencl::listPlatforms(); +#endif + + try { + if (length<100000) { + cl::sycl::queue host(cl::sycl::host_selector{}); +#ifndef TRISYCL + auto device = host.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +#endif + run(host, iterations, length); + run(host, iterations, length); + } else { + std::cout << "Skipping host device since it is too slow for large problems" << std::endl; + } + + // CPU requires spir64 target + if (1) { + cl::sycl::queue cpu(cl::sycl::cpu_selector{}); +#ifndef TRISYCL + auto device = cpu.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); +#else + bool has_spir = true; // ? 
+#endif + if (has_spir) { + run(cpu, iterations, length); + run(cpu, iterations, length); + } + } + + // NVIDIA GPU requires ptx64 target and does not work very well + if (1) { + cl::sycl::queue gpu(cl::sycl::gpu_selector{}); +#ifndef TRISYCL + auto device = gpu.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); + bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); +#else + bool has_spir = true; // ? + bool has_fp64 = true; +#endif + if (!has_fp64) { + std::cout << "SYCL GPU device lacks FP64 support." << std::endl; + } + if (has_spir) { + run(gpu, iterations, length); + if (has_fp64) { + run(gpu, iterations, length); + } + } else { + std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; +#ifdef __COMPUTECPP__ + std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; + run(gpu, iterations, length); + if (has_fp64) { + run(gpu, iterations, length); + } +#endif + } + } + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; + return 1; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + return 1; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + return 0; +} + + diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index b0fd07be1..277f9435e 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -65,6 +65,8 @@ #include "CL/sycl.hpp" #include "prk_util.h" +#define PREBUILD_KERNEL 1 + #if 0 #include "prk_opencl.h" #define USE_OPENCL 1 diff --git a/Cxx11/transpose-sycl-explicit.cc b/Cxx11/transpose-sycl-explicit.cc new file mode 100644 index 000000000..8b09da622 --- /dev/null +++ b/Cxx11/transpose-sycl-explicit.cc @@ -0,0 +1,299 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "CL/sycl.hpp" +#include "prk_util.h" + +#define PREBUILD_KERNEL 1 + +// need to declare kernel class as template +// to prevent name mangling conflict below +template class iota; +template class transpose; + +template +void run(cl::sycl::queue & q, int iterations, size_t order) +{ + ////////////////////////////////////////////////////////////////////// + /// Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + double trans_time(0); + + std::vector h_B(order*order,(T)0); + + try { + +#if PREBUILD_KERNEL + cl::sycl::program kernel(q.get_context()); + kernel.build_with_kernel_type>(); +#endif + +#if USE_2D_INDEXING + cl::sycl::buffer d_A( cl::sycl::range<2>{order,order} ); + cl::sycl::buffer d_B( cl::sycl::range<2>{order,order} ); +#else + cl::sycl::buffer d_A { cl::sycl::range<1>{order*order} }; + cl::sycl::buffer d_B { cl::sycl::range<1>{order*order} }; +#endif + + q.submit([&](cl::sycl::handler& h) { +#if USE_2D_INDEXING + cl::sycl::accessor A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); + h.parallel_for>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> i) { + A[i] = i[0] * order + i[1]; + }); +#else + cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); + h.parallel_for>(cl::sycl::range<1>{order*order}, [=] (cl::sycl::item<1> i) { + A[i] = i[0]; + }); +#endif + }); + q.submit([&](cl::sycl::handler& h) { +#if USE_2D_INDEXING + cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); +#else + cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); +#endif + h.fill(B,(T)0); + }); + q.wait(); + + for (int iter = 0; iter<=iterations; ++iter) { + + if (iter==1) trans_time = prk::wtime(); + + q.submit([&](cl::sycl::handler& h) { + +#if USE_2D_INDEXING + cl::sycl::accessor A(d_A, h, 
cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); + cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); +#else + cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); + cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); +#endif + + h.parallel_for>( +#if PREBUILD_KERNEL + kernel.get_kernel>(), +#endif + cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { +#if USE_2D_INDEXING + cl::sycl::id<2> ij{it[0],it[1]}; + cl::sycl::id<2> ji{it[1],it[0]}; + B[ij] += A[ji]; + A[ji] += (T)1; +#else + B[it[0] * order + it[1]] += A[it[1] * order + it[0]]; + A[it[1] * order + it[0]] += (T)1; +#endif + }); + }); + q.wait(); + } + + // Stop timer before buffer+accessor destructors fire, + // since that will move data, and we do not time that + // for other device-oriented programming models. + trans_time = prk::wtime() - trans_time; + + q.submit([&](cl::sycl::handler& h) { +#if USE_2D_INDEXING + cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); +#else + cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); +#endif + h.copy(B,h_B.data()); + }); + q.wait(); + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + return; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + return; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // TODO: replace with std::generate, std::accumulate, or similar + const T addit = (iterations+1.) 
* (iterations/2.); + double abserr(0); + for (size_t i=0; i(ij)*(1.+iterations)+addit; + abserr += std::fabs(h_B[ji] - reference); + } + } + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const double epsilon(1.0e-8); + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + double avgtime = trans_time/iterations; + double bytes = (size_t)order * (size_t)order * sizeof(T); + std::cout << 8*sizeof(T) << "B " + << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + } +} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t order; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + // number of times to do the transpose + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // order of a the matrix + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + + try { + + if (1) { + 
cl::sycl::queue host(cl::sycl::host_selector{}); +#ifndef TRISYCL + auto device = host.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +#endif + + run(host, iterations, order); + run(host, iterations, order); + } + + // CPU requires spir64 target + if (1) { + cl::sycl::queue cpu(cl::sycl::cpu_selector{}); +#ifndef TRISYCL + auto device = cpu.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; +#endif + + run(cpu, iterations, order); + run(cpu, iterations, order); + } + + // NVIDIA GPU requires ptx64 target and does not work very well + if (0) { + cl::sycl::queue gpu(cl::sycl::gpu_selector{}); +#ifndef TRISYCL + auto device = gpu.get_device(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + auto platform = device.get_platform(); + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; +#endif + + run(gpu, iterations, order); + run(gpu, iterations, order); + } + } + catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + } + catch (std::exception e) { + std::cout << e.what() << std::endl; + } + + return 0; +} + + diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index b853ccf7b..a0fa97d00 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -52,6 +52,8 @@ #include "CL/sycl.hpp" #include "prk_util.h" +#define PREBUILD_KERNEL 1 + // need to declare kernel class as template // to prevent name mangling conflict below template class transpose; From 45b005ffbf7fffd76493301de8bec3197486ec68 Mon Sep 17 
00:00:00 2001 From: Jeff Hammond Date: Thu, 9 May 2019 14:55:19 -0700 Subject: [PATCH 196/245] cleanup --- Cxx11/nstream-sycl.cc | 2 -- Cxx11/transpose-sycl.cc | 2 -- 2 files changed, 4 deletions(-) diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 277f9435e..b0fd07be1 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -65,8 +65,6 @@ #include "CL/sycl.hpp" #include "prk_util.h" -#define PREBUILD_KERNEL 1 - #if 0 #include "prk_opencl.h" #define USE_OPENCL 1 diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index a0fa97d00..b853ccf7b 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -52,8 +52,6 @@ #include "CL/sycl.hpp" #include "prk_util.h" -#define PREBUILD_KERNEL 1 - // need to declare kernel class as template // to prevent name mangling conflict below template class transpose; From ddfdc9c94f15e1111222f073d7dda667cf4bbe1f Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 May 2019 14:56:07 -0700 Subject: [PATCH 197/245] remove renamed code --- Cxx11/nstream-sycl-explicit.cc | 337 ------------------------------- Cxx11/transpose-sycl-explicit.cc | 299 --------------------------- 2 files changed, 636 deletions(-) delete mode 100644 Cxx11/nstream-sycl-explicit.cc delete mode 100644 Cxx11/transpose-sycl-explicit.cc diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc deleted file mode 100644 index 2f177db35..000000000 --- a/Cxx11/nstream-sycl-explicit.cc +++ /dev/null @@ -1,337 +0,0 @@ -/// -/// Copyright (c) 2017, Intel Corporation -/// -/// Redistribution and use in source and binary forms, with or without -/// modification, are permitted provided that the following conditions -/// are met: -/// -/// * Redistributions of source code must retain the above copyright -/// notice, this list of conditions and the following disclaimer. 
-/// * Redistributions in binary form must reproduce the above -/// copyright notice, this list of conditions and the following -/// disclaimer in the documentation and/or other materials provided -/// with the distribution. -/// * Neither the name of Intel Corporation nor the names of its -/// contributors may be used to endorse or promote products -/// derived from this software without specific prior written -/// permission. -/// -/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -/// POSSIBILITY OF SUCH DAMAGE. - -////////////////////////////////////////////////////////////////////// -/// -/// NAME: nstream -/// -/// PURPOSE: To compute memory bandwidth when adding a vector of a given -/// number of double precision values to the scalar multiple of -/// another vector of the same length, and storing the result in -/// a third vector. -/// -/// USAGE: The program takes as input the number -/// of iterations to loop over the triad vectors, the length of the -/// vectors, and the offset between vectors -/// -/// <# iterations> -/// -/// The output consists of diagnostics to make sure the -/// algorithm worked, and of timing statistics. 
-/// -/// NOTES: Bandwidth is determined as the number of words read, plus the -/// number of words written, times the size of the words, divided -/// by the execution time. For a vector length of N, the total -/// number of words read and written is 4*N*sizeof(double). -/// -/// -/// HISTORY: This code is loosely based on the Stream benchmark by John -/// McCalpin, but does not follow all the Stream rules. Hence, -/// reported results should not be associated with Stream in -/// external publications -/// -/// Converted to C++11 by Jeff Hammond, November 2017. -/// -////////////////////////////////////////////////////////////////////// - -#include "CL/sycl.hpp" -#include "prk_util.h" - -#define PREBUILD_KERNEL 1 - -#if 0 -#include "prk_opencl.h" -#define USE_OPENCL 1 -#endif - -// need to declare kernel class as template -// to prevent name mangling conflict below -template class nstream; - -template -void run(cl::sycl::queue & q, int iterations, size_t length) -{ - ////////////////////////////////////////////////////////////////////// - // Allocate space and perform the computation - ////////////////////////////////////////////////////////////////////// - - double nstream_time(0); - - const T scalar(3); - - std::vector h_A(length,0); - - try { - -#if PREBUILD_KERNEL - cl::sycl::program kernel(q.get_context()); - kernel.build_with_kernel_type>(); -#endif - - cl::sycl::buffer d_A { cl::sycl::range<1>{length} }; - cl::sycl::buffer d_B { cl::sycl::range<1>{length} }; - cl::sycl::buffer d_C { cl::sycl::range<1>{length} }; - - q.submit([&](cl::sycl::handler& h) { - cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); - h.fill(A,(T)0); - }); - q.submit([&](cl::sycl::handler& h) { - cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); - h.fill(B,(T)2); - }); - q.submit([&](cl::sycl::handler& h) { - cl::sycl::accessor C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); - h.fill(C,(T)2); - }); - q.wait(); - - for 
(int iter = 0; iter<=iterations; ++iter) { - - if (iter==1) nstream_time = prk::wtime(); - - q.submit([&](cl::sycl::handler& h) { - - cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); - cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); - cl::sycl::accessor C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); - - h.parallel_for>( -#if PREBUILD_KERNEL - kernel.get_kernel>(), -#endif - cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { - A[i] += B[i] + scalar * C[i]; - }); - }); - q.wait(); - } - - // Stop timer before buffer+accessor destructors fire, - // since that will move data, and we do not time that - // for other device-oriented programming models. - nstream_time = prk::wtime() - nstream_time; - - q.submit([&](cl::sycl::handler& h) { - cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); - h.copy(A,h_A.data()); - }); - q.wait(); - } - catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; - return; - } - catch (std::exception e) { - std::cout << e.what() << std::endl; - return; - } - catch (const char * e) { - std::cout << e << std::endl; - return; - } - - ////////////////////////////////////////////////////////////////////// - /// Analyze and output results - ////////////////////////////////////////////////////////////////////// - - T ar(0); - T br(2); - T cr(2); - for (int i=0; i<=iterations; ++i) { - ar += br + scalar * cr; - } - - ar *= length; - - double asum(0); - for (size_t i=0; i epsilon) { - std::cout << "Failed Validation on output array\n" - << " Expected checksum: " << ar << "\n" - << " Observed checksum: " << asum << std::endl; - std::cout << "ERROR: solution did not validate" << std::endl; - } else { 
- std::cout << "Solution validates" << std::endl; - double avgtime = nstream_time/iterations; - double nbytes = 4.0 * length * sizeof(T); - std::cout << 8*sizeof(T) << "B " - << "Rate (MB/s): " << 1.e-6*nbytes/avgtime - << " Avg time (s): " << avgtime << std::endl; - } -} - -int main(int argc, char * argv[]) -{ - std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl; - - ////////////////////////////////////////////////////////////////////// - /// Read and test input parameters - ////////////////////////////////////////////////////////////////////// - - int iterations, offset; - size_t length; - try { - if (argc < 3) { - throw "Usage: <# iterations> "; - } - - iterations = std::atoi(argv[1]); - if (iterations < 1) { - throw "ERROR: iterations must be >= 1"; - } - - length = std::atol(argv[2]); - if (length <= 0) { - throw "ERROR: vector length must be positive"; - } - - offset = (argc>3) ? std::atoi(argv[3]) : 0; - if (length <= 0) { - throw "ERROR: offset must be nonnegative"; - } - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } - - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Vector length = " << length << std::endl; - std::cout << "Offset = " << offset << std::endl; - - ////////////////////////////////////////////////////////////////////// - /// Setup SYCL environment - ////////////////////////////////////////////////////////////////////// - -#ifdef USE_OPENCL - prk::opencl::listPlatforms(); -#endif - - try { - if (length<100000) { - cl::sycl::queue host(cl::sycl::host_selector{}); -#ifndef TRISYCL - auto device = host.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; -#endif - run(host, iterations, length); - run(host, iterations, length); - } else { - std::cout 
<< "Skipping host device since it is too slow for large problems" << std::endl; - } - - // CPU requires spir64 target - if (1) { - cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#ifndef TRISYCL - auto device = cpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); -#else - bool has_spir = true; // ? -#endif - if (has_spir) { - run(cpu, iterations, length); - run(cpu, iterations, length); - } - } - - // NVIDIA GPU requires ptx64 target and does not work very well - if (1) { - cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#ifndef TRISYCL - auto device = gpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); - bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); -#else - bool has_spir = true; // ? - bool has_fp64 = true; -#endif - if (!has_fp64) { - std::cout << "SYCL GPU device lacks FP64 support." << std::endl; - } - if (has_spir) { - run(gpu, iterations, length); - if (has_fp64) { - run(gpu, iterations, length); - } - } else { - std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; -#ifdef __COMPUTECPP__ - std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; - run(gpu, iterations, length); - if (has_fp64) { - run(gpu, iterations, length); - } -#endif - } - } - } - catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; - return 1; - } - catch (std::exception e) { - std::cout << e.what() << std::endl; - return 1; - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } - - return 0; -} - - diff --git a/Cxx11/transpose-sycl-explicit.cc b/Cxx11/transpose-sycl-explicit.cc deleted file mode 100644 index 8b09da622..000000000 --- a/Cxx11/transpose-sycl-explicit.cc +++ /dev/null @@ -1,299 +0,0 @@ -/// -/// Copyright (c) 2013, Intel Corporation -/// -/// Redistribution and use in source and binary forms, with or without -/// modification, are permitted provided that the following conditions -/// are met: -/// -/// * Redistributions of source code must retain the above copyright -/// notice, this list of conditions and the following disclaimer. -/// * Redistributions in binary form must reproduce the above -/// copyright notice, this list of conditions and the following -/// disclaimer in the documentation and/or other materials provided -/// with the distribution. -/// * Neither the name of Intel Corporation nor the names of its -/// contributors may be used to endorse or promote products -/// derived from this software without specific prior written -/// permission. -/// -/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE -/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -/// POSSIBILITY OF SUCH DAMAGE. - -////////////////////////////////////////////////////////////////////// -/// -/// NAME: transpose -/// -/// PURPOSE: This program measures the time for the transpose of a -/// column-major stored matrix into a row-major stored matrix. -/// -/// USAGE: Program input is the matrix order and the number of times to -/// repeat the operation: -/// -/// transpose <# iterations> -/// -/// The output consists of diagnostics to make sure the -/// transpose worked and timing statistics. -/// -/// HISTORY: Written by Rob Van der Wijngaart, February 2009. -/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
-/// -////////////////////////////////////////////////////////////////////// - -#include "CL/sycl.hpp" -#include "prk_util.h" - -#define PREBUILD_KERNEL 1 - -// need to declare kernel class as template -// to prevent name mangling conflict below -template class iota; -template class transpose; - -template -void run(cl::sycl::queue & q, int iterations, size_t order) -{ - ////////////////////////////////////////////////////////////////////// - /// Allocate space for the input and transpose matrix - ////////////////////////////////////////////////////////////////////// - - double trans_time(0); - - std::vector h_B(order*order,(T)0); - - try { - -#if PREBUILD_KERNEL - cl::sycl::program kernel(q.get_context()); - kernel.build_with_kernel_type>(); -#endif - -#if USE_2D_INDEXING - cl::sycl::buffer d_A( cl::sycl::range<2>{order,order} ); - cl::sycl::buffer d_B( cl::sycl::range<2>{order,order} ); -#else - cl::sycl::buffer d_A { cl::sycl::range<1>{order*order} }; - cl::sycl::buffer d_B { cl::sycl::range<1>{order*order} }; -#endif - - q.submit([&](cl::sycl::handler& h) { -#if USE_2D_INDEXING - cl::sycl::accessor A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); - h.parallel_for>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> i) { - A[i] = i[0] * order + i[1]; - }); -#else - cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); - h.parallel_for>(cl::sycl::range<1>{order*order}, [=] (cl::sycl::item<1> i) { - A[i] = i[0]; - }); -#endif - }); - q.submit([&](cl::sycl::handler& h) { -#if USE_2D_INDEXING - cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); -#else - cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); -#endif - h.fill(B,(T)0); - }); - q.wait(); - - for (int iter = 0; iter<=iterations; ++iter) { - - if (iter==1) trans_time = prk::wtime(); - - q.submit([&](cl::sycl::handler& h) { - -#if USE_2D_INDEXING - cl::sycl::accessor A(d_A, h, 
cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); - cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); -#else - cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); - cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); -#endif - - h.parallel_for>( -#if PREBUILD_KERNEL - kernel.get_kernel>(), -#endif - cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { -#if USE_2D_INDEXING - cl::sycl::id<2> ij{it[0],it[1]}; - cl::sycl::id<2> ji{it[1],it[0]}; - B[ij] += A[ji]; - A[ji] += (T)1; -#else - B[it[0] * order + it[1]] += A[it[1] * order + it[0]]; - A[it[1] * order + it[0]] += (T)1; -#endif - }); - }); - q.wait(); - } - - // Stop timer before buffer+accessor destructors fire, - // since that will move data, and we do not time that - // for other device-oriented programming models. - trans_time = prk::wtime() - trans_time; - - q.submit([&](cl::sycl::handler& h) { -#if USE_2D_INDEXING - cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); -#else - cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); -#endif - h.copy(B,h_B.data()); - }); - q.wait(); - } - catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - return; - } - catch (std::exception e) { - std::cout << e.what() << std::endl; - return; - } - - ////////////////////////////////////////////////////////////////////// - /// Analyze and output results - ////////////////////////////////////////////////////////////////////// - - // TODO: replace with std::generate, std::accumulate, or similar - const T addit = (iterations+1.) 
* (iterations/2.); - double abserr(0); - for (size_t i=0; i(ij)*(1.+iterations)+addit; - abserr += std::fabs(h_B[ji] - reference); - } - } - -#ifdef VERBOSE - std::cout << "Sum of absolute differences: " << abserr << std::endl; -#endif - - const double epsilon(1.0e-8); - if (abserr < epsilon) { - std::cout << "Solution validates" << std::endl; - double avgtime = trans_time/iterations; - double bytes = (size_t)order * (size_t)order * sizeof(T); - std::cout << 8*sizeof(T) << "B " - << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime - << " Avg time (s): " << avgtime << std::endl; - } else { - std::cout << "ERROR: Aggregate squared error " << abserr - << " exceeds threshold " << epsilon << std::endl; - } -} - -int main(int argc, char * argv[]) -{ - std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl; - - ////////////////////////////////////////////////////////////////////// - /// Read and test input parameters - ////////////////////////////////////////////////////////////////////// - - int iterations; - size_t order; - try { - if (argc < 3) { - throw "Usage: <# iterations> "; - } - - // number of times to do the transpose - iterations = std::atoi(argv[1]); - if (iterations < 1) { - throw "ERROR: iterations must be >= 1"; - } - - // order of a the matrix - order = std::atoi(argv[2]); - if (order <= 0) { - throw "ERROR: Matrix Order must be greater than 0"; - } else if (order > std::floor(std::sqrt(INT_MAX))) { - throw "ERROR: matrix dimension too large - overflow risk"; - } - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } - - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Matrix order = " << order << std::endl; - - ////////////////////////////////////////////////////////////////////// - /// Setup SYCL environment - ////////////////////////////////////////////////////////////////////// - - try { - - if (1) { - 
cl::sycl::queue host(cl::sycl::host_selector{}); -#ifndef TRISYCL - auto device = host.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - auto platform = device.get_platform(); - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; -#endif - - run(host, iterations, order); - run(host, iterations, order); - } - - // CPU requires spir64 target - if (1) { - cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#ifndef TRISYCL - auto device = cpu.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - auto platform = device.get_platform(); - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; -#endif - - run(cpu, iterations, order); - run(cpu, iterations, order); - } - - // NVIDIA GPU requires ptx64 target and does not work very well - if (0) { - cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#ifndef TRISYCL - auto device = gpu.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - auto platform = device.get_platform(); - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; -#endif - - run(gpu, iterations, order); - run(gpu, iterations, order); - } - } - catch (cl::sycl::exception e) { - std::cout << e.what() << std::endl; - } - catch (std::exception e) { - std::cout << e.what() << std::endl; - } - - return 0; -} - - From 781e1e55165f9dacdae54e8963bc65e7538708e6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 May 2019 15:08:12 -0700 Subject: [PATCH 198/245] homogenize exceptions and such --- Cxx11/nstream-explicit-sycl.cc | 6 ++- Cxx11/nstream-sycl.cc | 6 ++- Cxx11/stencil-sycl.cc | 85 ++++++++++++++++++++++++-------- Cxx11/transpose-explicit-sycl.cc | 80 ++++++++++++++++++++++++------ Cxx11/transpose-sycl.cc | 79 
++++++++++++++++++++++++----- 5 files changed, 205 insertions(+), 51 deletions(-) diff --git a/Cxx11/nstream-explicit-sycl.cc b/Cxx11/nstream-explicit-sycl.cc index 6367bf660..e51b78a28 100644 --- a/Cxx11/nstream-explicit-sycl.cc +++ b/Cxx11/nstream-explicit-sycl.cc @@ -70,8 +70,6 @@ #define USE_OPENCL 1 #endif -// need to declare kernel class as template -// to prevent name mangling conflict below template class nstream; template @@ -146,11 +144,13 @@ void run(cl::sycl::queue & q, int iterations, size_t length) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; std::cout << e.get_line_number() << std::endl; std::cout << e.get_description() << std::endl; std::cout << e.get_cl_error_message() << std::endl; std::cout << e.get_cl_code() << std::endl; +#endif return; } catch (std::exception e) { @@ -313,11 +313,13 @@ int main(int argc, char * argv[]) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; std::cout << e.get_line_number() << std::endl; std::cout << e.get_description() << std::endl; std::cout << e.get_cl_error_message() << std::endl; std::cout << e.get_cl_code() << std::endl; +#endif return 1; } catch (std::exception e) { diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index b0fd07be1..f7d42d732 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -70,8 +70,6 @@ #define USE_OPENCL 1 #endif -// need to declare kernel class as template -// to prevent name mangling conflict below template class nstream; template @@ -128,11 +126,13 @@ void run(cl::sycl::queue & q, int iterations, size_t length) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; std::cout << e.get_line_number() << std::endl; std::cout << e.get_description() << std::endl; std::cout << e.get_cl_error_message() << 
std::endl; std::cout << e.get_cl_code() << std::endl; +#endif return; } catch (std::exception e) { @@ -295,11 +295,13 @@ int main(int argc, char * argv[]) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; std::cout << e.get_line_number() << std::endl; std::cout << e.get_description() << std::endl; std::cout << e.get_cl_error_message() << std::endl; std::cout << e.get_cl_code() << std::endl; +#endif return 1; } catch (std::exception e) { diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 585fe62e9..1d9e34134 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -61,10 +61,14 @@ ////////////////////////////////////////////////////////////////////// #include "CL/sycl.hpp" - #include "prk_util.h" #include "stencil_sycl.hpp" +#if 0 +#include "prk_opencl.h" +#define USE_OPENCL 1 +#endif + template class init; template class add; @@ -186,23 +190,26 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; +#endif return; } catch (std::exception e) { std::cout << e.what() << std::endl; return; } - -#if 0 - for (auto i=0; i() << std::endl; auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; #endif @@ -329,14 +340,17 @@ int main(int argc, char * argv[]) cl::sycl::queue cpu(cl::sycl::cpu_selector{}); #ifndef TRISYCL auto device = cpu.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); + std::cout << "SYCL Device: " << 
device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); +#else + bool has_spir = true; // ? #endif - - run(cpu, iterations, n, tile_size, star, radius); - run(cpu, iterations, n, tile_size, star, radius); + if (has_spir) { + run(cpu, iterations, n, tile_size, star, radius); + run(cpu, iterations, n, tile_size, star, radius); + } } // NVIDIA GPU requires ptx64 target and does not work very well @@ -344,21 +358,52 @@ int main(int argc, char * argv[]) cl::sycl::queue gpu(cl::sycl::gpu_selector{}); #ifndef TRISYCL auto device = gpu.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); + bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); +#else + bool has_spir = true; // ? + bool has_fp64 = true; +#endif + if (!has_fp64) { + std::cout << "SYCL GPU device lacks FP64 support." << std::endl; + } + if (has_spir) { + run(gpu, iterations, n, tile_size, star, radius); + if (has_fp64) { + run(gpu, iterations, n, tile_size, star, radius); + } + } else { + std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; +#ifdef __COMPUTECPP__ + std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; + run(gpu, iterations, n, tile_size, star, radius); + if (has_fp64) { + run(gpu, iterations, n, tile_size, star, radius); + } #endif - - run(gpu, iterations, n, tile_size, star, radius); - run(gpu, iterations, n, tile_size, star, radius); } } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; +#endif + return 1; } catch (std::exception e) { std::cout << e.what() << std::endl; + return 1; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; } return 0; diff --git a/Cxx11/transpose-explicit-sycl.cc b/Cxx11/transpose-explicit-sycl.cc index c34497d97..cb4e31c05 100644 --- a/Cxx11/transpose-explicit-sycl.cc +++ b/Cxx11/transpose-explicit-sycl.cc @@ -52,8 +52,11 @@ #include "CL/sycl.hpp" #include "prk_util.h" -// need to declare kernel class as template -// to prevent name mangling conflict below +#if 0 +#include "prk_opencl.h" +#define USE_OPENCL 1 +#endif + template class iota; template class transpose; @@ -61,7 +64,7 @@ template void run(cl::sycl::queue & q, int iterations, size_t order) { ////////////////////////////////////////////////////////////////////// - /// Allocate space for the input and transpose matrix + // Allocate space for the input and transpose matrix ////////////////////////////////////////////////////////////////////// double trans_time(0); @@ -156,12 +159,23 @@ void run(cl::sycl::queue & q, int iterations, size_t order) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << 
e.get_cl_code() << std::endl; +#endif return; } catch (std::exception e) { std::cout << e.what() << std::endl; return; } + catch (const char * e) { + std::cout << e << std::endl; + return; + } ////////////////////////////////////////////////////////////////////// /// Analyze and output results @@ -239,14 +253,18 @@ int main(int argc, char * argv[]) /// Setup SYCL environment ////////////////////////////////////////////////////////////////////// +#ifdef USE_OPENCL + prk::opencl::listPlatforms(); +#endif + try { if (1) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL auto device = host.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; #endif @@ -259,14 +277,17 @@ int main(int argc, char * argv[]) cl::sycl::queue cpu(cl::sycl::cpu_selector{}); #ifndef TRISYCL auto device = cpu.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); +#else + bool has_spir = true; // ? 
#endif - - run(cpu, iterations, order); - run(cpu, iterations, order); + if (has_spir) { + run(cpu, iterations, order); + run(cpu, iterations, order); + } } // NVIDIA GPU requires ptx64 target and does not work very well @@ -274,21 +295,52 @@ int main(int argc, char * argv[]) cl::sycl::queue gpu(cl::sycl::gpu_selector{}); #ifndef TRISYCL auto device = gpu.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); + bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); +#else + bool has_spir = true; // ? + bool has_fp64 = true; +#endif + if (!has_fp64) { + std::cout << "SYCL GPU device lacks FP64 support." << std::endl; + } + if (has_spir) { + run(gpu, iterations, order); + if (has_fp64) { + run(gpu, iterations, order); + } + } else { + std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; +#ifdef __COMPUTECPP__ + std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; + run(gpu, iterations, order); + if (has_fp64) { + run(gpu, iterations, order); + } #endif - - run(gpu, iterations, order); - run(gpu, iterations, order); } } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; +#endif + return 1; } catch (std::exception e) { std::cout << e.what() << std::endl; + return 1; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; } return 0; diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index b853ccf7b..5ed7b9805 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -52,15 +52,18 @@ #include "CL/sycl.hpp" #include "prk_util.h" -// need to declare kernel class as template -// to prevent name mangling conflict below +#if 0 +#include "prk_opencl.h" +#define USE_OPENCL 1 +#endif + template class transpose; template void run(cl::sycl::queue & q, int iterations, size_t order) { ////////////////////////////////////////////////////////////////////// - /// Allocate space for the input and transpose matrix + // Allocate space for the input and transpose matrix ////////////////////////////////////////////////////////////////////// double trans_time(0); @@ -122,12 +125,23 @@ void run(cl::sycl::queue & q, int iterations, size_t order) } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; +#endif return; } catch (std::exception e) { std::cout << e.what() << std::endl; return; } + catch (const char * e) { + std::cout << e << 
std::endl; + return; + } ////////////////////////////////////////////////////////////////////// /// Analyze and output results @@ -205,14 +219,18 @@ int main(int argc, char * argv[]) /// Setup SYCL environment ////////////////////////////////////////////////////////////////////// +#ifdef USE_OPENCL + prk::opencl::listPlatforms(); +#endif + try { if (1) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL auto device = host.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; #endif @@ -225,14 +243,17 @@ int main(int argc, char * argv[]) cl::sycl::queue cpu(cl::sycl::cpu_selector{}); #ifndef TRISYCL auto device = cpu.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); +#else + bool has_spir = true; // ? 
#endif - - run(cpu, iterations, order); - run(cpu, iterations, order); + if (has_spir) { + run(cpu, iterations, order); + run(cpu, iterations, order); + } } // NVIDIA GPU requires ptx64 target and does not work very well @@ -240,20 +261,52 @@ int main(int argc, char * argv[]) cl::sycl::queue gpu(cl::sycl::gpu_selector{}); #ifndef TRISYCL auto device = gpu.get_device(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - //std::cout << "cl_khr_spir: " << device.has_extension(cl::sycl::string_class("cl_khr_spir")) << std::endl; + bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); + bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); +#else + bool has_spir = true; // ? + bool has_fp64 = true; +#endif + if (!has_fp64) { + std::cout << "SYCL GPU device lacks FP64 support." << std::endl; + } + if (has_spir) { + run(gpu, iterations, order); + if (has_fp64) { + run(gpu, iterations, order); + } + } else { + std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; +#ifdef __COMPUTECPP__ + std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; + run(gpu, iterations, order); + if (has_fp64) { + run(gpu, iterations, order); + } #endif - run(gpu, iterations, order); - run(gpu, iterations, order); } } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; +#endif + return 1; } catch (std::exception e) { std::cout << e.what() << std::endl; + return 1; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; } return 0; From f9fd8a4a8dccac8c39cbf6f41d9f01cec34a55c6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 9 May 2019 15:13:46 -0700 Subject: [PATCH 199/245] fix syntax errors --- Cxx11/stencil-sycl.cc | 1 + Cxx11/transpose-explicit-sycl.cc | 3 +-- Cxx11/transpose-sycl.cc | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 1d9e34134..d9fa54ff6 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -384,6 +384,7 @@ int main(int argc, char * argv[]) run(gpu, iterations, n, tile_size, star, radius); } #endif + } } } catch (cl::sycl::exception e) { diff --git a/Cxx11/transpose-explicit-sycl.cc b/Cxx11/transpose-explicit-sycl.cc index cb4e31c05..cedeafd68 100644 --- a/Cxx11/transpose-explicit-sycl.cc +++ b/Cxx11/transpose-explicit-sycl.cc @@ -258,7 +258,6 @@ int main(int argc, char * argv[]) #endif try { - if (1) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL @@ -267,7 +266,6 @@ int main(int argc, char * argv[]) std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; #endif - run(host, iterations, order); run(host, iterations, order); } @@ -321,6 +319,7 @@ int main(int argc, char * argv[]) run(gpu, iterations, order); } #endif 
+ } } } catch (cl::sycl::exception e) { diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index 5ed7b9805..761fa136d 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -224,7 +224,6 @@ int main(int argc, char * argv[]) #endif try { - if (1) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL @@ -233,7 +232,6 @@ int main(int argc, char * argv[]) std::cout << "SYCL Device: " << device.get_info() << std::endl; std::cout << "SYCL Platform: " << platform.get_info() << std::endl; #endif - run(host, iterations, order); run(host, iterations, order); } @@ -287,6 +285,7 @@ int main(int argc, char * argv[]) run(gpu, iterations, order); } #endif + } } } catch (cl::sycl::exception e) { From 24fceb29510bdb252dc157f8c0e33c27df5d97c4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 13 May 2019 09:40:37 -0400 Subject: [PATCH 200/245] build C1z p2p-hyperplane-openmp --- travis/build-run-prk.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index 1f40f02a8..dcb85da2a 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -173,7 +173,7 @@ case "$PRK_TARGET" in g*) # Host echo "OPENMPFLAG=-fopenmp" >> common/make.defs - ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp + ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp p2p-hyperplane-openmp stencil-openmp transpose-openmp $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 $PRK_TARGET_PATH/p2p-innerloop-openmp 10 1024 $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 From fdbcd4b3ff1c78cec3b94f49b852cdd548023ffb Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 13 May 2019 09:43:31 -0400 Subject: [PATCH 201/245] fix or disable all the RAJA stuff (#399) - use range everywhere we needed to - remove some unused code that would not work anymore anyways - simplify some nested loop cases that should be revisited later --- Cxx11/Makefile | 
4 +-- Cxx11/generate-cxx-stencil.py | 9 +++---- Cxx11/nstream-raja.cc | 3 ++- Cxx11/nstream-vector-raja.cc | 11 +++++--- Cxx11/p2p-raja.cc | 12 ++++++--- Cxx11/p2p-vector-raja.cc | 8 ++++-- Cxx11/stencil-raja.cc | 13 +++------ Cxx11/stencil-vector-raja.cc | 40 +++++++--------------------- Cxx11/stencil_raja.hpp | 50 +++++++++++++++++++++-------------- 9 files changed, 73 insertions(+), 77 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index f96e63744..1bb8d88ce 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -118,8 +118,8 @@ rangefor: stencil-vector-rangefor transpose-vector-rangefor nstream-vector-range kokkos: stencil-kokkos transpose-kokkos nstream-kokkos -raja: p2p-vector-raja stencil-vector-raja transpose-vector-raja nstream-vector-raja \ - p2p-raja stencil-raja transpose-raja nstream-raja +raja: p2p-vector-raja stencil-vector-raja nstream-vector-raja \ + p2p-raja transpose-raja nstream-raja stencil-raja # transpose-vector-raja cuda: stencil-cuda transpose-cuda nstream-cuda diff --git a/Cxx11/generate-cxx-stencil.py b/Cxx11/generate-cxx-stencil.py index 2f557fe3f..e4f187007 100755 --- a/Cxx11/generate-cxx-stencil.py +++ b/Cxx11/generate-cxx-stencil.py @@ -111,12 +111,9 @@ def codegen(src,pattern,stencil_size,radius,W,model): src.write(' });\n') elif (model=='raja'): src.write('void '+pattern+str(radius)+'(const int n, const int t, std::vector & in, std::vector & out) {\n') - #src.write(' RAJA::forallN>>\n') - #src.write(' ( RAJA::RangeSegment('+str(radius)+',n-'+str(radius)+'),' - # 'RAJA::RangeSegment('+str(radius)+',n-'+str(radius)+'),\n') - #src.write(' [&](RAJA::Index_type i, RAJA::Index_type j) {\n') - src.write(' RAJA::forall(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type i) {\n') - src.write(' RAJA::forall(RAJA::Index_type('+str(radius)+'), RAJA::Index_type(n-'+str(radius)+'), [&](RAJA::Index_type j) {\n') + src.write(' RAJA::RangeSegment inside('+str(radius)+',n-'+str(radius)+');\n') + 
src.write(' RAJA::forall(inside, [&](RAJA::Index_type i) {\n') + src.write(' RAJA::forall(inside, [&](RAJA::Index_type j) {\n') bodygen(src,pattern,stencil_size,radius,W,model) src.write(' });\n') src.write(' });\n') diff --git a/Cxx11/nstream-raja.cc b/Cxx11/nstream-raja.cc index ef7b6c08e..dcba4cbf2 100644 --- a/Cxx11/nstream-raja.cc +++ b/Cxx11/nstream-raja.cc @@ -166,7 +166,8 @@ int main(int argc, char * argv[]) ar *= length; RAJA::ReduceSum reduced_asum(0.0); - RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) { + //RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) { + RAJA::forall(range, [=](RAJA::Index_type i) { reduced_asum += std::fabs(A(i)); }); double asum(reduced_asum); diff --git a/Cxx11/nstream-vector-raja.cc b/Cxx11/nstream-vector-raja.cc index ee3986e50..8db807cc4 100644 --- a/Cxx11/nstream-vector-raja.cc +++ b/Cxx11/nstream-vector-raja.cc @@ -124,10 +124,13 @@ int main(int argc, char * argv[]) std::vector B(length); std::vector C(length); + RAJA::RangeSegment range(0, length); + double scalar(3); { - RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) { + //RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) { + RAJA::forall(range, [&](RAJA::Index_type i) { A[i] = 0.0; B[i] = 2.0; C[i] = 2.0; @@ -137,7 +140,8 @@ int main(int argc, char * argv[]) if (iter==1) nstream_time = prk::wtime(); - RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) { + //RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) { + RAJA::forall(range, [&](RAJA::Index_type i) { A[i] += B[i] + scalar * C[i]; }); } @@ -158,7 +162,8 @@ int main(int argc, char * argv[]) ar *= length; RAJA::ReduceSum reduced_asum(0.0); - RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) { + //RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(length), [&](RAJA::Index_type i) { 
+ RAJA::forall(range, [=](RAJA::Index_type i) { reduced_asum += std::fabs(A[i]); }); double asum(reduced_asum); diff --git a/Cxx11/p2p-raja.cc b/Cxx11/p2p-raja.cc index 202d9b6b6..2900d3fbc 100644 --- a/Cxx11/p2p-raja.cc +++ b/Cxx11/p2p-raja.cc @@ -121,10 +121,14 @@ int main(int argc, char* argv[]) double * RESTRICT Amem = new double[m*n]; matrix grid(Amem, m, n); - for (int i=0; i(range, [=](RAJA::Index_type i) { + //for (int i=0; i(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) { + //RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) { + RAJA::RangeSegment range(1, j+1); + RAJA::forall(range, [&](RAJA::Index_type i) { auto x = i; auto y = j-i+1; grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; }); } for (auto j=n-2; j>=1; j--) { - RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) { + //RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(j+1), [&](RAJA::Index_type i) { + RAJA::RangeSegment range(1, j+1); + RAJA::forall(range, [&](RAJA::Index_type i) { auto x = n+i-j-1; auto y = n-i; grid[x*n+y] = grid[(x-1)*n+y] + grid[x*n+(y-1)] - grid[(x-1)*n+(y-1)]; diff --git a/Cxx11/stencil-raja.cc b/Cxx11/stencil-raja.cc index 5fa333bce..e52638c8b 100644 --- a/Cxx11/stencil-raja.cc +++ b/Cxx11/stencil-raja.cc @@ -207,17 +207,12 @@ int main(int argc, char* argv[]) size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); // compute L1 norm in parallel -#if 0 - // This leads to incorrect computation of the norm. 
- RAJA::ReduceSum reduced_norm(0.0); - RAJA::forallN>> -#else + RAJA::RangeSegment inside(radius,n-radius); RAJA::ReduceSum reduced_norm(0.0); - RAJA::forallN>> -#endif - ( RAJA::RangeSegment(radius,n-radius), RAJA::RangeSegment(radius,n-radius), - [&](RAJA::Index_type i, RAJA::Index_type j) { + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { reduced_norm += std::fabs(out(i,j)); + }); }); double norm = reduced_norm / active_points; diff --git a/Cxx11/stencil-vector-raja.cc b/Cxx11/stencil-vector-raja.cc index 822a45c00..ce4a7278a 100644 --- a/Cxx11/stencil-vector-raja.cc +++ b/Cxx11/stencil-vector-raja.cc @@ -168,21 +168,14 @@ int main(int argc, char* argv[]) std::vector in(n*n); std::vector out(n*n); -#if 0 - RAJA::forallN>> - ( RAJA::RangeSegment(0, n), RAJA::RangeSegment(0, n), - [&](RAJA::Index_type i, RAJA::Index_type j) { - in[i*n+j] = static_cast(i+j); - out[i*n+j] = 0.0; - }); -#else - RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(n), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(n), [&](RAJA::Index_type j) { + RAJA::RangeSegment range(0, n); + + RAJA::forall(range, [&](RAJA::Index_type i) { + RAJA::forall(range, [&](RAJA::Index_type j) { in[i*n+j] = static_cast(i+j); out[i*n+j] = 0.0; }); }); -#endif for (auto iter = 0; iter<=iterations; iter++) { @@ -190,19 +183,11 @@ int main(int argc, char* argv[]) // Apply the stencil operator stencil(n, tile_size, in, out); // Add constant to solution to force refresh of neighbor data, if any -#if 0 - RAJA::forallN>> - ( RAJA::RangeSegment(0, n), RAJA::RangeSegment(0, n), - [&](RAJA::Index_type i, RAJA::Index_type j) { - in[i*n+j] += 1.0; - }); -#else - RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(n), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(0), RAJA::Index_type(n), [&](RAJA::Index_type j) { + RAJA::forall(range, [&](RAJA::Index_type i) { + RAJA::forall(range, [&](RAJA::Index_type j) { in[i*n+j] += 1.0; }); }); 
-#endif } stencil_time = prk::wtime() - stencil_time; @@ -215,17 +200,12 @@ int main(int argc, char* argv[]) size_t active_points = static_cast(n-2*radius)*static_cast(n-2*radius); // compute L1 norm in parallel -#if 0 - // This leads to incorrect computation of the norm. - RAJA::ReduceSum reduced_norm(0.0); - RAJA::forallN>> -#else + RAJA::RangeSegment inside(radius,n-radius); RAJA::ReduceSum reduced_norm(0.0); - RAJA::forallN>> -#endif - ( RAJA::RangeSegment(radius,n-radius), RAJA::RangeSegment(radius,n-radius), - [&](RAJA::Index_type i, RAJA::Index_type j) { + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { reduced_norm += std::fabs(out[i*n+j]); + }); }); double norm = reduced_norm / active_points; diff --git a/Cxx11/stencil_raja.hpp b/Cxx11/stencil_raja.hpp index ebd2d28b1..d6d912a1d 100644 --- a/Cxx11/stencil_raja.hpp +++ b/Cxx11/stencil_raja.hpp @@ -1,6 +1,7 @@ void star1(const int n, const int t, std::vector & in, std::vector & out) { - RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type j) { + RAJA::RangeSegment inside(1,n-1); + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { out[i*n+j] += +in[(i)*n+(j-1)] * -0.5 +in[(i-1)*n+(j)] * -0.5 +in[(i+1)*n+(j)] * 0.5 @@ -10,8 +11,9 @@ void star1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - RAJA::forall(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type j) { + RAJA::RangeSegment inside(2,n-2); + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { out[i*n+j] += +in[(i)*n+(j-2)] * -0.125 +in[(i)*n+(j-1)] * -0.25 +in[(i-2)*n+(j)] * -0.125 @@ -25,8 +27,9 @@ void star2(const int n, const int t, std::vector & in, std::vector & in, 
std::vector & out) { - RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) { + RAJA::RangeSegment inside(3,n-3); + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { out[i*n+j] += +in[(i)*n+(j-3)] * -0.0555555555556 +in[(i)*n+(j-2)] * -0.0833333333333 +in[(i)*n+(j-1)] * -0.166666666667 @@ -44,8 +47,9 @@ void star3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - RAJA::forall(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) { + RAJA::RangeSegment inside(4,n-4); + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { out[i*n+j] += +in[(i)*n+(j-4)] * -0.03125 +in[(i)*n+(j-3)] * -0.0416666666667 +in[(i)*n+(j-2)] * -0.0625 @@ -67,8 +71,9 @@ void star4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - RAJA::forall(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) { + RAJA::RangeSegment inside(5,n-5); + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { out[i*n+j] += +in[(i)*n+(j-5)] * -0.02 +in[(i)*n+(j-4)] * -0.025 +in[(i)*n+(j-3)] * -0.0333333333333 @@ -94,8 +99,9 @@ void star5(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(1), RAJA::Index_type(n-1), [&](RAJA::Index_type j) { + RAJA::RangeSegment inside(1,n-1); + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { out[i*n+j] += +in[(i-1)*n+(j-1)] * -0.25 +in[(i)*n+(j-1)] * -0.25 +in[(i-1)*n+(j)] * -0.25 @@ 
-108,8 +114,9 @@ void grid1(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - RAJA::forall(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(2), RAJA::Index_type(n-2), [&](RAJA::Index_type j) { + RAJA::RangeSegment inside(2,n-2); + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { out[i*n+j] += +in[(i-2)*n+(j-2)] * -0.0625 +in[(i-1)*n+(j-2)] * -0.0208333333333 +in[(i)*n+(j-2)] * -0.0208333333333 @@ -136,8 +143,9 @@ void grid2(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(3), RAJA::Index_type(n-3), [&](RAJA::Index_type j) { + RAJA::RangeSegment inside(3,n-3); + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { out[i*n+j] += +in[(i-3)*n+(j-3)] * -0.0277777777778 +in[(i-2)*n+(j-3)] * -0.00555555555556 +in[(i-1)*n+(j-3)] * -0.00555555555556 @@ -186,8 +194,9 @@ void grid3(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - RAJA::forall(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(4), RAJA::Index_type(n-4), [&](RAJA::Index_type j) { + RAJA::RangeSegment inside(4,n-4); + RAJA::forall(inside, [&](RAJA::Index_type i) { + RAJA::forall(inside, [&](RAJA::Index_type j) { out[i*n+j] += +in[(i-4)*n+(j-4)] * -0.015625 +in[(i-3)*n+(j-4)] * -0.00223214285714 +in[(i-2)*n+(j-4)] * -0.00223214285714 @@ -266,8 +275,9 @@ void grid4(const int n, const int t, std::vector & in, std::vector & in, std::vector & out) { - RAJA::forall(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type i) { - RAJA::forall(RAJA::Index_type(5), RAJA::Index_type(n-5), [&](RAJA::Index_type j) { + RAJA::RangeSegment inside(5,n-5); + RAJA::forall(inside, [&](RAJA::Index_type i) { + 
RAJA::forall(inside, [&](RAJA::Index_type j) { out[i*n+j] += +in[(i-5)*n+(j-5)] * -0.01 +in[(i-4)*n+(j-5)] * -0.00111111111111 +in[(i-3)*n+(j-5)] * -0.00111111111111 From 5f4b5d68d288de14eefa943eebd6311f9136153b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 12 May 2019 18:36:04 -0700 Subject: [PATCH 202/245] switch to GCC 9 w/ its PSTL --- common/make.defs.gcc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/common/make.defs.gcc b/common/make.defs.gcc index b0566487a..f4552bd87 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -4,7 +4,7 @@ # # Base compilers and language options # -VERSION=-8 +VERSION=-9 # C99 is required in some implementations. CC=gcc${VERSION} -std=c11 -pthread #EXTRA_CLIBS=-lrt @@ -74,15 +74,16 @@ METALFLAG=-framework MetalPerformanceShaders # # TBB # -TBBDIR=/usr/local/Cellar/tbb/2019_U3_1 -TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +TBBDIR=/usr/local/Cellar/tbb/2019_U5_1 +TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb # # Parallel STL, Boost, etc. # -BOOSTFLAG=-I/usr/local/Cellar/boost/1.68.0_1/include -#RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} -RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include -PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} +BOOSTFLAG=-I/usr/local/Cellar/boost/1.69.0_2/include +RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} +#PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} KOKKOSDIR=/opt/kokkos/gcc KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} RAJADIR=/opt/raja/gcc @@ -95,7 +96,7 @@ THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
SYCLDIR=./triSYCL -SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG} +SYCLCXX=${CXX} -O3 -Wall -std=c++17 ${OPENMPFLAG} SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL # ProGTX # https://github.com/ProGTX/sycl-gtx From 30f5a22395cdb819eeb28ea949de4600818d22a3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 12 May 2019 18:36:41 -0700 Subject: [PATCH 203/245] move valarray header to the only place it is needed --- Cxx11/nstream-valarray.cc | 1 + Cxx11/prk_util.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cxx11/nstream-valarray.cc b/Cxx11/nstream-valarray.cc index 656f69f9c..bcc6361ac 100644 --- a/Cxx11/nstream-valarray.cc +++ b/Cxx11/nstream-valarray.cc @@ -63,6 +63,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include int main(int argc, char * argv[]) { diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index c969af7fd..d09c16c3b 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -54,7 +54,7 @@ #include #include #include -#include +//#include #include #include From fc349a1d110474b9f9092076383c7438a472619e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 12 May 2019 18:44:53 -0700 Subject: [PATCH 204/245] PSTL in GCC9 - support it --- Cxx11/nstream-vector-pstl.cc | 6 +++--- Cxx11/prk_pstl.h | 14 ++++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/Cxx11/nstream-vector-pstl.cc b/Cxx11/nstream-vector-pstl.cc index 21b5e0b45..b69a8171c 100644 --- a/Cxx11/nstream-vector-pstl.cc +++ b/Cxx11/nstream-vector-pstl.cc @@ -126,7 +126,7 @@ int main(int argc, char * argv[]) double scalar(3); { -#if defined(USE_PSTL) && defined(USE_INTEL_PSTL) +#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) ) std::for_each( exec::par_unseq, std::begin(range), std::end(range), [&] (size_t i) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) @@ 
-140,11 +140,11 @@ int main(int argc, char * argv[]) C[i] = 2; }); - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if (iter==1) nstream_time = prk::wtime(); -#if defined(USE_PSTL) && defined(USE_INTEL_PSTL) +#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) ) std::for_each( exec::par_unseq, std::begin(range), std::end(range), [&] (size_t i) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) diff --git a/Cxx11/prk_pstl.h b/Cxx11/prk_pstl.h index 11e0368bb..97efca244 100644 --- a/Cxx11/prk_pstl.h +++ b/Cxx11/prk_pstl.h @@ -36,18 +36,24 @@ #define USE_INTEL_PSTL #endif -#ifdef USE_PSTL -# ifdef USE_INTEL_PSTL +#if defined(USE_PSTL) +# if defined(__GNUC__) && (__GNUC__ >= 9) +# include +# include +# include +//# include +namespace exec = __pstl::execution; +# elif defined(USE_INTEL_PSTL) # include # include # include -# include +//# include +namespace exec = std::execution; # elif defined(__GNUC__) && defined(__GNUC_MINOR__) && \ ( (__GNUC__ >= 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) # include # include # endif -namespace exec = std::execution; #endif #endif /* PRK_PSTL_H */ From 7dee91f52205b27e70864b605b2d46209b0fcca4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 12 May 2019 18:50:23 -0700 Subject: [PATCH 205/245] PSTL in GCC9 - support it --- Cxx11/transpose-vector-pstl.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Cxx11/transpose-vector-pstl.cc b/Cxx11/transpose-vector-pstl.cc index ac7aefb8a..4b5839647 100644 --- a/Cxx11/transpose-vector-pstl.cc +++ b/Cxx11/transpose-vector-pstl.cc @@ -108,14 +108,14 @@ int main(int argc, char * argv[]) auto range = prk::range(0,order); - auto trans_time = 0.0; + double trans_time(0); - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if (iter==1) 
trans_time = prk::wtime(); // transpose -#if defined(USE_PSTL) && defined(USE_INTEL_PSTL) +#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) ) std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) { std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ From d152d043a6b6ae567b98c018baf58e381813d128 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 12 May 2019 18:56:24 -0700 Subject: [PATCH 206/245] add GCC-9 to supported PSTL configs remove non-transform dead code in stencil stop using auto for iter and timer - this is pointless --- Cxx11/p2p-hyperplane-vector-pstl.cc | 6 +++--- Cxx11/stencil-vector-pstl.cc | 27 +++++---------------------- 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/Cxx11/p2p-hyperplane-vector-pstl.cc b/Cxx11/p2p-hyperplane-vector-pstl.cc index c64757e6d..9b02a65a0 100644 --- a/Cxx11/p2p-hyperplane-vector-pstl.cc +++ b/Cxx11/p2p-hyperplane-vector-pstl.cc @@ -119,7 +119,7 @@ int main(int argc, char* argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - auto pipeline_time = 0.0; // silence compiler warning + double pipeline_time(0); std::vector grid(n*n,0.0); @@ -129,7 +129,7 @@ int main(int argc, char* argv[]) grid[j*n+0] = static_cast(j); } - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if (iter==1) pipeline_time = prk::wtime(); @@ -156,7 +156,7 @@ int main(int argc, char* argv[]) const auto begin = std::max(2,i-(nb+1)+2); const auto end = std::min(i,nb+1)+1; auto range = prk::range(begin,end); -#if defined(USE_PSTL) && defined(USE_INTEL_PSTL) +#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) ) std::for_each( exec::par, std::begin(range), std::end(range), [&] (auto j) { #elif defined(USE_PSTL) && 
defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) diff --git a/Cxx11/stencil-vector-pstl.cc b/Cxx11/stencil-vector-pstl.cc index ca3c83ec0..cae97761d 100644 --- a/Cxx11/stencil-vector-pstl.cc +++ b/Cxx11/stencil-vector-pstl.cc @@ -63,7 +63,7 @@ #include "prk_util.h" #include "prk_pstl.h" // See ParallelSTL.md for important information. -#if defined(USE_PSTL) && defined(USE_INTEL_PSTL) +#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) ) #include "stencil_pstl.hpp" #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) @@ -175,14 +175,14 @@ int main(int argc, char* argv[]) // Allocate space and perform the computation ////////////////////////////////////////////////////////////////////// - auto stencil_time = 0.0; + double stencil_time(0); std::vector in(n*n); std::vector out(n*n); // initialize the input and output arrays auto range = prk::range(0,n); -#if defined(USE_PSTL) && defined(USE_INTEL_PSTL) +#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) ) std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) { std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) { #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ @@ -198,35 +198,18 @@ int main(int argc, char* argv[]) }); }); - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if (iter==1) stencil_time = prk::wtime(); // Apply the stencil operator stencil(n, tile_size, in, out); // Add constant to solution to force refresh of neighbor data, if any -#if 0 -#if defined(USE_PSTL) && defined(USE_INTEL_PSTL) - std::for_each( exec::par, std::begin(range), std::end(range), [&] (int i) { - std::for_each( exec::unseq, std::begin(range), std::end(range), [&] (int j) { -#elif 
defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ - && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) - __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int i) { - __gnu_parallel::for_each( std::begin(range), std::end(range), [&] (int j) { -#else - std::for_each( std::begin(range), std::end(range), [&] (int i) { - std::for_each( std::begin(range), std::end(range), [&] (int j) { -#endif - in[i*n+j] += 1.0; - }); - }); -#else -#if defined(USE_PSTL) && defined(USE_INTEL_PSTL) +#if defined(USE_PSTL) && ( defined(USE_INTEL_PSTL) || ( defined(__GNUC__) && (__GNUC__ >= 9) ) ) std::transform( exec::par_unseq, in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; }); #elif defined(USE_PSTL) && defined(__GNUC__) && defined(__GNUC_MINOR__) \ && ( (__GNUC__ == 8) || (__GNUC__ == 7) && (__GNUC_MINOR__ >= 2) ) __gnu_parallel::transform( in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; }); #else std::transform( in.begin(), in.end(), in.begin(), [](double c) { return c+=1.0; }); -#endif #endif } From 54dbbfbe358d61d20a71854846826b353760dd83 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 13 May 2019 09:39:04 -0400 Subject: [PATCH 207/245] add valarray include to transpose instance --- Cxx11/transpose-valarray.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/Cxx11/transpose-valarray.cc b/Cxx11/transpose-valarray.cc index 029f893be..55420723b 100644 --- a/Cxx11/transpose-valarray.cc +++ b/Cxx11/transpose-valarray.cc @@ -53,6 +53,7 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include int main(int argc, char * argv[]) { From 7bfa597b76a65be9dad9550f1451543a3fbcf2a4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 14 May 2019 10:22:58 -0400 Subject: [PATCH 208/245] reuse OpenMP for sequential versions of nstream and transpose --- C1z/Makefile | 9 +- C1z/{nstream.c => nstream-openmp.c} | 0 C1z/transpose.c | 176 ---------------------------- 3 
files changed, 6 insertions(+), 179 deletions(-) rename C1z/{nstream.c => nstream-openmp.c} (100%) delete mode 100644 C1z/transpose.c diff --git a/C1z/Makefile b/C1z/Makefile index 535ed9eaa..399751925 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -71,6 +71,12 @@ p2p-innerloop: p2p-innerloop-openmp.c prk_util.h p2p-hyperplane: p2p-hyperplane-openmp.c prk_util.h $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ +nstream: nstream-openmp.c prk_util.h + $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ + +transpose: transpose-openmp.c prk_util.h + $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ + %-mpi: %-mpi.c prk_util.h $(MPICC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ @@ -92,9 +98,6 @@ p2p-hyperplane: p2p-hyperplane-openmp.c prk_util.h %-taskloop: %-taskloop.c prk_util.h $(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ -nstream-openmp: nstream.c prk_util.h - $(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ - %-openmp: %-openmp.c prk_util.h $(CC) $(CFLAGS) $< $(OMPFLAGS) $(EXTRA_CLIBS) -o $@ diff --git a/C1z/nstream.c b/C1z/nstream-openmp.c similarity index 100% rename from C1z/nstream.c rename to C1z/nstream-openmp.c diff --git a/C1z/transpose.c b/C1z/transpose.c deleted file mode 100644 index 797c2395f..000000000 --- a/C1z/transpose.c +++ /dev/null @@ -1,176 +0,0 @@ -/// -/// Copyright (c) 2013, Intel Corporation -/// -/// Redistribution and use in source and binary forms, with or without -/// modification, are permitted provided that the following conditions -/// are met: -/// -/// * Redistributions of source code must retain the above copyright -/// notice, this list of conditions and the following disclaimer. -/// * Redistributions in binary form must reproduce the above -/// copyright notice, this list of conditions and the following -/// disclaimer in the documentation and/or other materials provided -/// with the distribution. 
-/// * Neither the name of Intel Corporation nor the names of its -/// contributors may be used to endorse or promote products -/// derived from this software without specific prior written -/// permission. -/// -/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -/// POSSIBILITY OF SUCH DAMAGE. - -////////////////////////////////////////////////////////////////////// -/// -/// NAME: transpose -/// -/// PURPOSE: This program measures the time for the transpose of a -/// column-major stored matrix into a row-major stored matrix. -/// -/// USAGE: Program input is the matrix order and the number of times to -/// repeat the operation: -/// -/// transpose <# iterations> [tile size] -/// -/// An optional parameter specifies the tile size used to divide the -/// individual matrix blocks for improved cache and TLB performance. -/// -/// The output consists of diagnostics to make sure the -/// transpose worked and timing statistics. -/// -/// HISTORY: Written by Rob Van der Wijngaart, February 2009. -/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. -/// C11-ification by Jeff Hammond, June 2017. 
-/// -////////////////////////////////////////////////////////////////////// - -#include "prk_util.h" - -int main(int argc, char * argv[]) -{ - printf("Parallel Research Kernels version %.2f\n", PRKVERSION ); - printf("C11 Matrix transpose: B = A^T\n"); - - ////////////////////////////////////////////////////////////////////// - /// Read and test input parameters - ////////////////////////////////////////////////////////////////////// - - if (argc < 3) { - printf("Usage: <# iterations> [tile size]\n"); - return 1; - } - - // number of times to do the transpose - int iterations = atoi(argv[1]); - if (iterations < 1) { - printf("ERROR: iterations must be >= 1\n"); - return 1; - } - - // order of a the matrix - int order = atoi(argv[2]); - if (order <= 0) { - printf("ERROR: Matrix Order must be greater than 0\n"); - return 1; - } - - // default tile size for tiling of local transpose - int tile_size = (argc>4) ? atoi(argv[3]) : 32; - // a negative tile size means no tiling of the local transpose - if (tile_size <= 0) tile_size = order; - - printf("Number of iterations = %d\n", iterations); - printf("Matrix order = %d\n", order); - printf("Tile size = %d\n", tile_size); - - ////////////////////////////////////////////////////////////////////// - /// Allocate space for the input and transpose matrix - ////////////////////////////////////////////////////////////////////// - - double trans_time = 0.0; - - size_t bytes = order*order*sizeof(double); - double * restrict A = prk_malloc(bytes); - double * restrict B = prk_malloc(bytes); - - { - for (int i=0;i Date: Tue, 14 May 2019 10:37:56 -0400 Subject: [PATCH 209/245] remove innerloop in favor of hyperplane and cleanup CI --- .gitignore | 8 +- C1z/Makefile | 12 +- C1z/p2p-innerloop-openmp.c | 179 ----------------------------- C1z/stencil.c | 230 ------------------------------------- travis/build-run-prk.sh | 57 ++++----- 5 files changed, 34 insertions(+), 452 deletions(-) delete mode 100644 C1z/p2p-innerloop-openmp.c delete 
mode 100644 C1z/stencil.c diff --git a/.gitignore b/.gitignore index a7e76eb32..2c32fc0c8 100644 --- a/.gitignore +++ b/.gitignore @@ -108,8 +108,12 @@ C1z/nstream-memkind-openmp C1z/nstream-mmap C1z/nstream-mmap-openmp C1z/p2p +C1z/p2p-avx +C1z/p2p-sse C1z/p2p-innerloop C1z/p2p-innerloop-openmp +C1z/p2p-hyperplane +C1z/p2p-hyperplane-openmp C1z/p2p-tasks-openmp C1z/p2p-simd-openmp C1z/stencil @@ -280,7 +284,3 @@ FORTRAN/transpose-ornlacc RUST/p2p/Cargo.lock RUST/stencil/Cargo.lock RUST/transpose/Cargo.lock -nstream -../C1z/p2p-avx -../C1z/p2p-sse -../C1z/p2p-hyperplane-openmp diff --git a/C1z/Makefile b/C1z/Makefile index 399751925..23562f35f 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -45,11 +45,11 @@ endif all: serial thread openmp taskloop $(EXTRA) -serial: nstream p2p p2p-innerloop p2p-hyperplane stencil transpose +serial: nstream p2p p2p-hyperplane stencil transpose thread: transpose-thread -openmp: nstream-openmp p2p-simd-openmp p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp p2p-hyperplane-openmp +openmp: nstream-openmp p2p-simd-openmp p2p-tasks-openmp stencil-openmp transpose-openmp p2p-hyperplane-openmp mpi: nstream-mpi @@ -65,15 +65,15 @@ cilk: stencil-cilk transpose-cilk ispc: transpose-ispc -p2p-innerloop: p2p-innerloop-openmp.c prk_util.h - $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ - p2p-hyperplane: p2p-hyperplane-openmp.c prk_util.h $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ nstream: nstream-openmp.c prk_util.h $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ +stencil: stencil-openmp.c prk_util.h + $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ + transpose: transpose-openmp.c prk_util.h $(CC) $(CFLAGS) $< $(EXTRA_CLIBS) -o $@ @@ -126,7 +126,7 @@ clean: -rm -f *.optrpt -rm -f *.dwarf -rm -rf *.dSYM # Mac - -rm -f nstream p2p p2p-innerloop stencil transpose + -rm -f nstream p2p p2p-hyperplane stencil transpose -rm -f *-openmp -rm -f *-mpi -rm -f *-target diff --git a/C1z/p2p-innerloop-openmp.c b/C1z/p2p-innerloop-openmp.c deleted file mode 100644 
index 35fe80cba..000000000 --- a/C1z/p2p-innerloop-openmp.c +++ /dev/null @@ -1,179 +0,0 @@ -/// -/// Copyright (c) 2013, Intel Corporation -/// -/// Redistribution and use in source and binary forms, with or without -/// modification, are permitted provided that the following conditions -/// are met: -/// -/// * Redistributions of source code must retain the above copyright -/// notice, this list of conditions and the following disclaimer. -/// * Redistributions in binary form must reproduce the above -/// copyright notice, this list of conditions and the following -/// disclaimer in the documentation and/or other materials provided -/// with the distribution. -/// * Neither the name of Intel Corporation nor the names of its -/// contributors may be used to endorse or promote products -/// derived from this software without specific prior written -/// permission. -/// -/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -/// POSSIBILITY OF SUCH DAMAGE. - -////////////////////////////////////////////////////////////////////// -/// -/// NAME: Pipeline -/// -/// PURPOSE: This program tests the efficiency with which point-to-point -/// synchronization can be carried out. It does so by executing -/// a pipelined algorithm on an n^2 grid. 
The first array dimension -/// is distributed among the threads (stripwise decomposition). -/// -/// USAGE: The program takes as input the -/// dimensions of the grid, and the number of iterations on the grid -/// -/// -/// -/// The output consists of diagnostics to make sure the -/// algorithm worked, and of timing statistics. -/// -/// FUNCTIONS CALLED: -/// -/// Other than standard C functions, the following -/// functions are used in this program: -/// -/// wtime() -/// -/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. -/// - C99-ification by Jeff Hammond, February 2016. -/// - C11-ification by Jeff Hammond, June 2017. -/// -////////////////////////////////////////////////////////////////////// - -#include "prk_util.h" - -int main(int argc, char* argv[]) -{ - printf("Parallel Research Kernels version %.2f\n", PRKVERSION); -#ifdef _OPENMP - printf("C11/OpenMP INNERLOOP pipeline execution on 2D grid\n"); -#else - printf("C11/Serial INNERLOOP pipeline execution on 2D grid\n"); -#endif - - ////////////////////////////////////////////////////////////////////// - // Process and test input parameters - ////////////////////////////////////////////////////////////////////// - - if (argc < 3) { - printf("Usage: <# iterations> \n"); - return 1; - } - - // number of times to run the pipeline algorithm - int iterations = atoi(argv[1]); - if (iterations < 1) { - printf("ERROR: iterations must be >= 1\n"); - return 1; - } - - // grid dimensions - int n = atol(argv[2]); - if (n < 1) { - printf("ERROR: grid dimension must be positive: %d\n", n); - return 1; - } - -#ifdef _OPENMP - printf("Number of threads (max) = %d\n", omp_get_max_threads()); -#endif - printf("Number of iterations = %d\n", iterations); - printf("Grid sizes = %d,%d\n", n, n); - - ////////////////////////////////////////////////////////////////////// - // Allocate space and perform the computation - ////////////////////////////////////////////////////////////////////// - - double pipeline_time = 
0.0; // silence compiler warning - - size_t bytes = n*n*sizeof(double); - double * restrict grid = prk_malloc(bytes); - - OMP_PARALLEL() - { - OMP_FOR() - for (int i=0; i epsilon) { - printf("ERROR: checksum %lf does not match verification value %lf\n", grid[(n-1)*n+(n-1)], corner_val); - return 1; - } - - prk_free(grid); - -#ifdef VERBOSE - printf("Solution validates; verification value = %lf\n", corner_val ); -#else - printf("Solution validates\n" ); -#endif - double avgtime = pipeline_time/iterations; - printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 2.0e-6 * ( (n-1)*(n-1) )/avgtime, avgtime ); - - return 0; -} diff --git a/C1z/stencil.c b/C1z/stencil.c deleted file mode 100644 index 50ff8cbaa..000000000 --- a/C1z/stencil.c +++ /dev/null @@ -1,230 +0,0 @@ - -/// -/// Copyright (c) 2013, Intel Corporation -/// -/// Redistribution and use in source and binary forms, with or without -/// modification, are permitted provided that the following conditions -/// are met: -/// -/// * Redistributions of source code must retain the above copyright -/// notice, this list of conditions and the following disclaimer. -/// * Redistributions in binary form must reproduce the above -/// copyright notice, this list of conditions and the following -/// disclaimer in the documentation and/or other materials provided -/// with the distribution. -/// * Neither the name of Intel Corporation nor the names of its -/// contributors may be used to endorse or promote products -/// derived from this software without specific prior written -/// permission. -/// -/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE -/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -/// POSSIBILITY OF SUCH DAMAGE. - -////////////////////////////////////////////////////////////////////// -/// -/// NAME: Stencil -/// -/// PURPOSE: This program tests the efficiency with which a space-invariant, -/// linear, symmetric filter (stencil) can be applied to a square -/// grid or image. -/// -/// USAGE: The program takes as input the linear -/// dimension of the grid, and the number of iterations on the grid -/// -/// -/// -/// The output consists of diagnostics to make sure the -/// algorithm worked, and of timing statistics. -/// -/// FUNCTIONS CALLED: -/// -/// Other than standard C functions, the following -/// functions are used in this program: -/// -/// wtime() -/// -/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. -/// - C99-ification by Jeff Hammond, February 2016. -/// - C11-ification by Jeff Hammond, June 2017. -/// -////////////////////////////////////////////////////////////////////// - -#include "prk_util.h" - -typedef void (*stencil_t)(const int, const double * restrict, double * restrict); - -void nothing(const int n, const double * restrict in, double * restrict out) -{ - printf("You are trying to use a stencil that does not exist.\n"); - printf("Please generate the new stencil using the code generator.\n"); - // n will never be zero - this is to silence compiler warnings. 
- if (n==0) printf("%p %p\n", in, out); - abort(); -} - -#include "stencil_seq.h" - -int main(int argc, char * argv[]) -{ - printf("Parallel Research Kernels version %.2f\n", PRKVERSION); - printf("C11 Stencil execution on 2D grid\n"); - - ////////////////////////////////////////////////////////////////////// - // Process and test input parameters - ////////////////////////////////////////////////////////////////////// - - if (argc < 3){ - printf("Usage: <# iterations> [ ]\n"); - return 1; - } - - // number of times to run the algorithm - int iterations = atoi(argv[1]); - if (iterations < 1) { - printf("ERROR: iterations must be >= 1\n"); - return 1; - } - - // linear grid dimension - int n = atoi(argv[2]); - if (n < 1) { - printf("ERROR: grid dimension must be positive\n"); - return 1; - } else if (n > floor(sqrt(INT_MAX))) { - printf("ERROR: grid dimension too large - overflow risk\n"); - return 1; - } - - // stencil pattern - bool star = true; - if (argc > 3) { - char* pattern = argv[3]; - star = (0==strncmp(pattern,"star",4)) ? true : false; - } - - // stencil radius - int radius = 2; - if (argc > 4) { - radius = atoi(argv[4]); - } - - if ( (radius < 1) || (2*radius+1 > n) ) { - printf("ERROR: Stencil radius negative or too large\n"); - return 1; - } - - printf("Number of iterations = %d\n", iterations); - printf("Grid sizes = %d\n", n); - printf("Type of stencil = %s\n", (star ? 
"star" : "grid") ); - printf("Radius of stencil = %d\n", radius ); - - stencil_t stencil = nothing; - if (star) { - switch (radius) { - case 1: stencil = star1; break; - case 2: stencil = star2; break; - case 3: stencil = star3; break; - case 4: stencil = star4; break; - case 5: stencil = star5; break; - case 6: stencil = star6; break; - case 7: stencil = star7; break; - case 8: stencil = star8; break; - case 9: stencil = star9; break; - } - } else { - switch (radius) { - case 1: stencil = grid1; break; - case 2: stencil = grid2; break; - case 3: stencil = grid3; break; - case 4: stencil = grid4; break; - case 5: stencil = grid5; break; - case 6: stencil = grid6; break; - case 7: stencil = grid7; break; - case 8: stencil = grid8; break; - case 9: stencil = grid9; break; - } - } - - ////////////////////////////////////////////////////////////////////// - // Allocate space and perform the computation - ////////////////////////////////////////////////////////////////////// - - double stencil_time = 0.0; - - // interior of grid with respect to stencil - size_t active_points = (n-2*radius)*(n-2*radius); - size_t bytes = n*n*sizeof(double); - - double * restrict in = prk_malloc(bytes); - double * restrict out = prk_malloc(bytes); - - { - for (int i=0; i epsilon) { - printf("ERROR: L1 norm = %lf Reference L1 norm = %lf\n", norm, reference_norm); - return 1; - } else { - printf("Solution validates\n"); -#ifdef VERBOSE - printf("L1 norm = %lf Reference L1 norm = %lf\n", norm, reference_norm); -#endif - const int stencil_size = star ? 
4*radius+1 : (2*radius+1)*(2*radius+1); - size_t flops = (2*stencil_size+1) * active_points; - double avgtime = stencil_time/iterations; - printf("Rate (MFlops/s): %lf Avg time (s): %lf\n", 1.0e-6 * (double)flops/avgtime, avgtime ); - } - - return 0; -} diff --git a/travis/build-run-prk.sh b/travis/build-run-prk.sh index dcb85da2a..0bb1f92d0 100755 --- a/travis/build-run-prk.sh +++ b/travis/build-run-prk.sh @@ -148,10 +148,10 @@ case "$PRK_TARGET" in echo "EXTRA_CLIBS=-lm -lpthread" >> common/make.defs # C11 without external parallelism - ${MAKE} -C $PRK_TARGET_PATH p2p stencil transpose p2p-innerloop p2p-hyperplane + ${MAKE} -C $PRK_TARGET_PATH nstream p2p stencil transpose p2p-hyperplane + $PRK_TARGET_PATH/nstream 10 16777216 32 $PRK_TARGET_PATH/p2p 10 1024 1024 $PRK_TARGET_PATH/p2p 10 1024 1024 100 100 - $PRK_TARGET_PATH/p2p-innerloop 10 1024 $PRK_TARGET_PATH/p2p-hyperplane 10 1024 $PRK_TARGET_PATH/p2p-hyperplane 10 1024 32 $PRK_TARGET_PATH/stencil 10 1000 @@ -170,12 +170,15 @@ case "$PRK_TARGET" in # C11 with OpenMP export OMP_NUM_THREADS=2 case "$CC" in + clang*) + echo "Skipping Clang since OpenMP support probably missing" + ;; g*) # Host echo "OPENMPFLAG=-fopenmp" >> common/make.defs - ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp p2p-hyperplane-openmp stencil-openmp transpose-openmp + ${MAKE} -C $PRK_TARGET_PATH nstream-openmp p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp transpose-openmp + $PRK_TARGET_PATH/nstream-openmp 10 16777216 32 $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 - $PRK_TARGET_PATH/p2p-innerloop-openmp 10 1024 $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 32 $PRK_TARGET_PATH/stencil-openmp 10 1000 @@ -198,27 +201,14 @@ case "$PRK_TARGET" in done done ;; - clang*) - # Host - echo "Skipping Clang since OpenMP support probably missing" - #echo "OPENMPFLAG=-fopenmp" >> common/make.defs - #${MAKE} -C $PRK_TARGET_PATH openmp - #$PRK_TARGET_PATH/p2p-tasks-openmp 
10 1024 1024 100 100 - #$PRK_TARGET_PATH/stencil-openmp 10 1000 - #$PRK_TARGET_PATH/transpose-penmp 10 1024 32 - #echo "Test stencil code generator" - #for s in star grid ; do - # for r in 1 2 3 4 5 ; do - # $PRK_TARGET_PATH/stencil-penmp 10 200 $s $r - # done - #done - ;; ic*) # Host echo "OPENMPFLAG=-qopenmp" >> common/make.defs - ${MAKE} -C $PRK_TARGET_PATH p2p-tasks-openmp p2p-innerloop-openmp stencil-openmp transpose-openmp + ${MAKE} -C $PRK_TARGET_PATH nstream-openmp p2p-tasks-openmp p2p-hyperplane-openmp stencil-openmp transpose-openmp + $PRK_TARGET_PATH/nstream-openmp 10 16777216 32 $PRK_TARGET_PATH/p2p-tasks-openmp 10 1024 1024 100 100 - $PRK_TARGET_PATH/p2p-innerloop-openmp 10 1024 1024 + $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 + $PRK_TARGET_PATH/p2p-hyperplane-openmp 10 1024 32 $PRK_TARGET_PATH/stencil-openmp 10 1000 $PRK_TARGET_PATH/transpose-openmp 10 1024 32 #echo "Test stencil code generator" @@ -247,18 +237,19 @@ case "$PRK_TARGET" in esac # C11 with Cilk - if [ "${CC}" = "gcc" ] ; then - echo "CILKFLAG=-fcilkplus" >> common/make.defs - ${MAKE} -C $PRK_TARGET_PATH stencil-cilk transpose-cilk - $PRK_TARGET_PATH/stencil-cilk 10 1000 - $PRK_TARGET_PATH/transpose-cilk 10 1024 32 - #echo "Test stencil code generator" - for s in star grid ; do - for r in 1 2 3 4 5 ; do - $PRK_TARGET_PATH/stencil-cilk 10 200 $s $r - done - done - fi + #if [ "${CC}" = "gcc" ] ; then + # echo "CILKFLAG=-fcilkplus" >> common/make.defs + # ${MAKE} -C $PRK_TARGET_PATH stencil-cilk transpose-cilk + # $PRK_TARGET_PATH/stencil-cilk 10 1000 + # $PRK_TARGET_PATH/transpose-cilk 10 1024 32 + # #echo "Test stencil code generator" + # for s in star grid ; do + # for r in 1 2 3 4 5 ; do + # $PRK_TARGET_PATH/stencil-cilk 10 200 $s $r + # done + # done + #fi + # Use MUSL for GCC+Linux only if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$CC" = "gcc" ] ; then ${MAKE} -C $PRK_TARGET_PATH clean From c1d8e90dc2717dfaf1a2dd6a45c60e1e9df715c3 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: 
Tue, 14 May 2019 10:43:42 -0400 Subject: [PATCH 210/245] add nstream-target --- .gitignore | 1 + C1z/Makefile | 2 +- C1z/nstream-target.c | 178 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 C1z/nstream-target.c diff --git a/.gitignore b/.gitignore index 2c32fc0c8..846e4a560 100644 --- a/.gitignore +++ b/.gitignore @@ -102,6 +102,7 @@ SERIAL/Synch_p2p/p2p SERIAL/Transpose/transpose C1z/nstream C1z/nstream-openmp +C1z/nstream-target C1z/nstream-mpi C1z/nstream-memkind C1z/nstream-memkind-openmp diff --git a/C1z/Makefile b/C1z/Makefile index 23562f35f..892564b72 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -57,7 +57,7 @@ memkind: nstream-memkind nstream-memkind-openmp mmap: nstream-mmap nstream-mmap-openmp -target: stencil-target transpose-target +target: nstream-target stencil-target transpose-target taskloop: stencil-taskloop transpose-taskloop diff --git a/C1z/nstream-target.c b/C1z/nstream-target.c new file mode 100644 index 000000000..244528448 --- /dev/null +++ b/C1z/nstream-target.c @@ -0,0 +1,178 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. 
Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// Converted to C11 by Jeff Hammond, February 2019. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %.2f\n", PRKVERSION ); + printf("C11/OpenMP TARGET STREAM triad: A = B + scalar * C\n"); + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> \n"); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // length of a the matrix + size_t length = atol(argv[2]); + if (length <= 0) { + printf("ERROR: Matrix length must be greater than 0\n"); + return 1; + } + +#ifdef _OPENMP + printf("Number of threads = %d\n", omp_get_max_threads()); +#endif + printf("Number of iterations = %d\n", iterations); + printf("Vector length = %zu\n", length); + //printf("Offset = %d\n", offset); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time = 0.0; + + size_t bytes = length*sizeof(double); + double * restrict A = prk_malloc(bytes); + double * restrict B = prk_malloc(bytes); + double * restrict C = prk_malloc(bytes); + + double scalar = 3.0; + + // HOST + OMP_PARALLEL() + { + OMP_FOR_SIMD() + for (size_t i=0; i epsilon) { + printf("Failed Validation on output array\n" + " Expected checksum: %lf\n" + " Observed checksum: %lf\n" + "ERROR: solution did not validate\n", ar, asum); + return 1; + } else { + printf("Solution 
validates\n"); + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime); + } + + return 0; +} + + From f1450ab2ebf36236652f29589a7e677fd5f4bfb1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 14 May 2019 11:37:59 -0400 Subject: [PATCH 211/245] add taskloop nstream --- C1z/Makefile | 2 +- C1z/nstream-taskloop.c | 185 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 C1z/nstream-taskloop.c diff --git a/C1z/Makefile b/C1z/Makefile index 892564b72..9125fef1f 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -59,7 +59,7 @@ mmap: nstream-mmap nstream-mmap-openmp target: nstream-target stencil-target transpose-target -taskloop: stencil-taskloop transpose-taskloop +taskloop: nstream-taskloop stencil-taskloop transpose-taskloop cilk: stencil-cilk transpose-cilk diff --git a/C1z/nstream-taskloop.c b/C1z/nstream-taskloop.c new file mode 100644 index 000000000..69ae72639 --- /dev/null +++ b/C1z/nstream-taskloop.c @@ -0,0 +1,185 @@ +/// +/// Copyright (c) 2019, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. 
+/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. 
Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. +/// Converted to C11 by Jeff Hammond, February 2019. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" + +int main(int argc, char * argv[]) +{ + printf("Parallel Research Kernels version %.2f\n", PRKVERSION ); +#ifdef _OPENMP + printf("C11/OpenMP TASKLOOP STREAM triad: A = B + scalar * C\n"); +#else + printf("C11/Serial STREAM triad: A = B + scalar * C\n"); +#endif + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + if (argc < 3) { + printf("Usage: <# iterations> []\n"); + return 1; + } + + // number of times to do the transpose + int iterations = atoi(argv[1]); + if (iterations < 1) { + printf("ERROR: iterations must be >= 1\n"); + return 1; + } + + // length of a the matrix + size_t length = atol(argv[2]); + if (length <= 0) { + printf("ERROR: Matrix length must be greater than 0\n"); + return 1; + } + + // taskloop grainsize + int gs = (argc > 3) ? 
atoi(argv[3]) : 1024; + + +#ifdef _OPENMP + printf("Number of threads = %d\n", omp_get_max_threads()); + printf("Taskloop grainsize = %d\n", gs); +#endif + printf("Number of iterations = %d\n", iterations); + printf("Vector length = %zu\n", length); + //printf("Offset = %d\n", offset); + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time = 0.0; + + size_t bytes = length*sizeof(double); + double * restrict A = prk_malloc(bytes); + double * restrict B = prk_malloc(bytes); + double * restrict C = prk_malloc(bytes); + + double scalar = 3.0; + + OMP_PARALLEL() + OMP_MASTER + { + OMP_TASKLOOP( firstprivate(length) shared(A,B,C) grainsize(gs) ) + for (size_t i=0; i epsilon) { + printf("Failed Validation on output array\n" + " Expected checksum: %lf\n" + " Observed checksum: %lf\n" + "ERROR: solution did not validate\n", ar, asum); + return 1; + } else { + printf("Solution validates\n"); + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(double); + printf("Rate (MB/s): %lf Avg time (s): %lf\n", 1.e-6*nbytes/avgtime, avgtime); + } + + return 0; +} + + From 1ff511c7d15cdb2a7f040dae7742cef1281f0b9a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 14 May 2019 11:39:12 -0400 Subject: [PATCH 212/245] ignore stuff [ci skip] --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 846e4a560..d4640217a 100644 --- a/.gitignore +++ b/.gitignore @@ -103,6 +103,7 @@ SERIAL/Transpose/transpose C1z/nstream C1z/nstream-openmp C1z/nstream-target +C1z/nstream-taskloop C1z/nstream-mpi C1z/nstream-memkind C1z/nstream-memkind-openmp From 5394bed370b6b50e73062231f9d47ed9bfc31197 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 19 May 2019 21:53:02 -0700 Subject: [PATCH 213/245] use correct Kokkos exec space name (thanks Christian) --- 
Cxx11/stencil-kokkos.cc | 2 +- Cxx11/transpose-kokkos.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/stencil-kokkos.cc b/Cxx11/stencil-kokkos.cc index f5c3365ba..90ea21eaf 100644 --- a/Cxx11/stencil-kokkos.cc +++ b/Cxx11/stencil-kokkos.cc @@ -144,7 +144,7 @@ int main(int argc, char* argv[]) std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; std::cout << "Radius of stencil = " << radius << std::endl; std::cout << "Compact representation of stencil loop body" << std::endl; - std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl; + std::cout << "Kokkos execution space: " << Kokkos::DefaultExecutionSpace::name() << std::endl; auto stencil = nothing; if (star) { diff --git a/Cxx11/transpose-kokkos.cc b/Cxx11/transpose-kokkos.cc index 9b5a4f6c0..2c5c8e2ca 100644 --- a/Cxx11/transpose-kokkos.cc +++ b/Cxx11/transpose-kokkos.cc @@ -111,7 +111,7 @@ int main(int argc, char * argv[]) std::cout << "Matrix order = " << order << std::endl; std::cout << "Tile size = " << tile_size << std::endl; std::cout << "Permute loops = " << (permute ? 
"yes" : "no") << std::endl; - std::cout << "Kokkos execution space: " << typeid(Kokkos::DefaultExecutionSpace).name() << std::endl; + std::cout << "Kokkos execution space: " << Kokkos::DefaultExecutionSpace::name() << std::endl; ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation From 8726a7543355fbbb9a0e187949fa87211bf59aed Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 8 Aug 2019 16:55:09 -0700 Subject: [PATCH 214/245] add Intel SYCL toolchain to LLVM example --- common/make.defs.llvm | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 092da96b8..180664d73 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -12,7 +12,7 @@ CC=${LLVM_PATH}clang -std=c11 -pthread # All of the Fortran code is written for the 2008 standard and requires preprocessing. FC=/opt/llvm/pgi-flang/bin/flang -Mpreprocess -Mfreeform -L/opt/llvm/pgi-flang/lib -Wl,-rpath=/opt/llvm/pgi-flang/lib # C++11 may not be required but does no harm here. -CXX=${LLVM_PATH}clang++ -std=c++1z -pthread +CXX=${LLVM_PATH}clang++ -std=c++17 -pthread # # Compiler flags # @@ -57,11 +57,16 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib # # SYCL flags # +# Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md +SYCLDIR=/opt/isycl +SYCLCXX=${SYCLDIR}/bin/clang++ +SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib +SYCLFLAG+=-std=c++17 -O3 # CodePlay ComputeCpp -SYCLDIR=/opt/sycl/latest -SYCLCXX=${SYCLDIR}/bin/compute++ -SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp -SYCLFLAG+=-std=c++14 -O3 +#SYCLDIR=/opt/sycl/latest +#SYCLCXX=${SYCLDIR}/bin/compute++ +#SYCLFLAG=-DUSE_SYCL -sycl-driver -I$(SYCLDIR)/include -L$(SYCLDIR)/lib -Wl,-rpath=$(SYCLDIR)/lib -lComputeCpp +#SYCLFLAG+=-std=c++14 -O3 # This makes a huge difference in e.g. nstream... 
#SYCLFLAG+=-no-serial-memop # CentOS7 and Ubuntu14 built for this @@ -90,6 +95,7 @@ SYCLFLAG+=-std=c++14 -O3 # # TBB # +#TBBDIR=/usr/lib/x86_64-linux-gnu TBBDIR=/usr/local/Cellar/tbb/2018_U3_1 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb From 6d3e897d7621d12d0f0862c346c196793e0f5c99 Mon Sep 17 00:00:00 2001 From: Aksel Alpay Date: Thu, 22 Aug 2019 17:24:20 +0000 Subject: [PATCH 215/245] Add initial hipSYCL support --- Cxx11/nstream-explicit-sycl.cc | 10 ++++++++-- Cxx11/nstream-sycl.cc | 11 ++++++++--- Cxx11/prk_util.h | 23 ++++++++++++++++++++++- Cxx11/stencil-sycl.cc | 19 ++++++++++++++++--- Cxx11/transpose-explicit-sycl.cc | 12 +++++++++--- Cxx11/transpose-sycl.cc | 10 ++++++++-- 6 files changed, 71 insertions(+), 14 deletions(-) diff --git a/Cxx11/nstream-explicit-sycl.cc b/Cxx11/nstream-explicit-sycl.cc index e51b78a28..0213b95bc 100644 --- a/Cxx11/nstream-explicit-sycl.cc +++ b/Cxx11/nstream-explicit-sycl.cc @@ -245,6 +245,7 @@ int main(int argc, char * argv[]) #endif try { +#if SYCL_TRY_CPU_QUEUE if (length<100000) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL @@ -258,11 +259,13 @@ int main(int argc, char * argv[]) } else { std::cout << "Skipping host device since it is too slow for large problems" << std::endl; } +#endif // CPU requires spir64 target +#if SYCL_TRY_CPU_QUEUE if (1) { cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = cpu.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -276,11 +279,13 @@ int main(int argc, char * argv[]) run(cpu, iterations, length); } } +#endif // NVIDIA GPU requires ptx64 target and does not work very well +#if SYCL_TRY_GPU_QUEUE if (1) { cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = gpu.get_device(); 
auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -310,6 +315,7 @@ int main(int argc, char * argv[]) #endif } } +#endif } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index f7d42d732..b823f220a 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -227,6 +227,7 @@ int main(int argc, char * argv[]) #endif try { +#if SYCL_TRY_CPU_QUEUE if (length<100000) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL @@ -240,11 +241,13 @@ int main(int argc, char * argv[]) } else { std::cout << "Skipping host device since it is too slow for large problems" << std::endl; } +#endif // CPU requires spir64 target +#if SYCL_TRY_CPU_QUEUE if (1) { cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = cpu.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -258,11 +261,12 @@ int main(int argc, char * argv[]) run(cpu, iterations, length); } } - +#endif // NVIDIA GPU requires ptx64 target and does not work very well +#if SYCL_TRY_GPU_QUEUE if (1) { cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = gpu.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -292,6 +296,7 @@ int main(int argc, char * argv[]) #endif } } +#endif } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index d09c16c3b..2a917ad68 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -82,13 +82,34 @@ #define PRK_UNUSED #endif + // for SYCL -#ifdef TRISYCL + +// prebuilt kernels are not required/not fully supported on hipSYCL and triSYCL +#if defined(TRISYCL) || defined(__HIPSYCL__) #define PREBUILD_KERNEL 0 #else 
#define PREBUILD_KERNEL 1 #endif +// not all SYCL implementations may support all device types. +// If an implementation does not find any devices based on a +// device selector, it will throw an exception. +// These macros can be used to check if there's any chance +// of an implementation targeting a CPU and GPU. +#if !defined(__HIPSYCL__) || defined(HIPSYCL_PLATFORM_CPU) +#define SYCL_TRY_CPU_QUEUE 1 +#else +#define SYCL_TRY_CPU_QUEUE 0 +#endif + +#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_CPU) +#define SYCL_TRY_GPU_QUEUE 1 +#else +#define SYCL_TRY_GPU_QUEUE 0 +#endif + + namespace prk { int get_alignment(void) diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index d9fa54ff6..53b643187 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -64,6 +64,7 @@ #include "prk_util.h" #include "stencil_sycl.hpp" + #if 0 #include "prk_opencl.h" #define USE_OPENCL 1 @@ -83,7 +84,13 @@ void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl std::cout << "You are trying to use a stencil that does not exist.\n"; std::cout << "Please generate the new stencil using the code generator\n"; std::cout << "and add it to the case-switch in the driver." 
<< std::endl; + // There seems to be an issue with the clang CUDA/HIP toolchains not having + // std::abort() available +#if defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC) + abort(); +#else std::abort(); +#endif } template @@ -322,9 +329,10 @@ int main(int argc, char * argv[]) try { +#if SYCL_TRY_CPU_QUEUE if (1) { cl::sycl::queue host(cl::sycl::host_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = host.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -334,11 +342,13 @@ int main(int argc, char * argv[]) run(host, iterations, n, tile_size, star, radius); run(host, iterations, n, tile_size, star, radius); } +#endif // CPU requires spir64 target +#if SYCL_TRY_CPU_QUEUE if (1) { cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = cpu.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -352,11 +362,13 @@ int main(int argc, char * argv[]) run(cpu, iterations, n, tile_size, star, radius); } } +#endif // NVIDIA GPU requires ptx64 target and does not work very well +#if SYCL_TRY_GPU_QUEUE if (0) { cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = gpu.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -386,6 +398,7 @@ int main(int argc, char * argv[]) #endif } } +#endif } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/transpose-explicit-sycl.cc b/Cxx11/transpose-explicit-sycl.cc index cedeafd68..2a5cfbf12 100644 --- a/Cxx11/transpose-explicit-sycl.cc +++ b/Cxx11/transpose-explicit-sycl.cc @@ -258,9 +258,10 @@ int main(int argc, char * argv[]) #endif try { +#if SYCL_TRY_CPU_QUEUE if (1) { cl::sycl::queue 
host(cl::sycl::host_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = host.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -269,11 +270,13 @@ int main(int argc, char * argv[]) run(host, iterations, order); run(host, iterations, order); } +#endif // CPU requires spir64 target +#if SYCL_TRY_CPU_QUEUE if (1) { cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = cpu.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -287,11 +290,13 @@ int main(int argc, char * argv[]) run(cpu, iterations, order); } } +#endif // NVIDIA GPU requires ptx64 target and does not work very well +#if SYCL_TRY_GPU_QUEUE if (0) { cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = gpu.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -321,6 +326,7 @@ int main(int argc, char * argv[]) #endif } } +#endif } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index 761fa136d..0323a08ac 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -224,6 +224,7 @@ int main(int argc, char * argv[]) #endif try { +#if SYCL_TRY_CPU_QUEUE if (1) { cl::sycl::queue host(cl::sycl::host_selector{}); #ifndef TRISYCL @@ -235,11 +236,13 @@ int main(int argc, char * argv[]) run(host, iterations, order); run(host, iterations, order); } +#endif // CPU requires spir64 target +#if SYCL_TRY_CPU_QUEUE if (1) { cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = cpu.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() 
<< std::endl; @@ -253,11 +256,13 @@ int main(int argc, char * argv[]) run(cpu, iterations, order); } } +#endif // NVIDIA GPU requires ptx64 target and does not work very well +#if SYCL_TRY_GPU_QUEUE if (0) { cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#ifndef TRISYCL +#if !defined(TRISYCL) && !defined(__HIPSYCL__) auto device = gpu.get_device(); auto platform = device.get_platform(); std::cout << "SYCL Device: " << device.get_info() << std::endl; @@ -287,6 +292,7 @@ int main(int argc, char * argv[]) #endif } } +#endif } catch (cl::sycl::exception e) { std::cout << e.what() << std::endl; From a9353475c919074bf4fa67113fecd0de607eebd4 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 29 Aug 2019 18:00:29 -0700 Subject: [PATCH 216/245] fix copy-and-paste error --- Cxx11/prk_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 2a917ad68..c0d5d321f 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -103,7 +103,7 @@ #define SYCL_TRY_CPU_QUEUE 0 #endif -#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_CPU) +#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_GPU) #define SYCL_TRY_GPU_QUEUE 1 #else #define SYCL_TRY_GPU_QUEUE 0 From 6974328dc685fba5d13ed5e72db64bebe0d3702d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 20 Aug 2019 12:35:49 -0700 Subject: [PATCH 217/245] catch exceptions by reference in SYCL codes --- Cxx11/nstream-explicit-sycl.cc | 8 ++++---- Cxx11/nstream-sycl.cc | 8 ++++---- Cxx11/stencil-sycl.cc | 8 ++++---- Cxx11/transpose-explicit-sycl.cc | 8 ++++---- Cxx11/transpose-sycl.cc | 8 ++++---- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Cxx11/nstream-explicit-sycl.cc b/Cxx11/nstream-explicit-sycl.cc index 0213b95bc..5201a48b1 100644 --- a/Cxx11/nstream-explicit-sycl.cc +++ b/Cxx11/nstream-explicit-sycl.cc @@ -142,7 +142,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length) }); q.wait(); } - catch (cl::sycl::exception e) { + catch 
(cl::sycl::exception & e) { std::cout << e.what() << std::endl; #ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; @@ -153,7 +153,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length) #endif return; } - catch (std::exception e) { + catch (std::exception & e) { std::cout << e.what() << std::endl; return; } @@ -317,7 +317,7 @@ int main(int argc, char * argv[]) } #endif } - catch (cl::sycl::exception e) { + catch (cl::sycl::exception & e) { std::cout << e.what() << std::endl; #ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; @@ -328,7 +328,7 @@ int main(int argc, char * argv[]) #endif return 1; } - catch (std::exception e) { + catch (std::exception & e) { std::cout << e.what() << std::endl; return 1; } diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index b823f220a..2657f7200 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -124,7 +124,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length) // for other device-oriented programming models. 
nstream_time = prk::wtime() - nstream_time; } - catch (cl::sycl::exception e) { + catch (cl::sycl::exception & e) { std::cout << e.what() << std::endl; #ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; @@ -135,7 +135,7 @@ void run(cl::sycl::queue & q, int iterations, size_t length) #endif return; } - catch (std::exception e) { + catch (std::exception & e) { std::cout << e.what() << std::endl; return; } @@ -298,7 +298,7 @@ int main(int argc, char * argv[]) } #endif } - catch (cl::sycl::exception e) { + catch (cl::sycl::exception & e) { std::cout << e.what() << std::endl; #ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; @@ -309,7 +309,7 @@ int main(int argc, char * argv[]) #endif return 1; } - catch (std::exception e) { + catch (std::exception & e) { std::cout << e.what() << std::endl; return 1; } diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index 53b643187..b333c4194 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -195,7 +195,7 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s } stencil_time = prk::wtime() - stencil_time; } - catch (cl::sycl::exception e) { + catch (cl::sycl::exception & e) { std::cout << e.what() << std::endl; #ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; @@ -206,7 +206,7 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s #endif return; } - catch (std::exception e) { + catch (std::exception & e) { std::cout << e.what() << std::endl; return; } @@ -400,7 +400,7 @@ int main(int argc, char * argv[]) } #endif } - catch (cl::sycl::exception e) { + catch (cl::sycl::exception & e) { std::cout << e.what() << std::endl; #ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; @@ -411,7 +411,7 @@ int main(int argc, char * argv[]) #endif return 1; } - catch (std::exception e) { + catch (std::exception & e) { std::cout << e.what() << std::endl; return 1; } diff --git 
a/Cxx11/transpose-explicit-sycl.cc b/Cxx11/transpose-explicit-sycl.cc index 2a5cfbf12..e92dfaa1f 100644 --- a/Cxx11/transpose-explicit-sycl.cc +++ b/Cxx11/transpose-explicit-sycl.cc @@ -157,7 +157,7 @@ void run(cl::sycl::queue & q, int iterations, size_t order) }); q.wait(); } - catch (cl::sycl::exception e) { + catch (cl::sycl::exception & e) { std::cout << e.what() << std::endl; #ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; @@ -168,7 +168,7 @@ void run(cl::sycl::queue & q, int iterations, size_t order) #endif return; } - catch (std::exception e) { + catch (std::exception & e) { std::cout << e.what() << std::endl; return; } @@ -328,7 +328,7 @@ int main(int argc, char * argv[]) } #endif } - catch (cl::sycl::exception e) { + catch (cl::sycl::exception & e) { std::cout << e.what() << std::endl; #ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; @@ -339,7 +339,7 @@ int main(int argc, char * argv[]) #endif return 1; } - catch (std::exception e) { + catch (std::exception & e) { std::cout << e.what() << std::endl; return 1; } diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index 0323a08ac..b22b162be 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -123,7 +123,7 @@ void run(cl::sycl::queue & q, int iterations, size_t order) // for other device-oriented programming models. 
trans_time = prk::wtime() - trans_time; } - catch (cl::sycl::exception e) { + catch (cl::sycl::exception & e) { std::cout << e.what() << std::endl; #ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; @@ -134,7 +134,7 @@ void run(cl::sycl::queue & q, int iterations, size_t order) #endif return; } - catch (std::exception e) { + catch (std::exception & e) { std::cout << e.what() << std::endl; return; } @@ -294,7 +294,7 @@ int main(int argc, char * argv[]) } #endif } - catch (cl::sycl::exception e) { + catch (cl::sycl::exception & e) { std::cout << e.what() << std::endl; #ifdef __COMPUTECPP__ std::cout << e.get_file_name() << std::endl; @@ -305,7 +305,7 @@ int main(int argc, char * argv[]) #endif return 1; } - catch (std::exception e) { + catch (std::exception & e) { std::cout << e.what() << std::endl; return 1; } From 3a7e165bfab8064d73328da13bfd9c48d677cfc9 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 29 Aug 2019 18:21:41 -0700 Subject: [PATCH 218/245] revert incorrect fix and add comment so future Jeff understands --- Cxx11/prk_util.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index c0d5d321f..7feaec93d 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -103,7 +103,8 @@ #define SYCL_TRY_CPU_QUEUE 0 #endif -#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_GPU) +// !defined(HIPSYCL_PLATFORM_CPU) = !( defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC) ) +#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_CPU) #define SYCL_TRY_GPU_QUEUE 1 #else #define SYCL_TRY_GPU_QUEUE 0 From 443540af80438f3de4b2786f08e7673d7e83af6c Mon Sep 17 00:00:00 2001 From: Toby Isaac Date: Tue, 10 Sep 2019 08:28:42 -0400 Subject: [PATCH 219/245] clean OpenMP target stencil in C1z Same as previous by jeffhammond in Cxx11/: > - GPU-style target means the functions are invoked on host, so must > remove "declare target" for correctness (caught by LLVM 5) --- C1z/generate-c-stencil.py | 
8 ++++---- C1z/stencil_target.h | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/C1z/generate-c-stencil.py b/C1z/generate-c-stencil.py index b16dc8dcb..fb1a57f48 100755 --- a/C1z/generate-c-stencil.py +++ b/C1z/generate-c-stencil.py @@ -76,13 +76,13 @@ def instance(src,model,pattern,r): def main(): for model in ['seq','openmp','target','cilk','taskloop']: src = open('stencil_'+model+'.h','w') - if (model=='target'): - src.write('OMP( declare target )\n') + #if (model=='target'): + # src.write('OMP( declare target )\n') for pattern in ['star','grid']: for r in range(1,10): instance(src,model,pattern,r) - if (model=='target'): - src.write('OMP( end declare target )\n') + #if (model=='target'): + # src.write('OMP( end declare target )\n') src.close() if __name__ == '__main__': diff --git a/C1z/stencil_target.h b/C1z/stencil_target.h index 4f7edfd36..ae64a29f6 100644 --- a/C1z/stencil_target.h +++ b/C1z/stencil_target.h @@ -1,4 +1,3 @@ -OMP( declare target ) void star1(const int n, const double * restrict in, double * restrict out) { OMP_TARGET( teams distribute parallel for simd collapse(2) schedule(static,1) ) for (int i=1; i Date: Sat, 21 Sep 2019 20:05:15 -0700 Subject: [PATCH 220/245] do not map timers --- Cxx11/nstream-openmp-target.cc | 2 +- Cxx11/transpose-openmp-target.cc | 24 ++++++++++++------------ FORTRAN/dgemm-openmp-target.f90 | 3 +-- FORTRAN/nstream-openmp-target.f90 | 3 +-- FORTRAN/stencil-openmp-target.f90 | 3 +-- FORTRAN/transpose-openmp-target.f90 | 2 +- 6 files changed, 17 insertions(+), 20 deletions(-) diff --git a/Cxx11/nstream-openmp-target.cc b/Cxx11/nstream-openmp-target.cc index 06af1c204..d4a437a08 100644 --- a/Cxx11/nstream-openmp-target.cc +++ b/Cxx11/nstream-openmp-target.cc @@ -129,7 +129,7 @@ int main(int argc, char * argv[]) } // DEVICE - OMP_TARGET( data map(tofrom: A[0:length], B[0:length], C[0:length]) map(from:nstream_time) ) + OMP_TARGET( data map(tofrom: A[0:length], B[0:length], C[0:length]) ) { for (auto 
iter = 0; iter<=iterations; iter++) { diff --git a/Cxx11/transpose-openmp-target.cc b/Cxx11/transpose-openmp-target.cc index a611997f5..8702ec45b 100644 --- a/Cxx11/transpose-openmp-target.cc +++ b/Cxx11/transpose-openmp-target.cc @@ -111,9 +111,9 @@ int main(int argc, char * argv[]) OMP_PARALLEL() { OMP_FOR() - for (auto i=0;i(i*order+j); B[i*order+j] = 0.0; } @@ -121,19 +121,19 @@ int main(int argc, char * argv[]) } // DEVICE - OMP_TARGET( data map(tofrom: A[0:order*order], B[0:order*order]) map(from:trans_time) ) + OMP_TARGET( data map(tofrom: A[0:order*order], B[0:order*order]) ) { - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if (iter==1) trans_time = omp_get_wtime(); // transpose the matrix if (tile_size < order) { OMP_TARGET( teams distribute parallel for simd collapse(2) ) - for (auto it=0; it(ij)*(1.+iterations)+addit; diff --git a/FORTRAN/dgemm-openmp-target.f90 b/FORTRAN/dgemm-openmp-target.f90 index 3c8ffbeec..d1af37ba5 100644 --- a/FORTRAN/dgemm-openmp-target.f90 +++ b/FORTRAN/dgemm-openmp-target.f90 @@ -149,8 +149,7 @@ program main enddo !$omp end parallel do - !$omp target data map(to: A,B) map(tofrom: C) map(from:dgemm_time) & - !$omp& map(to:iterations,order) + !$omp target data map(to: A,B) map(tofrom: C) map(to:order) t0 = 0 diff --git a/FORTRAN/nstream-openmp-target.f90 b/FORTRAN/nstream-openmp-target.f90 index 954a86b1e..13e77f628 100644 --- a/FORTRAN/nstream-openmp-target.f90 +++ b/FORTRAN/nstream-openmp-target.f90 @@ -163,8 +163,7 @@ program main enddo !$omp end parallel do simd - !$omp target data map(tofrom: A) map(to: B,C) map(from:nstream_time) & - !$omp& map(to:iterations,length) + !$omp target data map(tofrom: A) map(to: B,C) map(to:length) do k=0,iterations diff --git a/FORTRAN/stencil-openmp-target.f90 b/FORTRAN/stencil-openmp-target.f90 index f7724ada9..7bceb70e1 100644 --- a/FORTRAN/stencil-openmp-target.f90 +++ b/FORTRAN/stencil-openmp-target.f90 @@ -320,8 +320,7 @@ program main 
#endif !$omp end parallel - !$omp target data map(to:W, A) map(tofrom: B) map(from:stencil_time) & - !$omp& map(to:iterations,n) + !$omp target data map(to:W, A) map(tofrom: B) map(to:n) t0 = 0 diff --git a/FORTRAN/transpose-openmp-target.f90 b/FORTRAN/transpose-openmp-target.f90 index 1da28346a..a2c08e1a8 100644 --- a/FORTRAN/transpose-openmp-target.f90 +++ b/FORTRAN/transpose-openmp-target.f90 @@ -143,7 +143,7 @@ program main enddo !$omp end parallel do simd - !$omp target data map(to: A) map(tofrom: B) map(from:trans_time) + !$omp target data map(to: A) map(tofrom: B) t0 = 0 From a106364e221ac93ae284c18ea7b9ac8f9cb3059e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 30 Sep 2019 14:30:53 -0700 Subject: [PATCH 221/245] eliminate std::math in target region because not declare target (#415) also remove half-baked p2p --- Cxx11/p2p-openmp-target.cc | 185 ------------------------------- Cxx11/prk_openmp.h | 11 ++ Cxx11/transpose-openmp-target.cc | 4 +- 3 files changed, 13 insertions(+), 187 deletions(-) delete mode 100644 Cxx11/p2p-openmp-target.cc diff --git a/Cxx11/p2p-openmp-target.cc b/Cxx11/p2p-openmp-target.cc deleted file mode 100644 index a9220285f..000000000 --- a/Cxx11/p2p-openmp-target.cc +++ /dev/null @@ -1,185 +0,0 @@ -/// -/// Copyright (c) 2013, Intel Corporation -/// -/// Redistribution and use in source and binary forms, with or without -/// modification, are permitted provided that the following conditions -/// are met: -/// -/// * Redistributions of source code must retain the above copyright -/// notice, this list of conditions and the following disclaimer. -/// * Redistributions in binary form must reproduce the above -/// copyright notice, this list of conditions and the following -/// disclaimer in the documentation and/or other materials provided -/// with the distribution. 
-/// * Neither the name of Intel Corporation nor the names of its -/// contributors may be used to endorse or promote products -/// derived from this software without specific prior written -/// permission. -/// -/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -/// POSSIBILITY OF SUCH DAMAGE. - -////////////////////////////////////////////////////////////////////// -/// -/// NAME: Pipeline -/// -/// PURPOSE: This program tests the efficiency with which point-to-point -/// synchronization can be carried out. It does so by executing -/// a pipelined algorithm on an m*n grid. The first array dimension -/// is distributed among the threads (stripwise decomposition). -/// -/// USAGE: The program takes as input the -/// dimensions of the grid, and the number of iterations on the grid -/// -/// -/// -/// The output consists of diagnostics to make sure the -/// algorithm worked, and of timing statistics. -/// -/// FUNCTIONS CALLED: -/// -/// Other than standard C functions, the following -/// functions are used in this program: -/// -/// wtime() -/// -/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. -/// C99-ification by Jeff Hammond, February 2016. -/// C++11-ification by Jeff Hammond, May 2017. 
-/// -////////////////////////////////////////////////////////////////////// - -#include "prk_util.h" - -int main(int argc, char* argv[]) -{ - std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; - std::cout << "C++11/OpenMP TARGET DOACROSS pipeline execution on 2D grid" << std::endl; - - ////////////////////////////////////////////////////////////////////// - // Process and test input parameters - ////////////////////////////////////////////////////////////////////// - - int iterations; - int m, n; - try { - if (argc < 4){ - throw " <# iterations> "; - } - - // number of times to run the pipeline algorithm - iterations = std::atoi(argv[1]); - if (iterations < 1) { - throw "ERROR: iterations must be >= 1"; - } - - // grid dimensions - m = std::atoi(argv[2]); - n = std::atoi(argv[3]); - if (m < 1 || n < 1) { - throw "ERROR: grid dimensions must be positive"; - } else if ( static_cast(m)*static_cast(n) > INT_MAX) { - throw "ERROR: grid dimension too large - overflow risk"; - } - } - catch (const char * e) { - std::cout << e << std::endl; - return 1; - } - - std::cout << "Number of threads (max) = " << omp_get_max_threads() << std::endl; - std::cout << "Number of iterations = " << iterations << std::endl; - std::cout << "Grid sizes = " << m << ", " << n << std::endl; - - ////////////////////////////////////////////////////////////////////// - // Allocate space and perform the computation - ////////////////////////////////////////////////////////////////////// - - auto pipeline_time = 0.0; // silence compiler warning - - // working set - double * grid = new double[m*n]; - - OMP_PARALLEL() - { - OMP_FOR() - for (auto i=0; i(j); - } - for (auto i=0; i(i); - } - } - OMP_BARRIER - } - - OMP_TARGET( data map(tofrom:grid[0:m*n]) map(from:pipeline_time) ) - { - for (auto iter = 0; iter<=iterations; iter++) { - - if (iter==1) pipeline_time = omp_get_wtime(); - - OMP_PARALLEL() { - OMP_FOR( collapse(2) ordered(2) ) - for (auto i=1; i epsilon) { - 
std::cout << "ERROR: checksum " << grid[(m-1)*n+(n-1)] - << " does not match verification value " << corner_val << std::endl; - return 1; - } - -#ifdef VERBOSE - std::cout << "Solution validates; verification value = " << corner_val << std::endl; -#else - std::cout << "Solution validates" << std::endl; -#endif - auto avgtime = pipeline_time/iterations; - std::cout << "Rate (MFlops/s): " - << 2.0e-6 * ( (m-1.)*(n-1.) )/avgtime - << " Avg time (s): " << avgtime << std::endl; - - return 0; -} diff --git a/Cxx11/prk_openmp.h b/Cxx11/prk_openmp.h index 4d6396b9b..578e713e5 100644 --- a/Cxx11/prk_openmp.h +++ b/Cxx11/prk_openmp.h @@ -91,4 +91,15 @@ # define OMP_END_DECLARE_TARGET #endif +// used in OpenMP target code because std::min etc are not declare target +#ifndef MIN +#define MIN(x,y) ((x)<(y)?(x):(y)) +#endif +#ifndef MAX +#define MAX(x,y) ((x)>(y)?(x):(y)) +#endif +#ifndef ABS +#define ABS(a) ((a) >= 0 ? (a) : -(a)) +#endif + #endif /* PRK_OPENMP_H */ diff --git a/Cxx11/transpose-openmp-target.cc b/Cxx11/transpose-openmp-target.cc index 8702ec45b..b106e6f48 100644 --- a/Cxx11/transpose-openmp-target.cc +++ b/Cxx11/transpose-openmp-target.cc @@ -132,8 +132,8 @@ int main(int argc, char * argv[]) OMP_TARGET( teams distribute parallel for simd collapse(2) ) for (int it=0; it Date: Fri, 4 Aug 2017 14:44:32 -0700 Subject: [PATCH 222/245] add ignores --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index d4640217a..0b5905f43 100644 --- a/.gitignore +++ b/.gitignore @@ -118,6 +118,8 @@ C1z/p2p-hyperplane C1z/p2p-hyperplane-openmp C1z/p2p-tasks-openmp C1z/p2p-simd-openmp +C1z/p2p-avx +C1z/p2p-sse C1z/stencil C1z/stencil-cilk C1z/stencil-openmp From f0c24d7f16a0c3416146f60a8ccea6142617ebcb Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 30 Sep 2019 21:03:31 -0700 Subject: [PATCH 223/245] add example of all the crap required when using https://github.com/boostorg/boost --- common/make.defs.boost | 186 
+++++++++++++++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100644 common/make.defs.boost diff --git a/common/make.defs.boost b/common/make.defs.boost new file mode 100644 index 000000000..d9065433a --- /dev/null +++ b/common/make.defs.boost @@ -0,0 +1,186 @@ +# +# This file shows the GCC toolchain options for PRKs using +# OpenMP, MPI and/or Fortran coarrays only. +# +# Base compilers and language options +# +VERSION=-9 +# C99 is required in some implementations. +CC=gcc${VERSION} -std=c11 -pthread +#EXTRA_CLIBS=-lrt +# All of the Fortran code is written for the 2008 standard and requires preprocessing. +FC=gfortran${VERSION} -std=f2008 -cpp +# C++11 may not be required but does no harm here. +#CXX=g++${VERSION} -std=gnu++17 -pthread +CXX=clang++ -std=gnu++17 -pthread +# +# Compiler flags +# +# -mtune=native is appropriate for most cases. +# -march=native is appropriate if you want portable binaries. +DEFAULT_OPT_FLAGS=-O3 -mtune=native -ffast-math +#DEFAULT_OPT_FLAGS=-O0 +DEFAULT_OPT_FLAGS+=-g3 +#DEFAULT_OPT_FLAGS+=-fsanitize=undefined +#DEFAULT_OPT_FLAGS+=-fsanitize=undefined,leak +#DEFAULT_OPT_FLAGS+=-fsanitize=address +#DEFAULT_OPT_FLAGS+=-fsanitize=thread +# If you are compiling for KNL on a Xeon login node, use the following: +# DEFAULT_OPT_FLAGS=-g -O3 -march=knl +# See https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html for details. +# +#DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed +DEFAULT_OPT_FLAGS+=-Wall #-Werror +DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations +#DEFAULT_OPT_FLAGS+=-mavx -mfma +# +# OpenMP flags +# +OPENMPFLAG=-fopenmp +OPENMPSIMDFLAG=-fopenmp-simd +OFFLOADFLAG=-foffload="-O3 -v" +ORNLACCFLAG=-fopenacc +# +# OpenCL flags +# +# MacOS +#OPENCLFLAG=-framework OpenCL +# POCL +# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... 
+OPENCLFLAG=-I/opt/pocl/include -I/opt/pocl/share/pocl/include -L/opt/pocl/lib -Wl,-rpath -Wl,/opt/pocl/lib -lpocl +# Linux +#OPENCLDIR=/etc/alternatives/opencl-intel-tools +#OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +OPENCLFLAG+=-Wno-ignored-attributes -Wno-deprecated-declarations +METALFLAG=-framework MetalPerformanceShaders +# +# OCCA +# +#OCCADIR=${HOME}/prk-repo/Cxx11/occa +# +# Cilk +# +#CILKFLAG=-fcilkplus +# +# TBB +# +TBBDIR=/usr/local/Cellar/tbb/2019_U5_1 +TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +# +# Parallel STL, Boost, etc. +# +#BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include +BOOSTROOT=/Users/jrhammon/Work/Languages/boost/libs +BOOSTFLAG= +BOOSTFLAG+=-I${BOOSTROOT}/circular_buffer/include +BOOSTFLAG+=-I${BOOSTROOT}/compute/include +BOOSTFLAG+=-I${BOOSTROOT}/algorithm/include +BOOSTFLAG+=-I${BOOSTROOT}/config/include +BOOSTFLAG+=-I${BOOSTROOT}/core/include +BOOSTFLAG+=-I${BOOSTROOT}/log/include +BOOSTFLAG+=-I${BOOSTROOT}/array/include +BOOSTFLAG+=-I${BOOSTROOT}/multi_array/include +BOOSTFLAG+=-I${BOOSTROOT}/optional/include +BOOSTFLAG+=-I${BOOSTROOT}/preprocessor/include +BOOSTFLAG+=-I${BOOSTROOT}/type_index/include +BOOSTFLAG+=-I${BOOSTROOT}/utility/include +BOOSTFLAG+=-I${BOOSTROOT}/assert/include +BOOSTFLAG+=-I${BOOSTROOT}/static_assert/include +BOOSTFLAG+=-I${BOOSTROOT}/exception/include +BOOSTFLAG+=-I${BOOSTROOT}/throw_exception/include +BOOSTFLAG+=-I${BOOSTROOT}/concept_check/include +BOOSTFLAG+=-I${BOOSTROOT}/type_traits/include +BOOSTFLAG+=-I${BOOSTROOT}/iterator/include +BOOSTFLAG+=-I${BOOSTROOT}/mpl/include +BOOSTFLAG+=-I${BOOSTROOT}/detail/include +BOOSTFLAG+=-I${BOOSTROOT}/functional/include +BOOSTFLAG+=-I${BOOSTROOT}/move/include +BOOSTFLAG+=-I${BOOSTROOT}/range/include +BOOSTFLAG+=-I${BOOSTROOT}/function/include +BOOSTFLAG+=-I${BOOSTROOT}/integer/include +BOOSTFLAG+=-I${BOOSTROOT}/container_hash/include +BOOSTFLAG+=-I${BOOSTROOT}/bind/include +BOOSTFLAG+=-I${BOOSTROOT}/chrono/include 
+BOOSTFLAG+=-I${BOOSTROOT}/predef/include +BOOSTFLAG+=-I${BOOSTROOT}/ratio/include +BOOSTFLAG+=-I${BOOSTROOT}/function_types/include +BOOSTFLAG+=-I${BOOSTROOT}/tuple/include +BOOSTFLAG+=-I${BOOSTROOT}/lexical_cast/include +BOOSTFLAG+=-I${BOOSTROOT}/numeric/conversion/include +BOOSTFLAG+=-I${BOOSTROOT}/container/include +BOOSTFLAG+=-I${BOOSTROOT}/math/include +BOOSTFLAG+=-I${BOOSTROOT}/fusion/include +BOOSTFLAG+=-I${BOOSTROOT}/typeof/include +BOOSTFLAG+=-I${BOOSTROOT}/uuid/include +BOOSTFLAG+=-I${BOOSTROOT}/smart_ptr/include +BOOSTFLAG+=-I${BOOSTROOT}/proto/include +BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 +RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +##RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} +KOKKOSDIR=/opt/kokkos/gcc +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} +RAJADIR=/opt/raja/gcc +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust +THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# +# SYCL flags +# +# triSYCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
+SYCLDIR=./triSYCL +SYCLCXX=${CXX} -O3 -Wall -std=c++17 ${OPENMPFLAG} +SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL +# ProGTX +# https://github.com/ProGTX/sycl-gtx +#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx +#SYCLCXX=${CXX} ${OPENMPFLAG} +#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} +SYCLFLAG+=${RANGEFLAG} +# +# CBLAS for C++ DGEMM +# +BLASFLAG=-DACCELERATE -framework Accelerate +CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +# +# CUDA flags +# +# Mac w/ CUDA emulation via https://github.com/hughperkins/coriander +NVCC=/opt/llvm/cocl/bin/cocl +# Linux w/ NVIDIA CUDA +#NVCC=nvcc +#CUDAFLAGS=-g -O3 -std=c++11 -arch=sm_50 +# https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 +#CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +# +# Halide +# +HALIDECXX=c++ +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 -g3 +# +# ISPC +# +ISPC=ispc +ISPCFLAG=-O3 --target=host --opt=fast-math +# +# MPI +# +# We assume you have installed an implementation of MPI-3 that is in your path. 
+MPICC=mpicc -std=c99 +# +# Fortran 2008 coarrays +# +# see https://github.com/ParRes/Kernels/blob/master/FORTRAN/README.md for details +# single-node +COARRAYFLAG=-fcoarray=single -lcaf_single +# multi-node +# COARRAYFLAG=-fcoarray=lib -lcaf_mpi + +MEMKINDDIR=/home/parallels/PRK/deps +MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib From 2163dc8c2a853e12ed38dbe9051e5f98fcaa395b Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 1 Oct 2019 05:29:29 +0000 Subject: [PATCH 224/245] fix loop index in lambda, now required by USM --- Cxx11/nstream-sycl-usm.cc | 349 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 349 insertions(+) create mode 100644 Cxx11/nstream-sycl-usm.cc diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc new file mode 100644 index 000000000..1aed0931c --- /dev/null +++ b/Cxx11/nstream-sycl-usm.cc @@ -0,0 +1,349 @@ +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: nstream +/// +/// PURPOSE: To compute memory bandwidth when adding a vector of a given +/// number of double precision values to the scalar multiple of +/// another vector of the same length, and storing the result in +/// a third vector. +/// +/// USAGE: The program takes as input the number +/// of iterations to loop over the triad vectors, the length of the +/// vectors, and the offset between vectors +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// NOTES: Bandwidth is determined as the number of words read, plus the +/// number of words written, times the size of the words, divided +/// by the execution time. For a vector length of N, the total +/// number of words read and written is 4*N*sizeof(double). +/// +/// +/// HISTORY: This code is loosely based on the Stream benchmark by John +/// McCalpin, but does not follow all the Stream rules. Hence, +/// reported results should not be associated with Stream in +/// external publications +/// +/// Converted to C++11 by Jeff Hammond, November 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "CL/sycl.hpp" +#include "prk_util.h" + +namespace sycl = cl::sycl; + +#if 0 +#include "prk_opencl.h" +#define USE_OPENCL 1 +#endif + +template class nstream; + +template +void run(sycl::queue & q, int iterations, size_t length) +{ + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double nstream_time(0); + + const T scalar(3); + + //std::vector h_A(length,0); + //std::vector h_B(length,2); + //std::vector h_C(length,2); + + T * A; + T * B; + T * C; + + try { + + auto ctx = q.get_context(); + auto dev = q.get_device(); + +#if PREBUILD_KERNEL + sycl::program kernel(ctx); + kernel.build_with_kernel_type>(); +#endif + + //sycl::buffer d_A { h_A.data(), sycl::range<1>(h_A.size()) }; + //sycl::buffer d_B { h_B.data(), sycl::range<1>(h_B.size()) }; + //sycl::buffer d_C { h_C.data(), sycl::range<1>(h_C.size()) }; + + A = static_cast(sycl::malloc_shared(length * sizeof(T), dev, ctx)); + B = static_cast(sycl::malloc_shared(length * sizeof(T), dev, ctx)); + C = static_cast(sycl::malloc_shared(length * sizeof(T), dev, ctx)); + + for (size_t i=0; i(h); + //auto B = d_B.template get_access(h); + //auto C = d_C.template get_access(h); + + h.parallel_for>( +#if PREBUILD_KERNEL + kernel.get_kernel>(), +#endif + sycl::range<1>{length}, [=] (sycl::id<1> it) { + const size_t i = it[0]; + A[i] += B[i] + scalar * C[i]; + }); + }); + q.wait(); + } + + // Stop timer before buffer+accessor destructors fire, + // since that will move data, and we do not time that + // for other device-oriented programming models. 
+ nstream_time = prk::wtime() - nstream_time; + + sycl::free(A, ctx); + sycl::free(B, ctx); + sycl::free(C, ctx); + + } + catch (sycl::exception & e) { + std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; +#endif + return; + } + catch (std::exception & e) { + std::cout << e.what() << std::endl; + return; + } + catch (const char * e) { + std::cout << e << std::endl; + return; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + T ar(0); + T br(2); + T cr(2); + for (int i=0; i<=iterations; ++i) { + ar += br + scalar * cr; + } + + ar *= length; + + double asum(0); + for (size_t i=0; i epsilon) { + std::cout << "Failed Validation on output array\n" + << " Expected checksum: " << ar << "\n" + << " Observed checksum: " << asum << std::endl; + std::cout << "ERROR: solution did not validate" << std::endl; + } else { + std::cout << "Solution validates" << std::endl; + double avgtime = nstream_time/iterations; + double nbytes = 4.0 * length * sizeof(T); + std::cout << 8*sizeof(T) << "B " + << "Rate (MB/s): " << 1.e-6*nbytes/avgtime + << " Avg time (s): " << avgtime << std::endl; + } +} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL STREAM triad: A = B + scalar * C" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations, offset; + size_t length; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + iterations = 
std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + length = std::atol(argv[2]); + if (length <= 0) { + throw "ERROR: vector length must be positive"; + } + + offset = (argc>3) ? std::atoi(argv[3]) : 0; + if (length <= 0) { + throw "ERROR: offset must be nonnegative"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Vector length = " << length << std::endl; + std::cout << "Offset = " << offset << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + +#ifdef USE_OPENCL + prk::opencl::listPlatforms(); +#endif + + try { +#if SYCL_TRY_CPU_QUEUE + if (length<100000) { + sycl::queue host(sycl::host_selector{}); +#ifndef TRISYCL + auto device = host.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +#endif + run(host, iterations, length); + run(host, iterations, length); + } else { + std::cout << "Skipping host device since it is too slow for large problems" << std::endl; + } +#endif + + // CPU requires spir64 target +#if SYCL_TRY_CPU_QUEUE + if (1) { + sycl::queue cpu(sycl::cpu_selector{}); +#if !defined(TRISYCL) && !defined(__HIPSYCL__) + auto device = cpu.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + bool has_spir = device.has_extension(sycl::string_class("cl_khr_spir")); +#else + bool has_spir = true; // ? 
+#endif + if (has_spir) { + run(cpu, iterations, length); + run(cpu, iterations, length); + } + } +#endif + // NVIDIA GPU requires ptx64 target and does not work very well +#if SYCL_TRY_GPU_QUEUE + if (1) { + sycl::queue gpu(sycl::gpu_selector{}); +#if !defined(TRISYCL) && !defined(__HIPSYCL__) + auto device = gpu.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; + bool has_spir = device.has_extension(sycl::string_class("cl_khr_spir")); + bool has_fp64 = device.has_extension(sycl::string_class("cl_khr_fp64")); +#else + bool has_spir = true; // ? + bool has_fp64 = true; +#endif + if (!has_fp64) { + std::cout << "SYCL GPU device lacks FP64 support." << std::endl; + } + if (has_spir) { + run(gpu, iterations, length); + if (has_fp64) { + run(gpu, iterations, length); + } + } else { + std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; +#ifdef __COMPUTECPP__ + std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; + run(gpu, iterations, length); + if (has_fp64) { + run(gpu, iterations, length); + } +#endif + } + } +#endif + } + catch (sycl::exception & e) { + std::cout << e.what() << std::endl; +#ifdef __COMPUTECPP__ + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; +#endif + return 1; + } + catch (std::exception & e) { + std::cout << e.what() << std::endl; + return 1; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + return 0; +} + + From 39c8935e324a92d5952019ed9fcbe5f395071d48 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 4 Oct 2019 14:48:23 -0700 Subject: [PATCH 225/245] switch default LLVM SYCL to triSYCL and other fixes --- common/make.defs.llvm | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 180664d73..318e64595 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -58,10 +58,10 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib # SYCL flags # # Intel SYCL - https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md -SYCLDIR=/opt/isycl -SYCLCXX=${SYCLDIR}/bin/clang++ -SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib -SYCLFLAG+=-std=c++17 -O3 +#SYCLDIR=/opt/isycl +#SYCLCXX=${SYCLDIR}/bin/clang++ +#SYCLFLAG=-fsycl -lsycl -lOpenCL -Wl,-rpath=${SYCLDIR}/lib +#SYCLFLAG+=-std=c++17 -O3 # CodePlay ComputeCpp #SYCLDIR=/opt/sycl/latest #SYCLCXX=${SYCLDIR}/bin/compute++ @@ -80,9 +80,10 @@ SYCLFLAG+=-std=c++17 -O3 # # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-#SYCLDIR=./triSYCL +SYCLDIR=./triSYCL #SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) -#SYCLFLAG=-std=gnu++14 -I$(SYCLDIR)/include +SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) +SYCLFLAG=-std=gnu++17 -I$(SYCLDIR)/include -DTRISYCL # ProGTX # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx From 619f7cac65053dbed7002e0cf18b6a2de83114e0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 8 Oct 2019 21:54:06 -0700 Subject: [PATCH 226/245] more SYCL USM (#422) * add example of all the crap required when using https://github.com/boostorg/boost * fix loop index in lambda, now required by USM * clean SYCL USM * cleanup and homogenize nstream sycl and sycl-usm * cleanup and homogenize nstream sycl and sycl-usm and explicit-sycl * USM transpose * create SYCL util header and factor out a bunch of preprocessor crap * SYCL USM stencil * rename to avoid failed builds with triSYCL * use prk::SYCL namespace for SYCL utils * fix example make.defs * remove trailing whitespace * Stencil SYCL USM * fix build system for SYCL explicit * fix scope issue * fix sycl codegen --- Cxx11/Makefile | 18 +- Cxx11/generate-sycl-stencil.py | 61 ++- ...licit-sycl.cc => nstream-sycl-explicit.cc} | 131 ++--- Cxx11/nstream-sycl-usm.cc | 103 +--- Cxx11/nstream-sycl.cc | 115 ++--- Cxx11/p2p-hyperplane-sycl.cc | 39 +- Cxx11/prk_sycl.h | 104 ++++ Cxx11/prk_util.h | 29 -- Cxx11/stencil-sycl-usm.cc | 341 +++++++++++++ Cxx11/stencil-sycl.cc | 144 ++---- Cxx11/stencil_sycl.hpp | 470 ++++++++++++------ ...cit-sycl.cc => transpose-sycl-explicit.cc} | 149 ++---- Cxx11/transpose-sycl-usm.cc | 276 ++++++++++ Cxx11/transpose-sycl.cc | 123 ++--- common/README.freebsd | 2 +- common/make.defs.gcc | 44 +- common/make.defs.llvm | 23 +- 17 files changed, 1370 insertions(+), 802 deletions(-) rename Cxx11/{nstream-explicit-sycl.cc => nstream-sycl-explicit.cc} (61%) create mode 100644 Cxx11/prk_sycl.h create mode 100644 Cxx11/stencil-sycl-usm.cc rename Cxx11/{transpose-explicit-sycl.cc => 
transpose-sycl-explicit.cc} (56%) create mode 100644 Cxx11/transpose-sycl-usm.cc diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 1bb8d88ce..84665feaf 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -69,7 +69,7 @@ ifneq ($(findstring pgc++,$(CXX)),pgc++) EXTRA += tbb pstl endif -all: sequential vector valarray openmp taskloop stl rangefor kokkos opencl sycl boost-compute $(EXTRA) # raja +all: sequential vector valarray openmp taskloop stl rangefor opencl sycl boost-compute $(EXTRA) #p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ @@ -105,7 +105,11 @@ taskloop: stencil-vector-taskloop transpose-vector-taskloop nstream-vector-taskl opencl: p2p-innerloop-opencl stencil-opencl transpose-opencl nstream-opencl -sycl: p2p-hyperplane-sycl stencil-sycl transpose-sycl nstream-sycl transpose-explicit-sycl nstream-explicit-sycl +sycl: nstream-sycl p2p-hyperplane-sycl stencil-sycl transpose-sycl + +sycl-usm: nstream-sycl-usm stencil-sycl-usm transpose-sycl-usm + +sycl-explicit: nstream-sycl-explicit transpose-sycl-explicit tbb: p2p-innerloop-vector-tbb p2p-vector-tbb stencil-vector-tbb transpose-vector-tbb nstream-vector-tbb \ p2p-hyperplane-vector-tbb p2p-tasks-tbb @@ -150,11 +154,17 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ %-opencl: %-opencl.cc prk_util.h prk_opencl.h - $(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ + $(SYCLCXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ %-sycl: %-sycl.cc prk_util.h $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@ +%-sycl-usm: %-sycl-usm.cc prk_util.h + $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@ + +%-sycl-explicit: %-sycl-explicit.cc prk_util.h + $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@ + %-target: %-target.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(OMPFLAGS) $(TARGETFLAGS) -o $@ @@ -235,6 +245,8 @@ clean: -rm -f *-taskloop -rm 
-f *-opencl -rm -f *-sycl + -rm -f *-sycl-explicit + -rm -f *-sycl-usm -rm -f *-tbb -rm -f *-stl -rm -f *-pstl diff --git a/Cxx11/generate-sycl-stencil.py b/Cxx11/generate-sycl-stencil.py index d88cae37b..404b9edfc 100755 --- a/Cxx11/generate-sycl-stencil.py +++ b/Cxx11/generate-sycl-stencil.py @@ -5,31 +5,39 @@ import string import os -def codegen(src,pattern,stencil_size,radius,model,dim): +def codegen(src,pattern,stencil_size,radius,model,dim,usm): src.write('// declare the kernel name used in SYCL parallel_for\n') - src.write('template class '+pattern+str(radius)+'_'+str(dim)+'d;\n\n') + if (usm): + kernel_name = pattern+str(radius)+'_usm' + else: + kernel_name = pattern+str(radius)+'_'+str(dim)+'d' + src.write('template class '+kernel_name+';\n\n') src.write('template \n') - src.write('void '+pattern+str(radius)+'(cl::sycl::queue & q, const size_t n, ') - if (dim==2): - src.write('cl::sycl::buffer & d_in, ') - src.write('cl::sycl::buffer & d_out)\n') + src.write('void '+pattern+str(radius)+'(sycl::queue & q, const size_t n, ') + if (usm): + src.write('const T * in, ') + src.write('T * out)\n') + elif (dim==2): + src.write('sycl::buffer & d_in, ') + src.write('sycl::buffer & d_out)\n') else: - src.write('cl::sycl::buffer & d_in, ') - src.write('cl::sycl::buffer & d_out)\n') + src.write('sycl::buffer & d_in, ') + src.write('sycl::buffer & d_out)\n') src.write('{\n') - src.write(' q.submit([&](cl::sycl::handler& h) {\n') - src.write(' auto in = d_in.template get_access(h);\n') - src.write(' auto out = d_out.template get_access(h);\n') + src.write(' q.submit([&](sycl::handler& h) {\n') + if (not usm): + src.write(' auto in = d_in.template get_access(h);\n') + src.write(' auto out = d_out.template get_access(h);\n') if (dim==2): for r in range(1,radius+1): - src.write(' cl::sycl::id<2> dx'+str(r)+'(cl::sycl::range<2> {'+str(r)+',0});\n') - src.write(' cl::sycl::id<2> dy'+str(r)+'(cl::sycl::range<2> {0,'+str(r)+'});\n') - src.write(' h.parallel_for>(') - 
src.write('cl::sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ') - src.write('cl::sycl::id<2> {'+str(radius)+','+str(radius)+'}, ') - src.write('[=] (cl::sycl::item<2> it) {\n') + src.write(' sycl::id<2> dx'+str(r)+'(sycl::range<2> {'+str(r)+',0});\n') + src.write(' sycl::id<2> dy'+str(r)+'(sycl::range<2> {0,'+str(r)+'});\n') + src.write(' h.parallel_for>(') + src.write('sycl::range<2> {n-'+str(2*radius)+',n-'+str(2*radius)+'}, ') + src.write('sycl::id<2> {'+str(radius)+','+str(radius)+'}, ') + src.write('[=] (sycl::item<2> it) {\n') if (dim==2): - src.write(' cl::sycl::id<2> xy = it.get_id();\n') + src.write(' sycl::id<2> xy = it.get_id();\n') src.write(' out[xy] += ') else: # 1D indexing the slow way @@ -37,7 +45,9 @@ def codegen(src,pattern,stencil_size,radius,model,dim): #src.write(' auto j = it[1];\n') #src.write(' out[i*n+j] += ') # 1D indexing the fast way - src.write(' out[it[0]*n+it[1]] += ') + src.write(' const auto i = it[0];\n') + src.write(' const auto j = it[1];\n') + src.write(' out[i*n+j] += ') if pattern == 'star': for i in range(1,radius+1): if (dim==2): @@ -67,13 +77,13 @@ def codegen(src,pattern,stencil_size,radius,model,dim): if i > 1: src.write('\n') src.write(30*' ') - src.write('+in[it[0]*n+(it[1]+'+str(i)+')] * static_cast('+str(+1./(2.*i*radius))+')') + src.write('+in[i*n+(j+'+str(i)+')] * static_cast('+str(+1./(2.*i*radius))+')') src.write('\n'+30*' ') - src.write('+in[it[0]*n+(it[1]-'+str(i)+')] * static_cast('+str(-1./(2.*i*radius))+')') + src.write('+in[i*n+(j-'+str(i)+')] * static_cast('+str(-1./(2.*i*radius))+')') src.write('\n'+30*' ') - src.write('+in[(it[0]+'+str(i)+')*n+it[1]] * static_cast('+str(+1./(2.*i*radius))+')') + src.write('+in[(i+'+str(i)+')*n+j] * static_cast('+str(+1./(2.*i*radius))+')') src.write('\n'+30*' ') - src.write('+in[(it[0]-'+str(i)+')*n+it[1]] * static_cast('+str(-1./(2.*i*radius))+')') + src.write('+in[(i-'+str(i)+')*n+j] * static_cast('+str(-1./(2.*i*radius))+')') if i == radius: 
src.write(';\n') else: @@ -87,8 +97,9 @@ def instance(src,model,pattern,r): stencil_size = 4*r+1 else: stencil_size = (2*r+1)**2 - codegen(src,pattern,stencil_size,r,model,1) - codegen(src,pattern,stencil_size,r,model,2) + codegen(src,pattern,stencil_size,r,model,1,False) + codegen(src,pattern,stencil_size,r,model,2,False) + codegen(src,pattern,stencil_size,r,model,1,True) def main(): for model in ['sycl']: diff --git a/Cxx11/nstream-explicit-sycl.cc b/Cxx11/nstream-sycl-explicit.cc similarity index 61% rename from Cxx11/nstream-explicit-sycl.cc rename to Cxx11/nstream-sycl-explicit.cc index 5201a48b1..ef2a0392b 100644 --- a/Cxx11/nstream-explicit-sycl.cc +++ b/Cxx11/nstream-sycl-explicit.cc @@ -62,18 +62,13 @@ /// ////////////////////////////////////////////////////////////////////// -#include "CL/sycl.hpp" +#include "prk_sycl.h" #include "prk_util.h" -#if 0 -#include "prk_opencl.h" -#define USE_OPENCL 1 -#endif - template class nstream; template -void run(cl::sycl::queue & q, int iterations, size_t length) +void run(sycl::queue & q, int iterations, size_t length) { ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation @@ -87,25 +82,28 @@ void run(cl::sycl::queue & q, int iterations, size_t length) try { + auto ctx = q.get_context(); + auto dev = q.get_device(); + #if PREBUILD_KERNEL - cl::sycl::program kernel(q.get_context()); + sycl::program kernel(ctx); kernel.build_with_kernel_type>(); #endif - cl::sycl::buffer d_A { cl::sycl::range<1>{length} }; - cl::sycl::buffer d_B { cl::sycl::range<1>{length} }; - cl::sycl::buffer d_C { cl::sycl::range<1>{length} }; + sycl::buffer d_A { sycl::range<1>{length} }; + sycl::buffer d_B { sycl::range<1>{length} }; + sycl::buffer d_C { sycl::range<1>{length} }; - q.submit([&](cl::sycl::handler& h) { - cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + q.submit([&](sycl::handler& h) { + sycl::accessor A(d_A, h, sycl::range<1>(length), 
sycl::id<1>(0)); h.fill(A,(T)0); }); - q.submit([&](cl::sycl::handler& h) { - cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + q.submit([&](sycl::handler& h) { + sycl::accessor B(d_B, h, sycl::range<1>(length), sycl::id<1>(0)); h.fill(B,(T)2); }); - q.submit([&](cl::sycl::handler& h) { - cl::sycl::accessor C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + q.submit([&](sycl::handler& h) { + sycl::accessor C(d_C, h, sycl::range<1>(length), sycl::id<1>(0)); h.fill(C,(T)2); }); q.wait(); @@ -114,17 +112,18 @@ void run(cl::sycl::queue & q, int iterations, size_t length) if (iter==1) nstream_time = prk::wtime(); - q.submit([&](cl::sycl::handler& h) { + q.submit([&](sycl::handler& h) { - cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); - cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); - cl::sycl::accessor C(d_C, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + sycl::accessor A(d_A, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor B(d_B, h, sycl::range<1>(length), sycl::id<1>(0)); + sycl::accessor C(d_C, h, sycl::range<1>(length), sycl::id<1>(0)); h.parallel_for>( #if PREBUILD_KERNEL kernel.get_kernel>(), #endif - cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { + sycl::range<1>{length}, [=] (sycl::id<1> it) { + const size_t i = it[0]; A[i] += B[i] + scalar * C[i]; }); }); @@ -136,21 +135,15 @@ void run(cl::sycl::queue & q, int iterations, size_t length) // for other device-oriented programming models. 
nstream_time = prk::wtime() - nstream_time; - q.submit([&](cl::sycl::handler& h) { - cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(length), cl::sycl::id<1>(0)); + q.submit([&](sycl::handler& h) { + sycl::accessor A(d_A, h, sycl::range<1>(length), sycl::id<1>(0)); h.copy(A,h_A.data()); }); q.wait(); } - catch (cl::sycl::exception & e) { + catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return; } catch (std::exception & e) { @@ -247,15 +240,10 @@ int main(int argc, char * argv[]) try { #if SYCL_TRY_CPU_QUEUE if (length<100000) { - cl::sycl::queue host(cl::sycl::host_selector{}); -#ifndef TRISYCL - auto device = host.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; -#endif - run(host, iterations, length); - run(host, iterations, length); + sycl::queue q(sycl::host_selector{}); + prk::SYCL::print_device_platform(q); + run(q, iterations, length); + run(q, iterations, length); } else { std::cout << "Skipping host device since it is too slow for large problems" << std::endl; } @@ -264,68 +252,39 @@ int main(int argc, char * argv[]) // CPU requires spir64 target #if SYCL_TRY_CPU_QUEUE if (1) { - cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = cpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); -#else - bool has_spir = true; 
// ? -#endif + sycl::queue q(sycl::cpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); if (has_spir) { - run(cpu, iterations, length); - run(cpu, iterations, length); + run(q, iterations, length); + run(q, iterations, length); } } #endif - // NVIDIA GPU requires ptx64 target and does not work very well + // NVIDIA GPU requires ptx64 target #if SYCL_TRY_GPU_QUEUE if (1) { - cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = gpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); - bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); -#else - bool has_spir = true; // ? - bool has_fp64 = true; -#endif + sycl::queue q(sycl::gpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); + bool has_fp64 = prk::SYCL::has_fp64(q); + bool has_ptx = prk::SYCL::has_ptx(q); if (!has_fp64) { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } - if (has_spir) { - run(gpu, iterations, length); - if (has_fp64) { - run(gpu, iterations, length); - } - } else { - std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; -#ifdef __COMPUTECPP__ - std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; - run(gpu, iterations, length); + if (has_spir || has_ptx) { + run(q, iterations, length); if (has_fp64) { - run(gpu, iterations, length); + run(q, iterations, length); } -#endif } } #endif } - catch (cl::sycl::exception & e) { + catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return 1; } catch (std::exception & e) { diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc index 1aed0931c..c92a52bc9 100644 --- a/Cxx11/nstream-sycl-usm.cc +++ b/Cxx11/nstream-sycl-usm.cc @@ -62,16 +62,9 @@ /// ////////////////////////////////////////////////////////////////////// -#include "CL/sycl.hpp" +#include "prk_sycl.h" #include "prk_util.h" -namespace sycl = cl::sycl; - -#if 0 -#include "prk_opencl.h" -#define USE_OPENCL 1 -#endif - template class nstream; template @@ -85,10 +78,6 @@ void run(sycl::queue & q, int iterations, size_t length) const T scalar(3); - //std::vector h_A(length,0); - //std::vector h_B(length,2); - //std::vector h_C(length,2); - T * A; T * B; T * C; @@ -103,10 +92,6 @@ void run(sycl::queue & q, int iterations, size_t length) kernel.build_with_kernel_type>(); #endif - //sycl::buffer d_A { h_A.data(), sycl::range<1>(h_A.size()) }; - //sycl::buffer d_B { h_B.data(), sycl::range<1>(h_B.size()) }; - //sycl::buffer d_C { h_C.data(), sycl::range<1>(h_C.size()) }; - A = static_cast(sycl::malloc_shared(length * sizeof(T), dev, ctx)); B = static_cast(sycl::malloc_shared(length * sizeof(T), dev, ctx)); C = static_cast(sycl::malloc_shared(length * sizeof(T), dev, ctx)); @@ -122,11 +107,6 @@ void run(sycl::queue & q, int iterations, size_t length) if (iter==1) nstream_time = prk::wtime(); q.submit([&](sycl::handler& 
h) { - - //auto A = d_A.template get_access(h); - //auto B = d_B.template get_access(h); - //auto C = d_C.template get_access(h); - h.parallel_for>( #if PREBUILD_KERNEL kernel.get_kernel>(), @@ -151,13 +131,7 @@ void run(sycl::queue & q, int iterations, size_t length) } catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return; } catch (std::exception & e) { @@ -254,15 +228,10 @@ int main(int argc, char * argv[]) try { #if SYCL_TRY_CPU_QUEUE if (length<100000) { - sycl::queue host(sycl::host_selector{}); -#ifndef TRISYCL - auto device = host.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; -#endif - run(host, iterations, length); - run(host, iterations, length); + sycl::queue q(sycl::host_selector{}); + prk::SYCL::print_device_platform(q); + run(q, iterations, length); + run(q, iterations, length); } else { std::cout << "Skipping host device since it is too slow for large problems" << std::endl; } @@ -271,67 +240,39 @@ int main(int argc, char * argv[]) // CPU requires spir64 target #if SYCL_TRY_CPU_QUEUE if (1) { - sycl::queue cpu(sycl::cpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = cpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(sycl::string_class("cl_khr_spir")); -#else - bool has_spir = true; // ? 
-#endif + sycl::queue q(sycl::cpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); if (has_spir) { - run(cpu, iterations, length); - run(cpu, iterations, length); + run(q, iterations, length); + run(q, iterations, length); } } #endif - // NVIDIA GPU requires ptx64 target and does not work very well + + // NVIDIA GPU requires ptx64 target #if SYCL_TRY_GPU_QUEUE if (1) { - sycl::queue gpu(sycl::gpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = gpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(sycl::string_class("cl_khr_spir")); - bool has_fp64 = device.has_extension(sycl::string_class("cl_khr_fp64")); -#else - bool has_spir = true; // ? - bool has_fp64 = true; -#endif + sycl::queue q(sycl::gpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); + bool has_fp64 = prk::SYCL::has_fp64(q); + bool has_ptx = prk::SYCL::has_ptx(q); if (!has_fp64) { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } - if (has_spir) { - run(gpu, iterations, length); + if (has_spir || has_ptx) { + run(q, iterations, length); if (has_fp64) { - run(gpu, iterations, length); + run(q, iterations, length); } - } else { - std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; -#ifdef __COMPUTECPP__ - std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; - run(gpu, iterations, length); - if (has_fp64) { - run(gpu, iterations, length); - } -#endif } } #endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return 1; } catch (std::exception & e) { diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index 2657f7200..bc52e6649 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -62,18 +62,13 @@ /// ////////////////////////////////////////////////////////////////////// -#include "CL/sycl.hpp" +#include "prk_sycl.h" #include "prk_util.h" -#if 0 -#include "prk_opencl.h" -#define USE_OPENCL 1 -#endif - template class nstream; template -void run(cl::sycl::queue & q, int iterations, size_t length) +void run(sycl::queue & q, int iterations, size_t length) { ////////////////////////////////////////////////////////////////////// // Allocate space and perform the computation @@ -89,30 +84,33 @@ void run(cl::sycl::queue & q, int iterations, size_t length) try { + auto ctx = q.get_context(); + #if PREBUILD_KERNEL - cl::sycl::program kernel(q.get_context()); + sycl::program kernel(ctx); kernel.build_with_kernel_type>(); #endif - cl::sycl::buffer d_A { h_A.data(), cl::sycl::range<1>(h_A.size()) }; - cl::sycl::buffer d_B { h_B.data(), cl::sycl::range<1>(h_B.size()) }; - cl::sycl::buffer d_C { h_C.data(), cl::sycl::range<1>(h_C.size()) }; + sycl::buffer d_A { h_A.data(), sycl::range<1>(h_A.size()) }; + sycl::buffer d_B { h_B.data(), sycl::range<1>(h_B.size()) }; + sycl::buffer d_C { h_C.data(), sycl::range<1>(h_C.size()) }; for (int iter = 0; iter<=iterations; ++iter) { if (iter==1) nstream_time = prk::wtime(); - q.submit([&](cl::sycl::handler& h) { + 
q.submit([&](sycl::handler& h) { - auto A = d_A.template get_access(h); - auto B = d_B.template get_access(h); - auto C = d_C.template get_access(h); + auto A = d_A.template get_access(h); + auto B = d_B.template get_access(h); + auto C = d_C.template get_access(h); h.parallel_for>( #if PREBUILD_KERNEL kernel.get_kernel>(), #endif - cl::sycl::range<1>{length}, [=] (cl::sycl::item<1> i) { + sycl::range<1>{length}, [=] (sycl::id<1> it) { + const size_t i = it[0]; A[i] += B[i] + scalar * C[i]; }); }); @@ -124,15 +122,9 @@ void run(cl::sycl::queue & q, int iterations, size_t length) // for other device-oriented programming models. nstream_time = prk::wtime() - nstream_time; } - catch (cl::sycl::exception & e) { + catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return; } catch (std::exception & e) { @@ -229,15 +221,10 @@ int main(int argc, char * argv[]) try { #if SYCL_TRY_CPU_QUEUE if (length<100000) { - cl::sycl::queue host(cl::sycl::host_selector{}); -#ifndef TRISYCL - auto device = host.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; -#endif - run(host, iterations, length); - run(host, iterations, length); + sycl::queue q(sycl::host_selector{}); + prk::SYCL::print_device_platform(q); + run(q, iterations, length); + run(q, iterations, length); } else { std::cout << "Skipping host device since it is too slow for large problems" << std::endl; } @@ -246,67 +233,39 @@ int main(int argc, char * argv[]) // CPU requires spir64 target #if SYCL_TRY_CPU_QUEUE if (1) { - cl::sycl::queue 
cpu(cl::sycl::cpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = cpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); -#else - bool has_spir = true; // ? -#endif + sycl::queue q(sycl::cpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); if (has_spir) { - run(cpu, iterations, length); - run(cpu, iterations, length); + run(q, iterations, length); + run(q, iterations, length); } } #endif - // NVIDIA GPU requires ptx64 target and does not work very well + + // NVIDIA GPU requires ptx64 target #if SYCL_TRY_GPU_QUEUE if (1) { - cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = gpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); - bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); -#else - bool has_spir = true; // ? - bool has_fp64 = true; -#endif + sycl::queue q(sycl::gpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); + bool has_fp64 = prk::SYCL::has_fp64(q); + bool has_ptx = prk::SYCL::has_ptx(q); if (!has_fp64) { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } - if (has_spir) { - run(gpu, iterations, length); - if (has_fp64) { - run(gpu, iterations, length); - } - } else { - std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; -#ifdef __COMPUTECPP__ - std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; - run(gpu, iterations, length); + if (has_spir || has_ptx) { + run(q, iterations, length); if (has_fp64) { - run(gpu, iterations, length); + run(q, iterations, length); } -#endif } } #endif } - catch (cl::sycl::exception & e) { + catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return 1; } catch (std::exception & e) { diff --git a/Cxx11/p2p-hyperplane-sycl.cc b/Cxx11/p2p-hyperplane-sycl.cc index a738beffa..05e3adeb5 100644 --- a/Cxx11/p2p-hyperplane-sycl.cc +++ b/Cxx11/p2p-hyperplane-sycl.cc @@ -59,8 +59,7 @@ /// ////////////////////////////////////////////////////////////////////// -#include "CL/sycl.hpp" - +#include "prk_sycl.h" #include "prk_util.h" #include "p2p-kernel.h" @@ -131,9 +130,9 @@ int main(int argc, char* argv[]) h_grid[j*n+0] = static_cast(j); } - cl::sycl::queue q; + sycl::queue q; { - cl::sycl::buffer d_grid { h_grid.data(), h_grid.size() }; + sycl::buffer d_grid { h_grid.data(), h_grid.size() }; for (auto iter = 0; iter<=iterations; iter++) { @@ -141,36 +140,36 @@ int main(int argc, char* argv[]) for (int i=2; i<=2*n-2; i++) { - cl::sycl::id<1> I{unsigned(i)}; - cl::sycl::id<1> One{1}; + sycl::id<1> I{unsigned(i)}; + sycl::id<1> One{1}; - q.submit([&](cl::sycl::handler& h) { + q.submit([&](sycl::handler& h) { - auto grid = d_grid.get_access(h); + auto grid = d_grid.get_access(h); unsigned begin = std::max(2,i-n+2); unsigned end = std::min(i,n)+1; unsigned range = end-begin; - h.parallel_for(cl::sycl::range<1>{range}, cl::sycl::id<1>{begin}, [=] (cl::sycl::item<1> j) { + h.parallel_for(sycl::range<1>{range}, sycl::id<1>{begin}, [=] (sycl::item<1> j) { auto J = j.get_id(); - cl::sycl::id<1> N{unsigned(n)}; - 
cl::sycl::id<1> X{I-J+One}; - cl::sycl::id<1> Y{J-One}; - cl::sycl::id<1> Xold{X-One}; // x-1 - cl::sycl::id<1> Yold{Y-One}; // y-1 - cl::sycl::id<1> index0{X*N+Y}; - cl::sycl::id<1> index1{Xold*N+Y}; - cl::sycl::id<1> index2{X*N+Yold}; - cl::sycl::id<1> index3{Xold*N+Yold}; + sycl::id<1> N{unsigned(n)}; + sycl::id<1> X{I-J+One}; + sycl::id<1> Y{J-One}; + sycl::id<1> Xold{X-One}; // x-1 + sycl::id<1> Yold{Y-One}; // y-1 + sycl::id<1> index0{X*N+Y}; + sycl::id<1> index1{Xold*N+Y}; + sycl::id<1> index2{X*N+Yold}; + sycl::id<1> index3{Xold*N+Yold}; grid[index0] = grid[index1] + grid[index2] - grid[index3]; }); }); q.wait(); } - q.submit([&](cl::sycl::handler& h) { + q.submit([&](sycl::handler& h) { - auto grid = d_grid.get_access(h); + auto grid = d_grid.get_access(h); h.single_task([=] { grid[0*n+0] = -grid[(n-1)*n+(n-1)]; diff --git a/Cxx11/prk_sycl.h b/Cxx11/prk_sycl.h new file mode 100644 index 000000000..cdd18d211 --- /dev/null +++ b/Cxx11/prk_sycl.h @@ -0,0 +1,104 @@ +#ifndef PRK_SYCL_HPP +#define PRK_SYCL_HPP + +#include +#include + +#include "CL/sycl.hpp" + +namespace sycl = cl::sycl; + +// prebuilt kernels are not required/not fully supported on hipSYCL and triSYCL +#if defined(TRISYCL) || defined(__HIPSYCL__) +#define PREBUILD_KERNEL 0 +#else +#define PREBUILD_KERNEL 1 +#endif + +// not all SYCL implementations may support all device types. +// If an implementation does not find any devices based on a +// device selector, it will throw an exception. +// These macros can be used to check if there's any chance +// of an implementation targeting a CPU and GPU. 
+#if !defined(__HIPSYCL__) || defined(HIPSYCL_PLATFORM_CPU) +#define SYCL_TRY_CPU_QUEUE 1 +#else +#define SYCL_TRY_CPU_QUEUE 0 +#endif + +// !defined(HIPSYCL_PLATFORM_CPU) = !( defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC) ) +#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_CPU) +#define SYCL_TRY_GPU_QUEUE 1 +#else +#define SYCL_TRY_GPU_QUEUE 0 +#endif + +#if 0 +#include "prk_opencl.h" +#define USE_OPENCL 1 +#endif + +namespace prk { + + // There seems to be an issue with the clang CUDA/HIP toolchains not having + // std::abort() available + void abort(void) { +#if defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC) + abort(); +#else + std::abort(); +#endif + } + + namespace SYCL { + + void print_device_platform(const sycl::queue & q) { +#if !defined(TRISYCL) && !defined(__HIPSYCL__) + auto device = q.get_device(); + auto platform = device.get_platform(); + std::cout << "SYCL Device: " << device.get_info() << std::endl; + std::cout << "SYCL Platform: " << platform.get_info() << std::endl; +#endif + } + + bool has_spir(const sycl::queue & q) { +#if !defined(TRISYCL) && !defined(__HIPSYCL__) + auto device = q.get_device(); + return device.has_extension(sycl::string_class("cl_khr_spir")); +#else + return true; +#endif + } + + bool has_ptx(const sycl::queue & q) { +#ifdef __COMPUTECPP__ + return true; +#else + return false; +#endif + } + + bool has_fp64(const sycl::queue & q) { +#if !defined(TRISYCL) && !defined(__HIPSYCL__) + auto device = q.get_device(); + return device.has_extension(sycl::string_class("cl_khr_fp64")); +#else + return true; +#endif + } + + void print_exception_details(sycl::exception & e) { +#ifdef __COMPUTECPP__ + std::cout << e.get_file_name() << std::endl; + std::cout << e.get_line_number() << std::endl; + std::cout << e.get_description() << std::endl; + std::cout << e.get_cl_error_message() << std::endl; + std::cout << e.get_cl_code() << std::endl; +#endif + } + + } // namespace SYCL + +} // namespace prk + 
+#endif // PRK_SYCL_HPP diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index 7feaec93d..abdf6388d 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -82,35 +82,6 @@ #define PRK_UNUSED #endif - -// for SYCL - -// prebuilt kernels are not required/not fully supported on hipSYCL and triSYCL -#if defined(TRISYCL) || defined(__HIPSYCL__) -#define PREBUILD_KERNEL 0 -#else -#define PREBUILD_KERNEL 1 -#endif - -// not all SYCL implementations may support all device types. -// If an implementation does not find any devices based on a -// device selector, it will throw an exception. -// These macros can be used to check if there's any chance -// of an implementation targeting a CPU and GPU. -#if !defined(__HIPSYCL__) || defined(HIPSYCL_PLATFORM_CPU) -#define SYCL_TRY_CPU_QUEUE 1 -#else -#define SYCL_TRY_CPU_QUEUE 0 -#endif - -// !defined(HIPSYCL_PLATFORM_CPU) = !( defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC) ) -#if !defined(__HIPSYCL__) || !defined(HIPSYCL_PLATFORM_CPU) -#define SYCL_TRY_GPU_QUEUE 1 -#else -#define SYCL_TRY_GPU_QUEUE 0 -#endif - - namespace prk { int get_alignment(void) diff --git a/Cxx11/stencil-sycl-usm.cc b/Cxx11/stencil-sycl-usm.cc new file mode 100644 index 000000000..1689841c8 --- /dev/null +++ b/Cxx11/stencil-sycl-usm.cc @@ -0,0 +1,341 @@ + +/// +/// Copyright (c) 2017, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. 
+/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: Stencil +/// +/// PURPOSE: This program tests the efficiency with which a space-invariant, +/// linear, symmetric filter (stencil) can be applied to a square +/// grid or image. +/// +/// USAGE: The program takes as input the linear +/// dimension of the grid, and the number of iterations on the grid +/// +/// +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than standard C functions, the following functions are used in +/// this program: +/// wtime() +/// +/// HISTORY: - Written by Rob Van der Wijngaart, February 2009. +/// - RvdW: Removed unrolling pragmas for clarity; +/// added constant to array "in" at end of each iteration to force +/// refreshing of neighbor data in parallel versions; August 2013 +/// C++11-ification by Jeff Hammond, May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_sycl.h" +#include "prk_util.h" +#include "stencil_sycl.hpp" + +template class init; +template class add; + +template +void nothing(sycl::queue & q, const size_t n, const T * in, T *out) +{ + std::cout << "You are trying to use a stencil that does not exist.\n"; + std::cout << "Please generate the new stencil using the code generator\n"; + std::cout << "and add it to the case-switch in the driver." << std::endl; + prk::abort(); +} + +template +void run(sycl::queue & q, int iterations, size_t n, size_t tile_size, bool star, size_t radius) +{ + auto stencil = nothing; + if (star) { + switch (radius) { + case 1: stencil = star1; break; + case 2: stencil = star2; break; + case 3: stencil = star3; break; + case 4: stencil = star4; break; + case 5: stencil = star5; break; + } + } +#if 0 + else { + switch (radius) { + case 1: stencil = grid1; break; + case 2: stencil = grid2; break; + case 3: stencil = grid3; break; + case 4: stencil = grid4; break; + case 5: stencil = grid5; break; + } + } +#endif + + ////////////////////////////////////////////////////////////////////// + // Allocate space and perform the computation + ////////////////////////////////////////////////////////////////////// + + double stencil_time(0); + + T * in; + T * out; + + auto ctx = q.get_context(); + auto dev = q.get_device(); + + try { + + in = static_cast(sycl::malloc_shared(n * n * sizeof(T), dev, ctx)); + out = static_cast(sycl::malloc_shared(n * n * sizeof(T), dev, ctx)); + + q.submit([&](sycl::handler& h) { + + h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::id<2> it) { + const auto i = it[0]; + const auto j = it[1]; + in[i*n+j] = static_cast(i+j); + }); + }); + q.wait(); + + for (int iter = 0; iter<=iterations; iter++) { + + if (iter==1) stencil_time = prk::wtime(); + + stencil(q, n, in, out); + + q.submit([&](sycl::handler& h) { + // Add constant to solution to force refresh of neighbor data, if 
any + h.parallel_for>(sycl::range<2> {n, n}, sycl::id<2> {0, 0}, [=] (sycl::id<2> it) { + const auto i = it[0]; + const auto j = it[1]; + in[i*n+j] += static_cast(1); + }); + }); + q.wait(); + } + stencil_time = prk::wtime() - stencil_time; + + sycl::free(in, ctx); + } + catch (sycl::exception & e) { + std::cout << e.what() << std::endl; + prk::SYCL::print_exception_details(e); + return; + } + catch (std::exception & e) { + std::cout << e.what() << std::endl; + return; + } + catch (const char * e) { + std::cout << e << std::endl; + return; + } + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // interior of grid with respect to stencil + auto active_points = (n-2L*radius)*(n-2L*radius); + + // compute L1 norm in parallel + double norm(0); + for (int i=radius; i epsilon) { + std::cout << "ERROR: L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; + } else { + std::cout << "Solution validates" << std::endl; +#ifdef VERBOSE + std::cout << "L1 norm = " << norm + << " Reference L1 norm = " << reference_norm << std::endl; +#endif + const size_t stencil_size = star ? 
4*radius+1 : (2*radius+1)*(2*radius+1); + size_t flops = (2L*stencil_size+1L) * active_points; + double avgtime = stencil_time/iterations; + std::cout << 8*sizeof(T) << "B " + << "Rate (MFlops/s): " << 1.0e-6 * static_cast(flops)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } +} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL Stencil execution on 2D grid" << std::endl; + + ////////////////////////////////////////////////////////////////////// + // Process and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t n, tile_size; + bool star = true; + size_t radius = 2; + try { + if (argc < 3) { + throw "Usage: <# iterations> [ ]"; + } + + // number of times to run the algorithm + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // linear grid dimension + n = std::atoi(argv[2]); + if (n < 1) { + throw "ERROR: grid dimension must be positive"; + } else if (n > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: grid dimension too large - overflow risk"; + } + + // default tile size for tiling of local transpose + tile_size = 32; + if (argc > 3) { + tile_size = std::atoi(argv[3]); + if (tile_size <= 0) tile_size = n; + if (tile_size > n) tile_size = n; + } + + // stencil pattern + if (argc > 4) { + auto stencil = std::string(argv[4]); + auto grid = std::string("grid"); + star = (stencil == grid) ? 
false : true; + } + + // stencil radius + radius = 2; + if (argc > 5) { + radius = std::atoi(argv[5]); + } + + if ( (radius < 1) || (2*radius+1 > n) ) { + throw "ERROR: Stencil radius negative or too large"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Grid size = " << n << std::endl; + std::cout << "Type of stencil = " << (star ? "star" : "grid") << std::endl; + std::cout << "Radius of stencil = " << radius << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + +#ifdef USE_OPENCL + prk::opencl::listPlatforms(); +#endif + + try { +#if SYCL_TRY_CPU_QUEUE + if (n<10000) { + sycl::queue q(sycl::host_selector{}); + prk::SYCL::print_device_platform(q); + run(q, iterations, n, tile_size, star, radius); + run(q, iterations, n, tile_size, star, radius); + } else { + std::cout << "Skipping host device since it is too slow for large problems" << std::endl; + } +#endif + + // CPU requires spir64 target +#if SYCL_TRY_CPU_QUEUE + if (1) { + sycl::queue q(sycl::cpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); + if (has_spir) { + run(q, iterations, n, tile_size, star, radius); + run(q, iterations, n, tile_size, star, radius); + } + } +#endif + + // NVIDIA GPU requires ptx64 target +#if SYCL_TRY_GPU_QUEUE + if (1) { + sycl::queue q(sycl::gpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); + bool has_fp64 = prk::SYCL::has_fp64(q); + bool has_ptx = prk::SYCL::has_ptx(q); + if (!has_fp64) { + std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; + } + if (has_spir || has_ptx) { + run(q, iterations, n, tile_size, star, radius); + if (has_fp64) { + run(q, iterations, n, tile_size, star, radius); + } + } + } +#endif + } + catch (sycl::exception & e) { + std::cout << e.what() << std::endl; + prk::SYCL::print_exception_details(e); + return 1; + } + catch (std::exception & e) { + std::cout << e.what() << std::endl; + return 1; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + return 0; +} + + diff --git a/Cxx11/stencil-sycl.cc b/Cxx11/stencil-sycl.cc index b333c4194..949e4d632 100644 --- a/Cxx11/stencil-sycl.cc +++ b/Cxx11/stencil-sycl.cc @@ -60,41 +60,29 @@ /// ////////////////////////////////////////////////////////////////////// -#include "CL/sycl.hpp" +#include "prk_sycl.h" #include "prk_util.h" #include "stencil_sycl.hpp" - -#if 0 -#include "prk_opencl.h" -#define USE_OPENCL 1 -#endif - template class init; template class add; #if USE_2D_INDEXING template -void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void nothing(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) #else template -void nothing(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void nothing(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) #endif { std::cout << "You are trying to use a stencil that does not exist.\n"; std::cout << "Please generate the new stencil using the code generator\n"; std::cout << "and add it to the case-switch in the driver." 
<< std::endl; - // There seems to be an issue with the clang CUDA/HIP toolchains not having - // std::abort() available -#if defined(HIPSYCL_PLATFORM_CUDA) || defined(HIPSYCL_PLATFORM_HCC) - abort(); -#else - std::abort(); -#endif + prk::abort(); } template -void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool star, size_t radius) +void run(sycl::queue & q, int iterations, size_t n, size_t tile_size, bool star, size_t radius) { auto stencil = nothing; if (star) { @@ -131,23 +119,23 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s // initialize device buffers from host buffers #if USE_2D_INDEXING - cl::sycl::buffer d_in { cl::sycl::range<2> {n, n} }; - cl::sycl::buffer d_out { h_out.data(), cl::sycl::range<2> {n, n} }; + sycl::buffer d_in { sycl::range<2> {n, n} }; + sycl::buffer d_out { h_out.data(), sycl::range<2> {n, n} }; #else // FIXME: if I don't initialize this buffer from host, the results are wrong. Why? - //cl::sycl::buffer d_in { cl::sycl::range<1> {n*n} }; - cl::sycl::buffer d_in { h_in.data(), h_in.size() }; - cl::sycl::buffer d_out { h_out.data(), h_out.size() }; + //sycl::buffer d_in { sycl::range<1> {n*n} }; + sycl::buffer d_in { h_in.data(), h_in.size() }; + sycl::buffer d_out { h_out.data(), h_out.size() }; #endif - q.submit([&](cl::sycl::handler& h) { + q.submit([&](sycl::handler& h) { // accessor methods - auto in = d_in.template get_access(h); + auto in = d_in.template get_access(h); - h.parallel_for>(cl::sycl::range<2> {n, n}, [=] (cl::sycl::item<2> it) { + h.parallel_for>(sycl::range<2> {n, n}, [=] (sycl::item<2> it) { #if USE_2D_INDEXING - cl::sycl::id<2> xy = it.get_id(); + sycl::id<2> xy = it.get_id(); auto i = it[0]; auto j = it[1]; in[xy] = static_cast(i+j); @@ -160,7 +148,7 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s }); q.wait(); - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if 
(iter==1) stencil_time = prk::wtime(); @@ -169,16 +157,16 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s q.wait(); #endif - q.submit([&](cl::sycl::handler& h) { + q.submit([&](sycl::handler& h) { // accessor methods - auto in = d_in.template get_access(h); + auto in = d_in.template get_access(h); // Add constant to solution to force refresh of neighbor data, if any - h.parallel_for>(cl::sycl::range<2> {n, n}, cl::sycl::id<2> {0, 0}, - [=] (cl::sycl::item<2> it) { + h.parallel_for>(sycl::range<2> {n, n}, sycl::id<2> {0, 0}, + [=] (sycl::item<2> it) { #if USE_2D_INDEXING - cl::sycl::id<2> xy = it.get_id(); + sycl::id<2> xy = it.get_id(); in[xy] += static_cast(1); #else #if 0 // This is noticeably slower :-( @@ -195,15 +183,9 @@ void run(cl::sycl::queue & q, int iterations, size_t n, size_t tile_size, bool s } stencil_time = prk::wtime() - stencil_time; } - catch (cl::sycl::exception & e) { + catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return; } catch (std::exception & e) { @@ -328,87 +310,53 @@ int main(int argc, char * argv[]) #endif try { - #if SYCL_TRY_CPU_QUEUE - if (1) { - cl::sycl::queue host(cl::sycl::host_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = host.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; -#endif - - run(host, iterations, n, tile_size, star, radius); - run(host, iterations, n, tile_size, star, radius); + if (n<10000) { + sycl::queue q(sycl::host_selector{}); + prk::SYCL::print_device_platform(q); + run(q, 
iterations, n, tile_size, star, radius); + run(q, iterations, n, tile_size, star, radius); + } else { + std::cout << "Skipping host device since it is too slow for large problems" << std::endl; } #endif // CPU requires spir64 target #if SYCL_TRY_CPU_QUEUE if (1) { - cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = cpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); -#else - bool has_spir = true; // ? -#endif + sycl::queue q(sycl::cpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); if (has_spir) { - run(cpu, iterations, n, tile_size, star, radius); - run(cpu, iterations, n, tile_size, star, radius); + run(q, iterations, n, tile_size, star, radius); + run(q, iterations, n, tile_size, star, radius); } } #endif - // NVIDIA GPU requires ptx64 target and does not work very well + // NVIDIA GPU requires ptx64 target #if SYCL_TRY_GPU_QUEUE - if (0) { - cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = gpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); - bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); -#else - bool has_spir = true; // ? - bool has_fp64 = true; -#endif + if (1) { + sycl::queue q(sycl::gpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); + bool has_fp64 = prk::SYCL::has_fp64(q); + bool has_ptx = prk::SYCL::has_ptx(q); if (!has_fp64) { std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; } - if (has_spir) { - run(gpu, iterations, n, tile_size, star, radius); - if (has_fp64) { - run(gpu, iterations, n, tile_size, star, radius); - } - } else { - std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; -#ifdef __COMPUTECPP__ - std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl; - run(gpu, iterations, n, tile_size, star, radius); + if (has_spir || has_ptx) { + run(q, iterations, n, tile_size, star, radius); if (has_fp64) { - run(gpu, iterations, n, tile_size, star, radius); + run(q, iterations, n, tile_size, star, radius); } -#endif } } #endif } - catch (cl::sycl::exception & e) { + catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return 1; } catch (std::exception & e) { diff --git a/Cxx11/stencil_sycl.hpp b/Cxx11/stencil_sycl.hpp index 41412e5b4..de3cde61b 100644 --- a/Cxx11/stencil_sycl.hpp +++ b/Cxx11/stencil_sycl.hpp @@ -2,16 +2,18 @@ template class star1_1d; template -void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void star1(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { - q.submit([&](cl::sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); - h.parallel_for>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.5) - +in[it[0]*n+(it[1]-1)] * static_cast(-0.5) - +in[(it[0]+1)*n+it[1]] * static_cast(0.5) - +in[(it[0]-1)*n+it[1]] * static_cast(-0.5); + q.submit([&](sycl::handler& h) { + auto in = d_in.template get_access(h); + auto out = 
d_out.template get_access(h); + h.parallel_for>(sycl::range<2> {n-2,n-2}, sycl::id<2> {1,1}, [=] (sycl::item<2> it) { + const auto i = it[0]; + const auto j = it[1]; + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.5) + +in[i*n+(j-1)] * static_cast(-0.5) + +in[(i+1)*n+j] * static_cast(0.5) + +in[(i-1)*n+j] * static_cast(-0.5); }); }); } @@ -20,15 +22,15 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: template class star1_2d; template -void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void star1(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { - q.submit([&](cl::sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); - cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - h.parallel_for>(cl::sycl::range<2> {n-2,n-2}, cl::sycl::id<2> {1,1}, [=] (cl::sycl::item<2> it) { - cl::sycl::id<2> xy = it.get_id(); + q.submit([&](sycl::handler& h) { + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); + sycl::id<2> dx1(sycl::range<2> {1,0}); + sycl::id<2> dy1(sycl::range<2> {0,1}); + h.parallel_for>(sycl::range<2> {n-2,n-2}, sycl::id<2> {1,1}, [=] (sycl::item<2> it) { + sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * static_cast(0.5) +in[xy-dx1] * static_cast(-0.5) +in[xy+dy1] * static_cast(0.5) @@ -37,24 +39,44 @@ void star1(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c }); } +// declare the kernel name used in SYCL parallel_for +template class star1_usm; + +template +void star1(sycl::queue & q, const size_t n, const T * in, T * out) +{ + q.submit([&](sycl::handler& h) { + h.parallel_for>(sycl::range<2> {n-2,n-2}, sycl::id<2> {1,1}, [=] (sycl::item<2> it) { + const auto i = it[0]; + const auto j = it[1]; + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.5) + +in[i*n+(j-1)] * static_cast(-0.5) + +in[(i+1)*n+j] * 
static_cast(0.5) + +in[(i-1)*n+j] * static_cast(-0.5); + }); + }); +} + // declare the kernel name used in SYCL parallel_for template class star2_1d; template -void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void star2(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { - q.submit([&](cl::sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); - h.parallel_for>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.25) - +in[it[0]*n+(it[1]-1)] * static_cast(-0.25) - +in[(it[0]+1)*n+it[1]] * static_cast(0.25) - +in[(it[0]-1)*n+it[1]] * static_cast(-0.25) - +in[it[0]*n+(it[1]+2)] * static_cast(0.125) - +in[it[0]*n+(it[1]-2)] * static_cast(-0.125) - +in[(it[0]+2)*n+it[1]] * static_cast(0.125) - +in[(it[0]-2)*n+it[1]] * static_cast(-0.125); + q.submit([&](sycl::handler& h) { + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); + h.parallel_for>(sycl::range<2> {n-4,n-4}, sycl::id<2> {2,2}, [=] (sycl::item<2> it) { + const auto i = it[0]; + const auto j = it[1]; + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.25) + +in[i*n+(j-1)] * static_cast(-0.25) + +in[(i+1)*n+j] * static_cast(0.25) + +in[(i-1)*n+j] * static_cast(-0.25) + +in[i*n+(j+2)] * static_cast(0.125) + +in[i*n+(j-2)] * static_cast(-0.125) + +in[(i+2)*n+j] * static_cast(0.125) + +in[(i-2)*n+j] * static_cast(-0.125); }); }); } @@ -63,17 +85,17 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: template class star2_2d; template -void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void star2(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { - q.submit([&](cl::sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); - 
cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); - cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); - h.parallel_for>(cl::sycl::range<2> {n-4,n-4}, cl::sycl::id<2> {2,2}, [=] (cl::sycl::item<2> it) { - cl::sycl::id<2> xy = it.get_id(); + q.submit([&](sycl::handler& h) { + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); + sycl::id<2> dx1(sycl::range<2> {1,0}); + sycl::id<2> dy1(sycl::range<2> {0,1}); + sycl::id<2> dx2(sycl::range<2> {2,0}); + sycl::id<2> dy2(sycl::range<2> {0,2}); + h.parallel_for>(sycl::range<2> {n-4,n-4}, sycl::id<2> {2,2}, [=] (sycl::item<2> it) { + sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * static_cast(0.25) +in[xy-dx1] * static_cast(-0.25) +in[xy+dy1] * static_cast(0.25) @@ -86,28 +108,52 @@ void star2(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c }); } +// declare the kernel name used in SYCL parallel_for +template class star2_usm; + +template +void star2(sycl::queue & q, const size_t n, const T * in, T * out) +{ + q.submit([&](sycl::handler& h) { + h.parallel_for>(sycl::range<2> {n-4,n-4}, sycl::id<2> {2,2}, [=] (sycl::item<2> it) { + const auto i = it[0]; + const auto j = it[1]; + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.25) + +in[i*n+(j-1)] * static_cast(-0.25) + +in[(i+1)*n+j] * static_cast(0.25) + +in[(i-1)*n+j] * static_cast(-0.25) + +in[i*n+(j+2)] * static_cast(0.125) + +in[i*n+(j-2)] * static_cast(-0.125) + +in[(i+2)*n+j] * static_cast(0.125) + +in[(i-2)*n+j] * static_cast(-0.125); + }); + }); +} + // declare the kernel name used in SYCL parallel_for template class star3_1d; template -void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void star3(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { - q.submit([&](cl::sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template 
get_access(h); - h.parallel_for>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.16666666666666666) - +in[it[0]*n+(it[1]-1)] * static_cast(-0.16666666666666666) - +in[(it[0]+1)*n+it[1]] * static_cast(0.16666666666666666) - +in[(it[0]-1)*n+it[1]] * static_cast(-0.16666666666666666) - +in[it[0]*n+(it[1]+2)] * static_cast(0.08333333333333333) - +in[it[0]*n+(it[1]-2)] * static_cast(-0.08333333333333333) - +in[(it[0]+2)*n+it[1]] * static_cast(0.08333333333333333) - +in[(it[0]-2)*n+it[1]] * static_cast(-0.08333333333333333) - +in[it[0]*n+(it[1]+3)] * static_cast(0.05555555555555555) - +in[it[0]*n+(it[1]-3)] * static_cast(-0.05555555555555555) - +in[(it[0]+3)*n+it[1]] * static_cast(0.05555555555555555) - +in[(it[0]-3)*n+it[1]] * static_cast(-0.05555555555555555); + q.submit([&](sycl::handler& h) { + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); + h.parallel_for>(sycl::range<2> {n-6,n-6}, sycl::id<2> {3,3}, [=] (sycl::item<2> it) { + const auto i = it[0]; + const auto j = it[1]; + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.166666666667) + +in[i*n+(j-1)] * static_cast(-0.166666666667) + +in[(i+1)*n+j] * static_cast(0.166666666667) + +in[(i-1)*n+j] * static_cast(-0.166666666667) + +in[i*n+(j+2)] * static_cast(0.0833333333333) + +in[i*n+(j-2)] * static_cast(-0.0833333333333) + +in[(i+2)*n+j] * static_cast(0.0833333333333) + +in[(i-2)*n+j] * static_cast(-0.0833333333333) + +in[i*n+(j+3)] * static_cast(0.0555555555556) + +in[i*n+(j-3)] * static_cast(-0.0555555555556) + +in[(i+3)*n+j] * static_cast(0.0555555555556) + +in[(i-3)*n+j] * static_cast(-0.0555555555556); }); }); } @@ -116,31 +162,57 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: template class star3_2d; template -void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void star3(sycl::queue & q, const size_t 
n, sycl::buffer & d_in, sycl::buffer & d_out) +{ + q.submit([&](sycl::handler& h) { + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); + sycl::id<2> dx1(sycl::range<2> {1,0}); + sycl::id<2> dy1(sycl::range<2> {0,1}); + sycl::id<2> dx2(sycl::range<2> {2,0}); + sycl::id<2> dy2(sycl::range<2> {0,2}); + sycl::id<2> dx3(sycl::range<2> {3,0}); + sycl::id<2> dy3(sycl::range<2> {0,3}); + h.parallel_for>(sycl::range<2> {n-6,n-6}, sycl::id<2> {3,3}, [=] (sycl::item<2> it) { + sycl::id<2> xy = it.get_id(); + out[xy] += +in[xy+dx1] * static_cast(0.166666666667) + +in[xy-dx1] * static_cast(-0.166666666667) + +in[xy+dy1] * static_cast(0.166666666667) + +in[xy-dy1] * static_cast(-0.166666666667) + +in[xy+dx2] * static_cast(0.0833333333333) + +in[xy-dx2] * static_cast(-0.0833333333333) + +in[xy+dy2] * static_cast(0.0833333333333) + +in[xy-dy2] * static_cast(-0.0833333333333) + +in[xy+dx3] * static_cast(0.0555555555556) + +in[xy-dx3] * static_cast(-0.0555555555556) + +in[xy+dy3] * static_cast(0.0555555555556) + +in[xy-dy3] * static_cast(-0.0555555555556); + }); + }); +} + +// declare the kernel name used in SYCL parallel_for +template class star3_usm; + +template +void star3(sycl::queue & q, const size_t n, const T * in, T * out) { - q.submit([&](cl::sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); - cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); - cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); - cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); - cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); - h.parallel_for>(cl::sycl::range<2> {n-6,n-6}, cl::sycl::id<2> {3,3}, [=] (cl::sycl::item<2> it) { - cl::sycl::id<2> xy = it.get_id(); - out[xy] += +in[xy+dx1] * static_cast(0.16666666666666666) - +in[xy-dx1] * static_cast(-0.16666666666666666) - +in[xy+dy1] * static_cast(0.16666666666666666) - +in[xy-dy1] * 
static_cast(-0.16666666666666666) - +in[xy+dx2] * static_cast(0.08333333333333333) - +in[xy-dx2] * static_cast(-0.08333333333333333) - +in[xy+dy2] * static_cast(0.08333333333333333) - +in[xy-dy2] * static_cast(-0.08333333333333333) - +in[xy+dx3] * static_cast(0.05555555555555555) - +in[xy-dx3] * static_cast(-0.05555555555555555) - +in[xy+dy3] * static_cast(0.05555555555555555) - +in[xy-dy3] * static_cast(-0.05555555555555555); + q.submit([&](sycl::handler& h) { + h.parallel_for>(sycl::range<2> {n-6,n-6}, sycl::id<2> {3,3}, [=] (sycl::item<2> it) { + const auto i = it[0]; + const auto j = it[1]; + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.166666666667) + +in[i*n+(j-1)] * static_cast(-0.166666666667) + +in[(i+1)*n+j] * static_cast(0.166666666667) + +in[(i-1)*n+j] * static_cast(-0.166666666667) + +in[i*n+(j+2)] * static_cast(0.0833333333333) + +in[i*n+(j-2)] * static_cast(-0.0833333333333) + +in[(i+2)*n+j] * static_cast(0.0833333333333) + +in[(i-2)*n+j] * static_cast(-0.0833333333333) + +in[i*n+(j+3)] * static_cast(0.0555555555556) + +in[i*n+(j-3)] * static_cast(-0.0555555555556) + +in[(i+3)*n+j] * static_cast(0.0555555555556) + +in[(i-3)*n+j] * static_cast(-0.0555555555556); }); }); } @@ -149,28 +221,30 @@ void star3(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c template class star4_1d; template -void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { - q.submit([&](cl::sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); - h.parallel_for>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.125) - +in[it[0]*n+(it[1]-1)] * static_cast(-0.125) - +in[(it[0]+1)*n+it[1]] * static_cast(0.125) - +in[(it[0]-1)*n+it[1]] * static_cast(-0.125) - +in[it[0]*n+(it[1]+2)] * 
static_cast(0.0625) - +in[it[0]*n+(it[1]-2)] * static_cast(-0.0625) - +in[(it[0]+2)*n+it[1]] * static_cast(0.0625) - +in[(it[0]-2)*n+it[1]] * static_cast(-0.0625) - +in[it[0]*n+(it[1]+3)] * static_cast(0.041666666666666664) - +in[it[0]*n+(it[1]-3)] * static_cast(-0.041666666666666664) - +in[(it[0]+3)*n+it[1]] * static_cast(0.041666666666666664) - +in[(it[0]-3)*n+it[1]] * static_cast(-0.041666666666666664) - +in[it[0]*n+(it[1]+4)] * static_cast(0.03125) - +in[it[0]*n+(it[1]-4)] * static_cast(-0.03125) - +in[(it[0]+4)*n+it[1]] * static_cast(0.03125) - +in[(it[0]-4)*n+it[1]] * static_cast(-0.03125); + q.submit([&](sycl::handler& h) { + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); + h.parallel_for>(sycl::range<2> {n-8,n-8}, sycl::id<2> {4,4}, [=] (sycl::item<2> it) { + const auto i = it[0]; + const auto j = it[1]; + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.125) + +in[i*n+(j-1)] * static_cast(-0.125) + +in[(i+1)*n+j] * static_cast(0.125) + +in[(i-1)*n+j] * static_cast(-0.125) + +in[i*n+(j+2)] * static_cast(0.0625) + +in[i*n+(j-2)] * static_cast(-0.0625) + +in[(i+2)*n+j] * static_cast(0.0625) + +in[(i-2)*n+j] * static_cast(-0.0625) + +in[i*n+(j+3)] * static_cast(0.0416666666667) + +in[i*n+(j-3)] * static_cast(-0.0416666666667) + +in[(i+3)*n+j] * static_cast(0.0416666666667) + +in[(i-3)*n+j] * static_cast(-0.0416666666667) + +in[i*n+(j+4)] * static_cast(0.03125) + +in[i*n+(j-4)] * static_cast(-0.03125) + +in[(i+4)*n+j] * static_cast(0.03125) + +in[(i-4)*n+j] * static_cast(-0.03125); }); }); } @@ -179,21 +253,21 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: template class star4_2d; template -void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void star4(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { - q.submit([&](cl::sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); 
- cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); - cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); - cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); - cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); - cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); - cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); - h.parallel_for>(cl::sycl::range<2> {n-8,n-8}, cl::sycl::id<2> {4,4}, [=] (cl::sycl::item<2> it) { - cl::sycl::id<2> xy = it.get_id(); + q.submit([&](sycl::handler& h) { + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); + sycl::id<2> dx1(sycl::range<2> {1,0}); + sycl::id<2> dy1(sycl::range<2> {0,1}); + sycl::id<2> dx2(sycl::range<2> {2,0}); + sycl::id<2> dy2(sycl::range<2> {0,2}); + sycl::id<2> dx3(sycl::range<2> {3,0}); + sycl::id<2> dy3(sycl::range<2> {0,3}); + sycl::id<2> dx4(sycl::range<2> {4,0}); + sycl::id<2> dy4(sycl::range<2> {0,4}); + h.parallel_for>(sycl::range<2> {n-8,n-8}, sycl::id<2> {4,4}, [=] (sycl::item<2> it) { + sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * static_cast(0.125) +in[xy-dx1] * static_cast(-0.125) +in[xy+dy1] * static_cast(0.125) @@ -202,10 +276,10 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c +in[xy-dx2] * static_cast(-0.0625) +in[xy+dy2] * static_cast(0.0625) +in[xy-dy2] * static_cast(-0.0625) - +in[xy+dx3] * static_cast(0.041666666666666664) - +in[xy-dx3] * static_cast(-0.041666666666666664) - +in[xy+dy3] * static_cast(0.041666666666666664) - +in[xy-dy3] * static_cast(-0.041666666666666664) + +in[xy+dx3] * static_cast(0.0416666666667) + +in[xy-dx3] * static_cast(-0.0416666666667) + +in[xy+dy3] * static_cast(0.0416666666667) + +in[xy-dy3] * static_cast(-0.0416666666667) +in[xy+dx4] * static_cast(0.03125) +in[xy-dx4] * static_cast(-0.03125) +in[xy+dy4] * static_cast(0.03125) @@ -214,36 +288,68 @@ void star4(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c }); } 
+// declare the kernel name used in SYCL parallel_for +template class star4_usm; + +template +void star4(sycl::queue & q, const size_t n, const T * in, T * out) +{ + q.submit([&](sycl::handler& h) { + h.parallel_for>(sycl::range<2> {n-8,n-8}, sycl::id<2> {4,4}, [=] (sycl::item<2> it) { + const auto i = it[0]; + const auto j = it[1]; + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.125) + +in[i*n+(j-1)] * static_cast(-0.125) + +in[(i+1)*n+j] * static_cast(0.125) + +in[(i-1)*n+j] * static_cast(-0.125) + +in[i*n+(j+2)] * static_cast(0.0625) + +in[i*n+(j-2)] * static_cast(-0.0625) + +in[(i+2)*n+j] * static_cast(0.0625) + +in[(i-2)*n+j] * static_cast(-0.0625) + +in[i*n+(j+3)] * static_cast(0.0416666666667) + +in[i*n+(j-3)] * static_cast(-0.0416666666667) + +in[(i+3)*n+j] * static_cast(0.0416666666667) + +in[(i-3)*n+j] * static_cast(-0.0416666666667) + +in[i*n+(j+4)] * static_cast(0.03125) + +in[i*n+(j-4)] * static_cast(-0.03125) + +in[(i+4)*n+j] * static_cast(0.03125) + +in[(i-4)*n+j] * static_cast(-0.03125); + }); + }); +} + // declare the kernel name used in SYCL parallel_for template class star5_1d; template -void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { - q.submit([&](cl::sycl::handler& h) { - auto in = d_in.template get_access(h); - auto out = d_out.template get_access(h); - h.parallel_for>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { - out[it[0]*n+it[1]] += +in[it[0]*n+(it[1]+1)] * static_cast(0.1) - +in[it[0]*n+(it[1]-1)] * static_cast(-0.1) - +in[(it[0]+1)*n+it[1]] * static_cast(0.1) - +in[(it[0]-1)*n+it[1]] * static_cast(-0.1) - +in[it[0]*n+(it[1]+2)] * static_cast(0.05) - +in[it[0]*n+(it[1]-2)] * static_cast(-0.05) - +in[(it[0]+2)*n+it[1]] * static_cast(0.05) - +in[(it[0]-2)*n+it[1]] * static_cast(-0.05) - +in[it[0]*n+(it[1]+3)] * static_cast(0.03333333333333333) - 
+in[it[0]*n+(it[1]-3)] * static_cast(-0.03333333333333333) - +in[(it[0]+3)*n+it[1]] * static_cast(0.03333333333333333) - +in[(it[0]-3)*n+it[1]] * static_cast(-0.03333333333333333) - +in[it[0]*n+(it[1]+4)] * static_cast(0.025) - +in[it[0]*n+(it[1]-4)] * static_cast(-0.025) - +in[(it[0]+4)*n+it[1]] * static_cast(0.025) - +in[(it[0]-4)*n+it[1]] * static_cast(-0.025) - +in[it[0]*n+(it[1]+5)] * static_cast(0.02) - +in[it[0]*n+(it[1]-5)] * static_cast(-0.02) - +in[(it[0]+5)*n+it[1]] * static_cast(0.02) - +in[(it[0]-5)*n+it[1]] * static_cast(-0.02); + q.submit([&](sycl::handler& h) { + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); + h.parallel_for>(sycl::range<2> {n-10,n-10}, sycl::id<2> {5,5}, [=] (sycl::item<2> it) { + const auto i = it[0]; + const auto j = it[1]; + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.1) + +in[i*n+(j-1)] * static_cast(-0.1) + +in[(i+1)*n+j] * static_cast(0.1) + +in[(i-1)*n+j] * static_cast(-0.1) + +in[i*n+(j+2)] * static_cast(0.05) + +in[i*n+(j-2)] * static_cast(-0.05) + +in[(i+2)*n+j] * static_cast(0.05) + +in[(i-2)*n+j] * static_cast(-0.05) + +in[i*n+(j+3)] * static_cast(0.0333333333333) + +in[i*n+(j-3)] * static_cast(-0.0333333333333) + +in[(i+3)*n+j] * static_cast(0.0333333333333) + +in[(i-3)*n+j] * static_cast(-0.0333333333333) + +in[i*n+(j+4)] * static_cast(0.025) + +in[i*n+(j-4)] * static_cast(-0.025) + +in[(i+4)*n+j] * static_cast(0.025) + +in[(i-4)*n+j] * static_cast(-0.025) + +in[i*n+(j+5)] * static_cast(0.02) + +in[i*n+(j-5)] * static_cast(-0.02) + +in[(i+5)*n+j] * static_cast(0.02) + +in[(i-5)*n+j] * static_cast(-0.02); }); }); } @@ -252,23 +358,23 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl:: template class star5_2d; template -void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, cl::sycl::buffer & d_out) +void star5(sycl::queue & q, const size_t n, sycl::buffer & d_in, sycl::buffer & d_out) { - q.submit([&](cl::sycl::handler& h) { - auto in = 
d_in.template get_access(h); - auto out = d_out.template get_access(h); - cl::sycl::id<2> dx1(cl::sycl::range<2> {1,0}); - cl::sycl::id<2> dy1(cl::sycl::range<2> {0,1}); - cl::sycl::id<2> dx2(cl::sycl::range<2> {2,0}); - cl::sycl::id<2> dy2(cl::sycl::range<2> {0,2}); - cl::sycl::id<2> dx3(cl::sycl::range<2> {3,0}); - cl::sycl::id<2> dy3(cl::sycl::range<2> {0,3}); - cl::sycl::id<2> dx4(cl::sycl::range<2> {4,0}); - cl::sycl::id<2> dy4(cl::sycl::range<2> {0,4}); - cl::sycl::id<2> dx5(cl::sycl::range<2> {5,0}); - cl::sycl::id<2> dy5(cl::sycl::range<2> {0,5}); - h.parallel_for>(cl::sycl::range<2> {n-10,n-10}, cl::sycl::id<2> {5,5}, [=] (cl::sycl::item<2> it) { - cl::sycl::id<2> xy = it.get_id(); + q.submit([&](sycl::handler& h) { + auto in = d_in.template get_access(h); + auto out = d_out.template get_access(h); + sycl::id<2> dx1(sycl::range<2> {1,0}); + sycl::id<2> dy1(sycl::range<2> {0,1}); + sycl::id<2> dx2(sycl::range<2> {2,0}); + sycl::id<2> dy2(sycl::range<2> {0,2}); + sycl::id<2> dx3(sycl::range<2> {3,0}); + sycl::id<2> dy3(sycl::range<2> {0,3}); + sycl::id<2> dx4(sycl::range<2> {4,0}); + sycl::id<2> dy4(sycl::range<2> {0,4}); + sycl::id<2> dx5(sycl::range<2> {5,0}); + sycl::id<2> dy5(sycl::range<2> {0,5}); + h.parallel_for>(sycl::range<2> {n-10,n-10}, sycl::id<2> {5,5}, [=] (sycl::item<2> it) { + sycl::id<2> xy = it.get_id(); out[xy] += +in[xy+dx1] * static_cast(0.1) +in[xy-dx1] * static_cast(-0.1) +in[xy+dy1] * static_cast(0.1) @@ -277,10 +383,10 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c +in[xy-dx2] * static_cast(-0.05) +in[xy+dy2] * static_cast(0.05) +in[xy-dy2] * static_cast(-0.05) - +in[xy+dx3] * static_cast(0.03333333333333333) - +in[xy-dx3] * static_cast(-0.03333333333333333) - +in[xy+dy3] * static_cast(0.03333333333333333) - +in[xy-dy3] * static_cast(-0.03333333333333333) + +in[xy+dx3] * static_cast(0.0333333333333) + +in[xy-dx3] * static_cast(-0.0333333333333) + +in[xy+dy3] * static_cast(0.0333333333333) + +in[xy-dy3] 
* static_cast(-0.0333333333333) +in[xy+dx4] * static_cast(0.025) +in[xy-dx4] * static_cast(-0.025) +in[xy+dy4] * static_cast(0.025) @@ -293,3 +399,37 @@ void star5(cl::sycl::queue & q, const size_t n, cl::sycl::buffer & d_in, c }); } +// declare the kernel name used in SYCL parallel_for +template class star5_usm; + +template +void star5(sycl::queue & q, const size_t n, const T * in, T * out) +{ + q.submit([&](sycl::handler& h) { + h.parallel_for>(sycl::range<2> {n-10,n-10}, sycl::id<2> {5,5}, [=] (sycl::item<2> it) { + const auto i = it[0]; + const auto j = it[1]; + out[i*n+j] += +in[i*n+(j+1)] * static_cast(0.1) + +in[i*n+(j-1)] * static_cast(-0.1) + +in[(i+1)*n+j] * static_cast(0.1) + +in[(i-1)*n+j] * static_cast(-0.1) + +in[i*n+(j+2)] * static_cast(0.05) + +in[i*n+(j-2)] * static_cast(-0.05) + +in[(i+2)*n+j] * static_cast(0.05) + +in[(i-2)*n+j] * static_cast(-0.05) + +in[i*n+(j+3)] * static_cast(0.0333333333333) + +in[i*n+(j-3)] * static_cast(-0.0333333333333) + +in[(i+3)*n+j] * static_cast(0.0333333333333) + +in[(i-3)*n+j] * static_cast(-0.0333333333333) + +in[i*n+(j+4)] * static_cast(0.025) + +in[i*n+(j-4)] * static_cast(-0.025) + +in[(i+4)*n+j] * static_cast(0.025) + +in[(i-4)*n+j] * static_cast(-0.025) + +in[i*n+(j+5)] * static_cast(0.02) + +in[i*n+(j-5)] * static_cast(-0.02) + +in[(i+5)*n+j] * static_cast(0.02) + +in[(i-5)*n+j] * static_cast(-0.02); + }); + }); +} + diff --git a/Cxx11/transpose-explicit-sycl.cc b/Cxx11/transpose-sycl-explicit.cc similarity index 56% rename from Cxx11/transpose-explicit-sycl.cc rename to Cxx11/transpose-sycl-explicit.cc index e92dfaa1f..a1dae3bc9 100644 --- a/Cxx11/transpose-explicit-sycl.cc +++ b/Cxx11/transpose-sycl-explicit.cc @@ -49,19 +49,14 @@ /// ////////////////////////////////////////////////////////////////////// -#include "CL/sycl.hpp" +#include "prk_sycl.h" #include "prk_util.h" -#if 0 -#include "prk_opencl.h" -#define USE_OPENCL 1 -#endif - template class iota; template class transpose; template -void 
run(cl::sycl::queue & q, int iterations, size_t order) +void run(sycl::queue & q, int iterations, size_t order) { ////////////////////////////////////////////////////////////////////// // Allocate space for the input and transpose matrix @@ -73,37 +68,39 @@ void run(cl::sycl::queue & q, int iterations, size_t order) try { + auto ctx = q.get_context(); + #if PREBUILD_KERNEL - cl::sycl::program kernel(q.get_context()); + sycl::program kernel(ctx); kernel.build_with_kernel_type>(); #endif #if USE_2D_INDEXING - cl::sycl::buffer d_A( cl::sycl::range<2>{order,order} ); - cl::sycl::buffer d_B( cl::sycl::range<2>{order,order} ); + sycl::buffer d_A( sycl::range<2>{order,order} ); + sycl::buffer d_B( sycl::range<2>{order,order} ); #else - cl::sycl::buffer d_A { cl::sycl::range<1>{order*order} }; - cl::sycl::buffer d_B { cl::sycl::range<1>{order*order} }; + sycl::buffer d_A { sycl::range<1>{order*order} }; + sycl::buffer d_B { sycl::range<1>{order*order} }; #endif - q.submit([&](cl::sycl::handler& h) { + q.submit([&](sycl::handler& h) { #if USE_2D_INDEXING - cl::sycl::accessor A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); - h.parallel_for>(cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> i) { + sycl::accessor A(d_A, h, sycl::range<2>(order,order), sycl::id<2>(0,0)); + h.parallel_for>(sycl::range<2>{order,order}, [=] (sycl::item<2> i) { A[i] = i[0] * order + i[1]; }); #else - cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); - h.parallel_for>(cl::sycl::range<1>{order*order}, [=] (cl::sycl::item<1> i) { + sycl::accessor A(d_A, h, sycl::range<1>(order*order), sycl::id<1>(0)); + h.parallel_for>(sycl::range<1>{order*order}, [=] (sycl::item<1> i) { A[i] = i[0]; }); #endif }); - q.submit([&](cl::sycl::handler& h) { + q.submit([&](sycl::handler& h) { #if USE_2D_INDEXING - cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); + sycl::accessor B(d_B, h, sycl::range<2>(order,order), 
sycl::id<2>(0,0)); #else - cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); + sycl::accessor B(d_B, h, sycl::range<1>(order*order), sycl::id<1>(0)); #endif h.fill(B,(T)0); }); @@ -113,24 +110,24 @@ void run(cl::sycl::queue & q, int iterations, size_t order) if (iter==1) trans_time = prk::wtime(); - q.submit([&](cl::sycl::handler& h) { + q.submit([&](sycl::handler& h) { #if USE_2D_INDEXING - cl::sycl::accessor A(d_A, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); - cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); + sycl::accessor A(d_A, h, sycl::range<2>(order,order), sycl::id<2>(0,0)); + sycl::accessor B(d_B, h, sycl::range<2>(order,order), sycl::id<2>(0,0)); #else - cl::sycl::accessor A(d_A, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); - cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); + sycl::accessor A(d_A, h, sycl::range<1>(order*order), sycl::id<1>(0)); + sycl::accessor B(d_B, h, sycl::range<1>(order*order), sycl::id<1>(0)); #endif h.parallel_for>( #if PREBUILD_KERNEL kernel.get_kernel>(), #endif - cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { + sycl::range<2>{order,order}, [=] (sycl::item<2> it) { #if USE_2D_INDEXING - cl::sycl::id<2> ij{it[0],it[1]}; - cl::sycl::id<2> ji{it[1],it[0]}; + sycl::id<2> ij{it[0],it[1]}; + sycl::id<2> ji{it[1],it[0]}; B[ij] += A[ji]; A[ji] += (T)1; #else @@ -147,25 +144,19 @@ void run(cl::sycl::queue & q, int iterations, size_t order) // for other device-oriented programming models. 
trans_time = prk::wtime() - trans_time; - q.submit([&](cl::sycl::handler& h) { + q.submit([&](sycl::handler& h) { #if USE_2D_INDEXING - cl::sycl::accessor B(d_B, h, cl::sycl::range<2>(order,order), cl::sycl::id<2>(0,0)); + sycl::accessor B(d_B, h, sycl::range<2>(order,order), sycl::id<2>(0,0)); #else - cl::sycl::accessor B(d_B, h, cl::sycl::range<1>(order*order), cl::sycl::id<1>(0)); + sycl::accessor B(d_B, h, sycl::range<1>(order*order), sycl::id<1>(0)); #endif h.copy(B,h_B.data()); }); q.wait(); } - catch (cl::sycl::exception & e) { + catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return; } catch (std::exception & e) { @@ -259,84 +250,52 @@ int main(int argc, char * argv[]) try { #if SYCL_TRY_CPU_QUEUE - if (1) { - cl::sycl::queue host(cl::sycl::host_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = host.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; -#endif - run(host, iterations, order); - run(host, iterations, order); + if (order<10000) { + sycl::queue q(sycl::host_selector{}); + prk::SYCL::print_device_platform(q); + run(q, iterations, order); + run(q, iterations, order); + } else { + std::cout << "Skipping host device since it is too slow for large problems" << std::endl; } #endif // CPU requires spir64 target #if SYCL_TRY_CPU_QUEUE if (1) { - cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = cpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() 
<< std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); -#else - bool has_spir = true; // ? -#endif + sycl::queue q(sycl::cpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); if (has_spir) { - run(cpu, iterations, order); - run(cpu, iterations, order); + run(q, iterations, order); + run(q, iterations, order); } } #endif - // NVIDIA GPU requires ptx64 target and does not work very well + // NVIDIA GPU requires ptx64 target #if SYCL_TRY_GPU_QUEUE - if (0) { - cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = gpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); - bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); -#else - bool has_spir = true; // ? - bool has_fp64 = true; -#endif + if (1) { + sycl::queue q(sycl::gpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); + bool has_fp64 = prk::SYCL::has_fp64(q); + bool has_ptx = prk::SYCL::has_ptx(q); if (!has_fp64) { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } - if (has_spir) { - run(gpu, iterations, order); + if (has_spir || has_ptx) { + run(q, iterations, order); if (has_fp64) { - run(gpu, iterations, order); + run(q, iterations, order); } - } else { - std::cout << "SYCL GPU device lacks SPIR-V support." << std::endl; -#ifdef __COMPUTECPP__ - std::cout << "You are using ComputeCpp so we will try it anyways..." 
<< std::endl; - run(gpu, iterations, order); - if (has_fp64) { - run(gpu, iterations, order); - } -#endif } } #endif } - catch (cl::sycl::exception & e) { + catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return 1; } catch (std::exception & e) { diff --git a/Cxx11/transpose-sycl-usm.cc b/Cxx11/transpose-sycl-usm.cc new file mode 100644 index 000000000..a80ce8c83 --- /dev/null +++ b/Cxx11/transpose-sycl-usm.cc @@ -0,0 +1,276 @@ +/// +/// Copyright (c) 2013, Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. + +////////////////////////////////////////////////////////////////////// +/// +/// NAME: transpose +/// +/// PURPOSE: This program measures the time for the transpose of a +/// column-major stored matrix into a row-major stored matrix. +/// +/// USAGE: Program input is the matrix order and the number of times to +/// repeat the operation: +/// +/// transpose <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// transpose worked and timing statistics. +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, February 2016 and May 2017. 
+/// +////////////////////////////////////////////////////////////////////// + +#include "prk_sycl.h" +#include "prk_util.h" + +template class transpose; + +template +void run(sycl::queue & q, int iterations, size_t order) +{ + ////////////////////////////////////////////////////////////////////// + // Allocate space for the input and transpose matrix + ////////////////////////////////////////////////////////////////////// + + double trans_time(0); + + auto ctx = q.get_context(); + auto dev = q.get_device(); + + T * A = static_cast(sycl::malloc_shared(order*order * sizeof(T), dev, ctx)); + T * B = static_cast(sycl::malloc_shared(order*order * sizeof(T), dev, ctx)); + + for (auto i=0;i(i*order+j); + B[i*order+j] = 0.0; + } + } + + try { + +#if PREBUILD_KERNEL + sycl::program kernel(ctx); + kernel.build_with_kernel_type>(); +#endif + + + for (int iter = 0; iter<=iterations; ++iter) { + + if (iter==1) trans_time = prk::wtime(); + + q.submit([&](sycl::handler& h) { + + h.parallel_for>( +#if PREBUILD_KERNEL + kernel.get_kernel>(), +#endif + sycl::range<2>{order,order}, [=] (sycl::id<2> it) { +#if USE_2D_INDEXING + sycl::id<2> ij{it[0],it[1]}; + sycl::id<2> ji{it[1],it[0]}; + B[ij] += A[ji]; + A[ji] += (T)1; +#else + B[it[0] * order + it[1]] += A[it[1] * order + it[0]]; + A[it[1] * order + it[0]] += (T)1; +#endif + }); + }); + q.wait(); + } + + // Stop timer before buffer+accessor destructors fire, + // since that will move data, and we do not time that + // for other device-oriented programming models. 
+ trans_time = prk::wtime() - trans_time; + } + catch (sycl::exception & e) { + std::cout << e.what() << std::endl; + prk::SYCL::print_exception_details(e); + return; + } + catch (std::exception & e) { + std::cout << e.what() << std::endl; + return; + } + catch (const char * e) { + std::cout << e << std::endl; + return; + } + + sycl::free(A, ctx); + sycl::free(B, ctx); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + // TODO: replace with std::generate, std::accumulate, or similar + const T addit = (iterations+1.) * (iterations/2.); + double abserr(0); + for (size_t i=0; i(ij)*(1.+iterations)+addit; + abserr += std::fabs(B[ji] - reference); + } + } + +#ifdef VERBOSE + std::cout << "Sum of absolute differences: " << abserr << std::endl; +#endif + + const double epsilon(1.0e-8); + if (abserr < epsilon) { + std::cout << "Solution validates" << std::endl; + double avgtime = trans_time/iterations; + double bytes = (size_t)order * (size_t)order * sizeof(T); + std::cout << 8*sizeof(T) << "B " + << "Rate (MB/s): " << 1.0e-6 * (2.*bytes)/avgtime + << " Avg time (s): " << avgtime << std::endl; + } else { + std::cout << "ERROR: Aggregate squared error " << abserr + << " exceeds threshold " << epsilon << std::endl; + } +} + +int main(int argc, char * argv[]) +{ + std::cout << "Parallel Research Kernels version " << PRKVERSION << std::endl; + std::cout << "C++11/SYCL Matrix transpose: B = A^T" << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Read and test input parameters + ////////////////////////////////////////////////////////////////////// + + int iterations; + size_t order; + try { + if (argc < 3) { + throw "Usage: <# iterations> "; + } + + // number of times to do the transpose + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + // order 
of a the matrix + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + + ////////////////////////////////////////////////////////////////////// + /// Setup SYCL environment + ////////////////////////////////////////////////////////////////////// + +#ifdef USE_OPENCL + prk::opencl::listPlatforms(); +#endif + + try { +#if SYCL_TRY_CPU_QUEUE + if (order<10000) { + sycl::queue q(sycl::host_selector{}); + prk::SYCL::print_device_platform(q); + run(q, iterations, order); + run(q, iterations, order); + } else { + std::cout << "Skipping host device since it is too slow for large problems" << std::endl; + } +#endif + + // CPU requires spir64 target +#if SYCL_TRY_CPU_QUEUE + if (1) { + sycl::queue q(sycl::cpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); + if (has_spir) { + run(q, iterations, order); + run(q, iterations, order); + } + } +#endif + + // NVIDIA GPU requires ptx64 target +#if SYCL_TRY_GPU_QUEUE + if (1) { + sycl::queue q(sycl::gpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); + bool has_fp64 = prk::SYCL::has_fp64(q); + bool has_ptx = prk::SYCL::has_ptx(q); + if (!has_fp64) { + std::cout << "SYCL GPU device lacks FP64 support." 
<< std::endl; + } + if (has_spir || has_ptx) { + run(q, iterations, order); + if (has_fp64) { + run(q, iterations, order); + } + } + } +#endif + } + catch (sycl::exception & e) { + std::cout << e.what() << std::endl; + prk::SYCL::print_exception_details(e); + return 1; + } + catch (std::exception & e) { + std::cout << e.what() << std::endl; + return 1; + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + return 0; +} + + diff --git a/Cxx11/transpose-sycl.cc b/Cxx11/transpose-sycl.cc index b22b162be..289127265 100644 --- a/Cxx11/transpose-sycl.cc +++ b/Cxx11/transpose-sycl.cc @@ -49,18 +49,13 @@ /// ////////////////////////////////////////////////////////////////////// -#include "CL/sycl.hpp" +#include "prk_sycl.h" #include "prk_util.h" -#if 0 -#include "prk_opencl.h" -#define USE_OPENCL 1 -#endif - template class transpose; template -void run(cl::sycl::queue & q, int iterations, size_t order) +void run(sycl::queue & q, int iterations, size_t order) { ////////////////////////////////////////////////////////////////////// // Allocate space for the input and transpose matrix @@ -76,37 +71,39 @@ void run(cl::sycl::queue & q, int iterations, size_t order) try { + auto ctx = q.get_context(); + #if PREBUILD_KERNEL - cl::sycl::program kernel(q.get_context()); + sycl::program kernel(ctx); kernel.build_with_kernel_type>(); #endif #if USE_2D_INDEXING - cl::sycl::buffer d_A( h_A.data(), cl::sycl::range<2>{order,order} ); - cl::sycl::buffer d_B( h_B.data(), cl::sycl::range<2>{order,order} ); + sycl::buffer d_A( h_A.data(), sycl::range<2>{order,order} ); + sycl::buffer d_B( h_B.data(), sycl::range<2>{order,order} ); #else - cl::sycl::buffer d_A { h_A.data(), h_A.size() }; - cl::sycl::buffer d_B { h_B.data(), h_B.size() }; + sycl::buffer d_A { h_A.data(), h_A.size() }; + sycl::buffer d_B { h_B.data(), h_B.size() }; #endif for (int iter = 0; iter<=iterations; ++iter) { if (iter==1) trans_time = prk::wtime(); - q.submit([&](cl::sycl::handler& h) { + 
q.submit([&](sycl::handler& h) { // accessor methods - auto A = d_A.template get_access(h); - auto B = d_B.template get_access(h); + auto A = d_A.template get_access(h); + auto B = d_B.template get_access(h); h.parallel_for>( #if PREBUILD_KERNEL kernel.get_kernel>(), #endif - cl::sycl::range<2>{order,order}, [=] (cl::sycl::item<2> it) { + sycl::range<2>{order,order}, [=] (sycl::item<2> it) { #if USE_2D_INDEXING - cl::sycl::id<2> ij{it[0],it[1]}; - cl::sycl::id<2> ji{it[1],it[0]}; + sycl::id<2> ij{it[0],it[1]}; + sycl::id<2> ji{it[1],it[0]}; B[ij] += A[ji]; A[ji] += (T)1; #else @@ -123,15 +120,9 @@ void run(cl::sycl::queue & q, int iterations, size_t order) // for other device-oriented programming models. trans_time = prk::wtime() - trans_time; } - catch (cl::sycl::exception & e) { + catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return; } catch (std::exception & e) { @@ -225,84 +216,52 @@ int main(int argc, char * argv[]) try { #if SYCL_TRY_CPU_QUEUE - if (1) { - cl::sycl::queue host(cl::sycl::host_selector{}); -#ifndef TRISYCL - auto device = host.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; -#endif - run(host, iterations, order); - run(host, iterations, order); + if (order<10000) { + sycl::queue q(sycl::host_selector{}); + prk::SYCL::print_device_platform(q); + run(q, iterations, order); + run(q, iterations, order); + } else { + std::cout << "Skipping host device since it is too slow for large problems" << std::endl; } #endif // CPU requires spir64 target #if SYCL_TRY_CPU_QUEUE if (1) { - 
cl::sycl::queue cpu(cl::sycl::cpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = cpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); -#else - bool has_spir = true; // ? -#endif + sycl::queue q(sycl::cpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); if (has_spir) { - run(cpu, iterations, order); - run(cpu, iterations, order); + run(q, iterations, order); + run(q, iterations, order); } } #endif - // NVIDIA GPU requires ptx64 target and does not work very well + // NVIDIA GPU requires ptx64 target #if SYCL_TRY_GPU_QUEUE - if (0) { - cl::sycl::queue gpu(cl::sycl::gpu_selector{}); -#if !defined(TRISYCL) && !defined(__HIPSYCL__) - auto device = gpu.get_device(); - auto platform = device.get_platform(); - std::cout << "SYCL Device: " << device.get_info() << std::endl; - std::cout << "SYCL Platform: " << platform.get_info() << std::endl; - bool has_spir = device.has_extension(cl::sycl::string_class("cl_khr_spir")); - bool has_fp64 = device.has_extension(cl::sycl::string_class("cl_khr_fp64")); -#else - bool has_spir = true; // ? - bool has_fp64 = true; -#endif + if (1) { + sycl::queue q(sycl::gpu_selector{}); + prk::SYCL::print_device_platform(q); + bool has_spir = prk::SYCL::has_spir(q); + bool has_fp64 = prk::SYCL::has_fp64(q); + bool has_ptx = prk::SYCL::has_ptx(q); if (!has_fp64) { std::cout << "SYCL GPU device lacks FP64 support." << std::endl; } - if (has_spir) { - run(gpu, iterations, order); + if (has_spir || has_ptx) { + run(q, iterations, order); if (has_fp64) { - run(gpu, iterations, order); + run(q, iterations, order); } - } else { - std::cout << "SYCL GPU device lacks SPIR-V support." 
<< std::endl; -#ifdef __COMPUTECPP__ - std::cout << "You are using ComputeCpp so we will try it anyways..." << std::endl; - run(gpu, iterations, order); - if (has_fp64) { - run(gpu, iterations, order); - } -#endif } } #endif } - catch (cl::sycl::exception & e) { + catch (sycl::exception & e) { std::cout << e.what() << std::endl; -#ifdef __COMPUTECPP__ - std::cout << e.get_file_name() << std::endl; - std::cout << e.get_line_number() << std::endl; - std::cout << e.get_description() << std::endl; - std::cout << e.get_cl_error_message() << std::endl; - std::cout << e.get_cl_code() << std::endl; -#endif + prk::SYCL::print_exception_details(e); return 1; } catch (std::exception & e) { diff --git a/common/README.freebsd b/common/README.freebsd index 8a52f24da..a55746adc 100644 --- a/common/README.freebsd +++ b/common/README.freebsd @@ -13,7 +13,7 @@ sudo pkg install clang flang libpgmath ## C++ dependencies -sudo pkg install opencl-2.2_1 +sudo pkg install opencl-2.2_1 sudo pkg install devel/clinfo devel/ocl-icd lang/beignet lang/pocl sudo pkg install tbb sudo pkg install boost-all diff --git a/common/make.defs.gcc b/common/make.defs.gcc index f4552bd87..51e0827cb 100644 --- a/common/make.defs.gcc +++ b/common/make.defs.gcc @@ -54,9 +54,9 @@ METALFLAG=-framework MetalPerformanceShaders # # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-SYCLDIR=./triSYCL -SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG} -SYCLFLAG=-I$(SYCLDIR)/include +#SYCLDIR=./triSYCL +#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) +#SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # ProGTX # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx @@ -74,16 +74,19 @@ METALFLAG=-framework MetalPerformanceShaders # # TBB # -TBBDIR=/usr/local/Cellar/tbb/2019_U5_1 -TBBFLAG=-I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +#TBBDIR=/usr/lib/x86_64-linux-gnu +TBBDIR=/usr/local/Cellar/tbb/2019_U8 +TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb +#TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb +#TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb # # Parallel STL, Boost, etc. # -BOOSTFLAG=-I/usr/local/Cellar/boost/1.69.0_2/include +#BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} -#PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} +#PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages KOKKOSDIR=/opt/kokkos/gcc KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} RAJADIR=/opt/raja/gcc @@ -91,33 +94,6 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/Users/jrhammon/Work/NVIDIA/thrust THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} # -# SYCL flags -# -# triSYCL -# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-SYCLDIR=./triSYCL -SYCLCXX=${CXX} -O3 -Wall -std=c++17 ${OPENMPFLAG} -SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -DTRISYCL -# ProGTX -# https://github.com/ProGTX/sycl-gtx -#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx -#SYCLCXX=${CXX} ${OPENMPFLAG} -#SYCLFLAG=-I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} -SYCLFLAG+=${RANGEFLAG} -# -# SYCL flags -# -# triSYCL -# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... -SYCLDIR=./triSYCL -SYCLCXX=${CXX} -std=c++17 ${OPENMPFLAG} -SYCLFLAG=-I${SYCLDIR}/include ${BOOSTFLAG} -# ProGTX -# https://github.com/ProGTX/sycl-gtx -#SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx -#SYCLCXX=${CXX} ${OPENMPFLAG} -#SYCLFLAG=-DUSE_SYCL -I${SYCLDIR}/sycl-gtx/include -L${SYCLDIR}/build/sycl-gtx -lsycl-gtx ${OPENCLFLAG} -# # CBLAS for C++ DGEMM # BLASFLAG=-DACCELERATE -framework Accelerate diff --git a/common/make.defs.llvm b/common/make.defs.llvm index 318e64595..5804f0681 100644 --- a/common/make.defs.llvm +++ b/common/make.defs.llvm @@ -4,7 +4,7 @@ # # Base compilers and language options # -#LLVM_ROOT=/usr/local/Cellar/llvm/6.0.0 +#LLVM_ROOT=/usr/local/Cellar/llvm/9.0.0 #LLVM_PATH=${LLVM_ROOT}/bin/ #LLVM_PATH=/opt/llvm/HEAD/bin/ # C99 is required in some implementations. @@ -30,6 +30,10 @@ DEFAULT_OPT_FLAGS=-g -O3 -mtune=native -ffast-math # These are useful to understand why the compiler does not vectorize loops: # DEFAULT_OPT_FLAGS+=-Rpass-analysis=loop-vectorize # DEFAULT_OPT_FLAGS+=-Rpass=loop-vectorize +#DEFAULT_OPT_FLAGS+=-fopt-info-vec-missed +DEFAULT_OPT_FLAGS+=-Wall #-Werror +DEFAULT_OPT_FLAGS+=-Wno-ignored-attributes -Wno-deprecated-declarations +#DEFAULT_OPT_FLAGS+=-mavx -mfma # # OpenMP flags # @@ -81,9 +85,8 @@ OPENMPFLAG+=-L${LLVM_ROOT}/lib # triSYCL # https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
SYCLDIR=./triSYCL -#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) -SYCLFLAG=-std=gnu++17 -I$(SYCLDIR)/include -DTRISYCL +SYCLFLAG=-std=c++17 -I$(SYCLDIR)/include -DTRISYCL # ProGTX # https://github.com/ProGTX/sycl-gtx #SYCLDIR=${HOME}/Work/OpenCL/sycl-gtx @@ -97,14 +100,14 @@ SYCLFLAG=-std=gnu++17 -I$(SYCLDIR)/include -DTRISYCL # TBB # #TBBDIR=/usr/lib/x86_64-linux-gnu -TBBDIR=/usr/local/Cellar/tbb/2018_U3_1 +TBBDIR=/usr/local/Cellar/tbb/2019_U8 TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -ltbb #TBBDIR=/opt/intel/compilers_and_libraries_2019.2.159/linux/tbb #TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBDIR}/lib -L${TBBDIR}/lib/intel64_lin/gcc4.7 -ltbb # # Parallel STL, Boost, etc. # -#BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include +#BOOSTFLAG=-I/usr/local/Cellar/boost/1.71.0/include #RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages @@ -130,6 +133,16 @@ CUDAFLAGS=-g -O3 -std=c++11 # https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED # +# Halide +# +HALIDECXX=c++ +HALIDEDIR=/opt/halide +HALIDEFLAG=-I${HALIDEDIR}/include +HALIDEFLAG+=-L${HALIDEDIR}/lib -lhalide +#HALIDEFLAG+=-D_GLIBCXX_USE_CXX11_ABI=0 +HALIDEFLAG+=${DEFAULT_OPT_FLAGS} +HALIDEFLAG+=-std=c++17 -g3 +# # ISPC # ISPC=ispc From 6f7bc302aa85c59097bbbc92d10486f6019c376d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 8 Oct 2019 21:54:41 -0700 Subject: [PATCH 227/245] IBM compiler fixes (#421) * XLF also missing norm2 intrinsic * remove unnecessary (erroneous) declare target * add IBM POWER9 + NVIDIA V100 * rename for preprocessing * IBM Clang fixes * do not map RO arrays as tofrom * override _OPENMP for XLC * more IBM NV fixes * add prk::alloc/dealloc to match C (unused) * Kokkos-CUDA still broken... 
* remove unnecessary range * add missing includes * update example build file --- C1z/Makefile | 4 + C1z/prk_util.h | 2 +- Cxx11/Makefile | 5 + Cxx11/nstream-device-thrust.cu | 3 +- Cxx11/nstream-openmp-target.cc | 4 +- Cxx11/prk_openmp.h | 2 +- Cxx11/prk_util.h | 27 +++- Cxx11/transpose-device-thrust.cu | 2 + FORTRAN/Makefile | 28 ++-- ...nmp-target.f90 => dgemm-openmp-target.F90} | 0 .../{dgemm-pretty.f90 => dgemm-pretty.F90} | 0 ...p-openmp.f90 => dgemm-taskloop-openmp.F90} | 0 FORTRAN/{dgemm.f90 => dgemm.F90} | 4 +- ...p-target.f90 => nstream-openmp-target.F90} | 0 ...stream-ornlacc.f90 => nstream-ornlacc.F90} | 0 ...{nstream-pretty.f90 => nstream-pretty.F90} | 0 ...openmp.f90 => nstream-taskloop-openmp.F90} | 0 FORTRAN/{nstream.f90 => nstream.F90} | 0 ...sync-ornlacc.f90 => p2p-async-ornlacc.F90} | 0 FORTRAN/{p2p-coarray.f90 => p2p-coarray.F90} | 0 ...oss-openmp.f90 => p2p-doacross-openmp.F90} | 0 ...op-openmp.f90 => p2p-innerloop-openmp.F90} | 0 ...-ornlacc.f90 => p2p-innerloop-ornlacc.F90} | 0 .../{p2p-innerloop.f90 => p2p-innerloop.F90} | 0 ...penmp-target.f90 => p2p-openmp-target.F90} | 0 FORTRAN/{p2p-ornlacc.f90 => p2p-ornlacc.F90} | 0 ...-tasks-openmp.f90 => p2p-tasks-openmp.F90} | 0 FORTRAN/{p2p.f90 => p2p.F90} | 0 ...tencil-coarray.f90 => stencil-coarray.F90} | 0 ...p-target.f90 => stencil-openmp-target.F90} | 1 - ...tencil-ornlacc.f90 => stencil-ornlacc.F90} | 0 ...{stencil-pretty.f90 => stencil-pretty.F90} | 0 ...openmp.f90 => stencil-taskloop-openmp.F90} | 0 FORTRAN/{stencil.f90 => stencil.F90} | 0 ...{stencil_openmp.f90 => stencil_openmp.F90} | 0 ...{stencil_pretty.f90 => stencil_pretty.F90} | 0 ...{stencil_serial.f90 => stencil_serial.F90} | 0 ...{stencil_target.f90 => stencil_target.F90} | 0 ...ncil_taskloop.f90 => stencil_taskloop.F90} | 0 ...pose-coarray.f90 => transpose-coarray.F90} | 0 ...target.f90 => transpose-openmp-target.F90} | 0 ...pose-ornlacc.f90 => transpose-ornlacc.F90} | 0 ...nspose-pretty.f90 => transpose-pretty.F90} | 2 +- 
...enmp.f90 => transpose-taskloop-openmp.F90} | 0 ...-openmp.f90 => transpose-tasks-openmp.F90} | 0 FORTRAN/{transpose.f90 => transpose.F90} | 0 common/make.defs.ibmp9nv | 121 ++++++++++++++++++ 47 files changed, 182 insertions(+), 23 deletions(-) rename FORTRAN/{dgemm-openmp-target.f90 => dgemm-openmp-target.F90} (100%) rename FORTRAN/{dgemm-pretty.f90 => dgemm-pretty.F90} (100%) rename FORTRAN/{dgemm-taskloop-openmp.f90 => dgemm-taskloop-openmp.F90} (100%) rename FORTRAN/{dgemm.f90 => dgemm.F90} (99%) rename FORTRAN/{nstream-openmp-target.f90 => nstream-openmp-target.F90} (100%) rename FORTRAN/{nstream-ornlacc.f90 => nstream-ornlacc.F90} (100%) rename FORTRAN/{nstream-pretty.f90 => nstream-pretty.F90} (100%) rename FORTRAN/{nstream-taskloop-openmp.f90 => nstream-taskloop-openmp.F90} (100%) rename FORTRAN/{nstream.f90 => nstream.F90} (100%) rename FORTRAN/{p2p-async-ornlacc.f90 => p2p-async-ornlacc.F90} (100%) rename FORTRAN/{p2p-coarray.f90 => p2p-coarray.F90} (100%) rename FORTRAN/{p2p-doacross-openmp.f90 => p2p-doacross-openmp.F90} (100%) rename FORTRAN/{p2p-innerloop-openmp.f90 => p2p-innerloop-openmp.F90} (100%) rename FORTRAN/{p2p-innerloop-ornlacc.f90 => p2p-innerloop-ornlacc.F90} (100%) rename FORTRAN/{p2p-innerloop.f90 => p2p-innerloop.F90} (100%) rename FORTRAN/{p2p-openmp-target.f90 => p2p-openmp-target.F90} (100%) rename FORTRAN/{p2p-ornlacc.f90 => p2p-ornlacc.F90} (100%) rename FORTRAN/{p2p-tasks-openmp.f90 => p2p-tasks-openmp.F90} (100%) rename FORTRAN/{p2p.f90 => p2p.F90} (100%) rename FORTRAN/{stencil-coarray.f90 => stencil-coarray.F90} (100%) rename FORTRAN/{stencil-openmp-target.f90 => stencil-openmp-target.F90} (99%) rename FORTRAN/{stencil-ornlacc.f90 => stencil-ornlacc.F90} (100%) rename FORTRAN/{stencil-pretty.f90 => stencil-pretty.F90} (100%) rename FORTRAN/{stencil-taskloop-openmp.f90 => stencil-taskloop-openmp.F90} (100%) rename FORTRAN/{stencil.f90 => stencil.F90} (100%) rename FORTRAN/{stencil_openmp.f90 => stencil_openmp.F90} (100%) 
rename FORTRAN/{stencil_pretty.f90 => stencil_pretty.F90} (100%) rename FORTRAN/{stencil_serial.f90 => stencil_serial.F90} (100%) rename FORTRAN/{stencil_target.f90 => stencil_target.F90} (100%) rename FORTRAN/{stencil_taskloop.f90 => stencil_taskloop.F90} (100%) rename FORTRAN/{transpose-coarray.f90 => transpose-coarray.F90} (100%) rename FORTRAN/{transpose-openmp-target.f90 => transpose-openmp-target.F90} (100%) rename FORTRAN/{transpose-ornlacc.f90 => transpose-ornlacc.F90} (100%) rename FORTRAN/{transpose-pretty.f90 => transpose-pretty.F90} (99%) rename FORTRAN/{transpose-taskloop-openmp.f90 => transpose-taskloop-openmp.F90} (100%) rename FORTRAN/{transpose-tasks-openmp.f90 => transpose-tasks-openmp.F90} (100%) rename FORTRAN/{transpose.f90 => transpose.F90} (100%) create mode 100644 common/make.defs.ibmp9nv diff --git a/C1z/Makefile b/C1z/Makefile index 9125fef1f..0c854088e 100644 --- a/C1z/Makefile +++ b/C1z/Makefile @@ -42,6 +42,10 @@ endif ifneq ($(CILKFLAG),) EXTRA += cilk endif +ifeq ($(findstring xlc,$(CC)),xlc) + EXTRA = target + CFLAGS += -DXLC +endif all: serial thread openmp taskloop $(EXTRA) diff --git a/C1z/prk_util.h b/C1z/prk_util.h index 313cca471..1cb2d4467 100644 --- a/C1z/prk_util.h +++ b/C1z/prk_util.h @@ -77,7 +77,7 @@ const bool false=0; # define OMP_BARRIER PRAGMA(omp barrier) # define OMP_FOR(x) PRAGMA(omp for x) # define OMP_FOR_REDUCE(x) PRAGMA(omp for reduction (x) ) -# if (_OPENMP >= 201300) +# if (_OPENMP >= 201300) || (__ibmxl_version__ >= 16) # define OMP_SIMD PRAGMA(omp simd) # define OMP_FOR_SIMD(x) PRAGMA(omp for simd x) # define OMP_TASK(x) PRAGMA(omp task x) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 84665feaf..596c87793 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -193,9 +193,14 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h %-raja: %-raja.cc prk_util.h $(CXX) $(CXXFLAGS) $< $(RAJAFLAGS) -o $@ +ifeq ($(PRK_KOKKOS_BACKEND),Cuda) +%-kokkos: %-kokkos.cc prk_util.h + 
${KOKKOSDIR}/bin/nvcc_wrapper $(CUDAFLAGS) $< $(KOKKOSFLAG) -DUSE_KOKKOS -DPRK_KOKKOS_BACKEND=Cuda -o $@ +else %-kokkos: %-kokkos.cc prk_util.h $(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use OpenMP) $(CXX) $(CXXFLAGS) $< $(KOKKOSFLAGS) -o $@ +endif # for host execution %-thrust: %-thrust.cc prk_util.h diff --git a/Cxx11/nstream-device-thrust.cu b/Cxx11/nstream-device-thrust.cu index 13cd1a4e5..8ecbee9bf 100644 --- a/Cxx11/nstream-device-thrust.cu +++ b/Cxx11/nstream-device-thrust.cu @@ -64,6 +64,7 @@ #include "prk_util.h" #include "prk_cuda.h" +#include "prk_thrust.h" int main(int argc, char * argv[]) { @@ -115,8 +116,6 @@ int main(int argc, char * argv[]) thrust::device_vector B(length); thrust::device_vector C(length); - auto range = prk::range(static_cast(0), length); - double scalar(3); { thrust::fill(thrust::device, A.begin(), A.end(), 0.0); diff --git a/Cxx11/nstream-openmp-target.cc b/Cxx11/nstream-openmp-target.cc index d4a437a08..8715962a8 100644 --- a/Cxx11/nstream-openmp-target.cc +++ b/Cxx11/nstream-openmp-target.cc @@ -129,9 +129,9 @@ int main(int argc, char * argv[]) } // DEVICE - OMP_TARGET( data map(tofrom: A[0:length], B[0:length], C[0:length]) ) + OMP_TARGET( data map(tofrom: A[0:length]) map(to: B[0:length], C[0:length]) ) { - for (auto iter = 0; iter<=iterations; iter++) { + for (int iter = 0; iter<=iterations; iter++) { if (iter==1) nstream_time = prk::wtime(); diff --git a/Cxx11/prk_openmp.h b/Cxx11/prk_openmp.h index 578e713e5..c562630f3 100644 --- a/Cxx11/prk_openmp.h +++ b/Cxx11/prk_openmp.h @@ -44,7 +44,7 @@ # define OMP_FOR(x) PRAGMA(omp for x) # define OMP_FOR_REDUCE(x) PRAGMA(omp for reduction (x) ) // OpenMP SIMD if supported, else not. 
-# if (_OPENMP >= 201300) +# if (_OPENMP >= 201300) || (__ibmxl_version__ >= 16) # define OMP_SIMD PRAGMA(omp simd) # define OMP_FOR_SIMD PRAGMA(omp for simd) # define OMP_TASK(x) PRAGMA(omp task x) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index abdf6388d..ed798b0b1 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -40,7 +40,7 @@ #include // Test standard library _after_ standard headers have been included... -#if !defined(__NVCC__) && !defined(__PGI) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI) +#if !defined(__NVCC__) && !defined(__PGI) && !defined(__ibmxl__) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI) # error You are using an ancient version GNU libstdc++. Either upgrade your GCC or tell ICC to use a newer version via the -gxx-name= option. #endif @@ -278,6 +278,31 @@ namespace prk { return ( numerator / denominator + (numerator % denominator > 0) ); } + template + T * alloc(size_t bytes) + { + int alignment = ::prk::get_alignment(); +#if defined(__INTEL_COMPILER) + return (void*)_mm_malloc(bytes,alignment); +#else + T * ptr = nullptr; + int ret = posix_memalign((void**)&ptr,alignment,bytes); + if (ret!=0) ptr = NULL; + return ptr; +#endif + + } + + template + void dealloc(T * p) + { +#if defined(__INTEL_COMPILER) + _mm_free((void*)p); +#else + free((void*)p); +#endif + } + } // namespace prk #endif /* PRK_UTIL_H */ diff --git a/Cxx11/transpose-device-thrust.cu b/Cxx11/transpose-device-thrust.cu index 907f45e94..044032dd2 100644 --- a/Cxx11/transpose-device-thrust.cu +++ b/Cxx11/transpose-device-thrust.cu @@ -50,6 +50,8 @@ ////////////////////////////////////////////////////////////////////// #include "prk_util.h" +#include "prk_cuda.h" +#include "prk_thrust.h" struct x : public thrust::unary_function { diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index d96f87cce..e9b1fa471 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -8,10 +8,10 @@ ifndef 
RADIUS RADIUS=2 endif -STARFLAG = -DSTAR +STARFLAG = $(XLFPP)-DSTAR FCFLAGS = $(DEFAULT_OPT_FLAGS) -FCFLAGS += -DRADIUS=$(RADIUS) $(STARFLAG) +FCFLAGS += $(XLFPP)-DRADIUS=$(RADIUS) $(STARFLAG) ifeq ($(findstring ifort,$(FC)),ifort) BLASFLAGS += -heap-arrays @@ -40,6 +40,10 @@ ifeq ($(findstring flang,$(FC)),flang) EXTRA = target ornlacc FCFLAGS += -DPGI endif +ifeq ($(findstring xlf,$(FC)),xlf) + EXTRA = target + FCFLAGS += $(XLFPP)-DXLF +endif all: serial pretty openmp tasks $(EXTRA) @@ -59,32 +63,32 @@ target: stencil-openmp-target transpose-openmp-target nstream-openmp-target dgem ornlacc: p2p-ornlacc p2p-innerloop-ornlacc stencil-ornlacc transpose-ornlacc nstream-ornlacc -%: %.f90 +%: %.F90 $(FC) $(FCFLAGS) $< -o $@ -stencil: stencil.f90 stencil_serial.f90 - #$(FC) $(FCFLAGS) -c stencil_serial.f90 -o stencil_serial.o +stencil: stencil.F90 stencil_serial.F90 + #$(FC) $(FCFLAGS) -c stencil_serial.F90 -o stencil_serial.o $(FC) $(FCFLAGS) $< -o $@ -dgemm-pretty: dgemm-pretty.f90 +dgemm-pretty: dgemm-pretty.F90 $(FC) $(FCFLAGS) $(BLASFLAGS) $< -o $@ -%-pretty: %-pretty.f90 +%-pretty: %-pretty.F90 $(FC) $(FCFLAGS) $< -o $@ -%-openmp: %.f90 +%-openmp: %.F90 $(FC) $(FCFLAGS) $(OPENMPFLAG) $< -o $@ -%-openmp: %-openmp.f90 +%-openmp: %-openmp.F90 $(FC) $(FCFLAGS) $(OPENMPFLAG) $< -o $@ -%-coarray: %-coarray.f90 +%-coarray: %-coarray.F90 $(CAFC) $(FCFLAGS) $< $(COARRAYFLAG) -o $@ -%-target: %-target.f90 +%-target: %-target.F90 $(FC) $(FCFLAGS) $(OPENMPFLAG) $(OFFLOADFLAG) $< -o $@ -%-ornlacc: %-ornlacc.f90 +%-ornlacc: %-ornlacc.F90 $(FC) $(FCFLAGS) $(ORNLACCFLAG) $< -o $@ clean: diff --git a/FORTRAN/dgemm-openmp-target.f90 b/FORTRAN/dgemm-openmp-target.F90 similarity index 100% rename from FORTRAN/dgemm-openmp-target.f90 rename to FORTRAN/dgemm-openmp-target.F90 diff --git a/FORTRAN/dgemm-pretty.f90 b/FORTRAN/dgemm-pretty.F90 similarity index 100% rename from FORTRAN/dgemm-pretty.f90 rename to FORTRAN/dgemm-pretty.F90 diff --git a/FORTRAN/dgemm-taskloop-openmp.f90 
b/FORTRAN/dgemm-taskloop-openmp.F90 similarity index 100% rename from FORTRAN/dgemm-taskloop-openmp.f90 rename to FORTRAN/dgemm-taskloop-openmp.F90 diff --git a/FORTRAN/dgemm.f90 b/FORTRAN/dgemm.F90 similarity index 99% rename from FORTRAN/dgemm.f90 rename to FORTRAN/dgemm.F90 index 7123882a6..55edc2977 100644 --- a/FORTRAN/dgemm.f90 +++ b/FORTRAN/dgemm.F90 @@ -279,13 +279,13 @@ program main forder = real(order,REAL64) reference = 0.25d0 * forder**3 * (forder-1)**2 * (iterations+1) checksum = 0.0d0 - !$omp parallel do simd reduction(+:checksum) + !$omp parallel do reduction(+:checksum) do j=1,order do i=1,order checksum = checksum + C(i,j) enddo enddo - !$omp end parallel do simd + !$omp end parallel do deallocate( C ) diff --git a/FORTRAN/nstream-openmp-target.f90 b/FORTRAN/nstream-openmp-target.F90 similarity index 100% rename from FORTRAN/nstream-openmp-target.f90 rename to FORTRAN/nstream-openmp-target.F90 diff --git a/FORTRAN/nstream-ornlacc.f90 b/FORTRAN/nstream-ornlacc.F90 similarity index 100% rename from FORTRAN/nstream-ornlacc.f90 rename to FORTRAN/nstream-ornlacc.F90 diff --git a/FORTRAN/nstream-pretty.f90 b/FORTRAN/nstream-pretty.F90 similarity index 100% rename from FORTRAN/nstream-pretty.f90 rename to FORTRAN/nstream-pretty.F90 diff --git a/FORTRAN/nstream-taskloop-openmp.f90 b/FORTRAN/nstream-taskloop-openmp.F90 similarity index 100% rename from FORTRAN/nstream-taskloop-openmp.f90 rename to FORTRAN/nstream-taskloop-openmp.F90 diff --git a/FORTRAN/nstream.f90 b/FORTRAN/nstream.F90 similarity index 100% rename from FORTRAN/nstream.f90 rename to FORTRAN/nstream.F90 diff --git a/FORTRAN/p2p-async-ornlacc.f90 b/FORTRAN/p2p-async-ornlacc.F90 similarity index 100% rename from FORTRAN/p2p-async-ornlacc.f90 rename to FORTRAN/p2p-async-ornlacc.F90 diff --git a/FORTRAN/p2p-coarray.f90 b/FORTRAN/p2p-coarray.F90 similarity index 100% rename from FORTRAN/p2p-coarray.f90 rename to FORTRAN/p2p-coarray.F90 diff --git a/FORTRAN/p2p-doacross-openmp.f90 
b/FORTRAN/p2p-doacross-openmp.F90 similarity index 100% rename from FORTRAN/p2p-doacross-openmp.f90 rename to FORTRAN/p2p-doacross-openmp.F90 diff --git a/FORTRAN/p2p-innerloop-openmp.f90 b/FORTRAN/p2p-innerloop-openmp.F90 similarity index 100% rename from FORTRAN/p2p-innerloop-openmp.f90 rename to FORTRAN/p2p-innerloop-openmp.F90 diff --git a/FORTRAN/p2p-innerloop-ornlacc.f90 b/FORTRAN/p2p-innerloop-ornlacc.F90 similarity index 100% rename from FORTRAN/p2p-innerloop-ornlacc.f90 rename to FORTRAN/p2p-innerloop-ornlacc.F90 diff --git a/FORTRAN/p2p-innerloop.f90 b/FORTRAN/p2p-innerloop.F90 similarity index 100% rename from FORTRAN/p2p-innerloop.f90 rename to FORTRAN/p2p-innerloop.F90 diff --git a/FORTRAN/p2p-openmp-target.f90 b/FORTRAN/p2p-openmp-target.F90 similarity index 100% rename from FORTRAN/p2p-openmp-target.f90 rename to FORTRAN/p2p-openmp-target.F90 diff --git a/FORTRAN/p2p-ornlacc.f90 b/FORTRAN/p2p-ornlacc.F90 similarity index 100% rename from FORTRAN/p2p-ornlacc.f90 rename to FORTRAN/p2p-ornlacc.F90 diff --git a/FORTRAN/p2p-tasks-openmp.f90 b/FORTRAN/p2p-tasks-openmp.F90 similarity index 100% rename from FORTRAN/p2p-tasks-openmp.f90 rename to FORTRAN/p2p-tasks-openmp.F90 diff --git a/FORTRAN/p2p.f90 b/FORTRAN/p2p.F90 similarity index 100% rename from FORTRAN/p2p.f90 rename to FORTRAN/p2p.F90 diff --git a/FORTRAN/stencil-coarray.f90 b/FORTRAN/stencil-coarray.F90 similarity index 100% rename from FORTRAN/stencil-coarray.f90 rename to FORTRAN/stencil-coarray.F90 diff --git a/FORTRAN/stencil-openmp-target.f90 b/FORTRAN/stencil-openmp-target.F90 similarity index 99% rename from FORTRAN/stencil-openmp-target.f90 rename to FORTRAN/stencil-openmp-target.F90 index 7bceb70e1..f910f3245 100644 --- a/FORTRAN/stencil-openmp-target.f90 +++ b/FORTRAN/stencil-openmp-target.F90 @@ -94,7 +94,6 @@ end subroutine initialize_w subroutine apply_stencil(is_star,tiling,tile_size,r,n,W,A,B) use iso_fortran_env implicit none - !$omp declare target logical, intent(in) :: is_star, 
tiling integer(kind=INT32), intent(in) :: tile_size, r, n real(kind=REAL64), intent(in) :: W(-r:r,-r:r) diff --git a/FORTRAN/stencil-ornlacc.f90 b/FORTRAN/stencil-ornlacc.F90 similarity index 100% rename from FORTRAN/stencil-ornlacc.f90 rename to FORTRAN/stencil-ornlacc.F90 diff --git a/FORTRAN/stencil-pretty.f90 b/FORTRAN/stencil-pretty.F90 similarity index 100% rename from FORTRAN/stencil-pretty.f90 rename to FORTRAN/stencil-pretty.F90 diff --git a/FORTRAN/stencil-taskloop-openmp.f90 b/FORTRAN/stencil-taskloop-openmp.F90 similarity index 100% rename from FORTRAN/stencil-taskloop-openmp.f90 rename to FORTRAN/stencil-taskloop-openmp.F90 diff --git a/FORTRAN/stencil.f90 b/FORTRAN/stencil.F90 similarity index 100% rename from FORTRAN/stencil.f90 rename to FORTRAN/stencil.F90 diff --git a/FORTRAN/stencil_openmp.f90 b/FORTRAN/stencil_openmp.F90 similarity index 100% rename from FORTRAN/stencil_openmp.f90 rename to FORTRAN/stencil_openmp.F90 diff --git a/FORTRAN/stencil_pretty.f90 b/FORTRAN/stencil_pretty.F90 similarity index 100% rename from FORTRAN/stencil_pretty.f90 rename to FORTRAN/stencil_pretty.F90 diff --git a/FORTRAN/stencil_serial.f90 b/FORTRAN/stencil_serial.F90 similarity index 100% rename from FORTRAN/stencil_serial.f90 rename to FORTRAN/stencil_serial.F90 diff --git a/FORTRAN/stencil_target.f90 b/FORTRAN/stencil_target.F90 similarity index 100% rename from FORTRAN/stencil_target.f90 rename to FORTRAN/stencil_target.F90 diff --git a/FORTRAN/stencil_taskloop.f90 b/FORTRAN/stencil_taskloop.F90 similarity index 100% rename from FORTRAN/stencil_taskloop.f90 rename to FORTRAN/stencil_taskloop.F90 diff --git a/FORTRAN/transpose-coarray.f90 b/FORTRAN/transpose-coarray.F90 similarity index 100% rename from FORTRAN/transpose-coarray.f90 rename to FORTRAN/transpose-coarray.F90 diff --git a/FORTRAN/transpose-openmp-target.f90 b/FORTRAN/transpose-openmp-target.F90 similarity index 100% rename from FORTRAN/transpose-openmp-target.f90 rename to 
FORTRAN/transpose-openmp-target.F90 diff --git a/FORTRAN/transpose-ornlacc.f90 b/FORTRAN/transpose-ornlacc.F90 similarity index 100% rename from FORTRAN/transpose-ornlacc.f90 rename to FORTRAN/transpose-ornlacc.F90 diff --git a/FORTRAN/transpose-pretty.f90 b/FORTRAN/transpose-pretty.F90 similarity index 99% rename from FORTRAN/transpose-pretty.f90 rename to FORTRAN/transpose-pretty.F90 index 31c88b378..6185431a4 100644 --- a/FORTRAN/transpose-pretty.f90 +++ b/FORTRAN/transpose-pretty.F90 @@ -152,7 +152,7 @@ program main A = ( transpose(reshape((/ (j2, j2 = 0,o2) /),(/order, order/))) & * real(iterations+1,REAL64) ) & + real((iterations*(iterations+1))/2,REAL64) -#if defined(PGI) +#if defined(PGI) || defined(XLF) abserr = 0.0d0 do j=1,order do i=1,order diff --git a/FORTRAN/transpose-taskloop-openmp.f90 b/FORTRAN/transpose-taskloop-openmp.F90 similarity index 100% rename from FORTRAN/transpose-taskloop-openmp.f90 rename to FORTRAN/transpose-taskloop-openmp.F90 diff --git a/FORTRAN/transpose-tasks-openmp.f90 b/FORTRAN/transpose-tasks-openmp.F90 similarity index 100% rename from FORTRAN/transpose-tasks-openmp.f90 rename to FORTRAN/transpose-tasks-openmp.F90 diff --git a/FORTRAN/transpose.f90 b/FORTRAN/transpose.F90 similarity index 100% rename from FORTRAN/transpose.f90 rename to FORTRAN/transpose.F90 diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv new file mode 100644 index 000000000..3fd2badf8 --- /dev/null +++ b/common/make.defs.ibmp9nv @@ -0,0 +1,121 @@ +# +# This file shows the IBM POWER9 + NVIDIA V100 toolchain options for PRKs using +# OpenMP, MPI and/or Fortran (sans coarrays) only. +# +# Base compilers and language options +# +# C99 is required in some implementations. +CC=xlc_r -qlanglvl=stdc99 +# All of the Fortran code is written for the 2008 standard and requires preprocessing. +# You might need to modify the build system for the preprocessor options to work. +FC=xlf2008_r +XLFPP=-WF, +# C++11 may not be required but does no harm here. 
+CXX=xlc++_r -qlanglvl=extended1y +# +# Compiler flags +# +DEFAULT_OPT_FLAGS=-O3 +# +# OpenMP flags +# +# You can also use -qopenmp. -openmp is deprecated. +OPENMPFLAG=-qsmp=omp +OPENMPSIMDFLAG=-qsmp=omp +OFFLOADFLAG=-qoffload -qtgtarch=sm_70 +# +# OpenCL flags +# +# POCL +# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... +#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL +# Linux +OPENCLDIR=/usr +OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations +# +# SYCL flags +# +# triSYCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... +SYCLDIR=./triSYCL +#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) +SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) +SYCLFLAG=-std=gnu++17 -I$(SYCLDIR)/include -DTRISYCL +# +# OCCA +# +#OCCADIR=${HOME}/prk-repo/Cxx11/occa +# +# TBB +# +TBBDIR=${HOME}/TBB +TBBLIBDIR=${HOME}/TBB/build/linux_ppc64le_xl_cc4.8.5_libc2.17_kernel4.14.0_release +TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBLIBDIR} -ltbb +# +# Parallel STL, Boost, etc. 
+# +BOOSTROOT=${HOME}/boost_1_71_0/include +BOOSTFLAG= +BOOSTFLAG+=-I${BOOSTROOT}/circular_buffer/include +BOOSTFLAG+=-I${BOOSTROOT}/compute/include +BOOSTFLAG+=-I${BOOSTROOT}/algorithm/include +BOOSTFLAG+=-I${BOOSTROOT}/config/include +BOOSTFLAG+=-I${BOOSTROOT}/core/include +BOOSTFLAG+=-I${BOOSTROOT}/log/include +BOOSTFLAG+=-I${BOOSTROOT}/array/include +BOOSTFLAG+=-I${BOOSTROOT}/multi_array/include +BOOSTFLAG+=-I${BOOSTROOT}/optional/include +BOOSTFLAG+=-I${BOOSTROOT}/preprocessor/include +BOOSTFLAG+=-I${BOOSTROOT}/type_index/include +BOOSTFLAG+=-I${BOOSTROOT}/utility/include +BOOSTFLAG+=-I${BOOSTROOT}/assert/include +BOOSTFLAG+=-I${BOOSTROOT}/static_assert/include +BOOSTFLAG+=-I${BOOSTROOT}/exception/include +BOOSTFLAG+=-I${BOOSTROOT}/throw_exception/include +BOOSTFLAG+=-I${BOOSTROOT}/concept_check/include +BOOSTFLAG+=-I${BOOSTROOT}/type_traits/include +BOOSTFLAG+=-I${BOOSTROOT}/iterator/include +BOOSTFLAG+=-I${BOOSTROOT}/mpl/include +BOOSTFLAG+=-I${BOOSTROOT}/detail/include +BOOSTFLAG+=-I${BOOSTROOT}/functional/include +BOOSTFLAG+=-I${BOOSTROOT}/move/include +BOOSTFLAG+=-I${BOOSTROOT}/range/include +BOOSTFLAG+=-I${BOOSTROOT}/function/include +BOOSTFLAG+=-I${BOOSTROOT}/integer/include +BOOSTFLAG+=-I${BOOSTROOT}/container_hash/include +BOOSTFLAG+=-I${BOOSTROOT}/bind/include +BOOSTFLAG+=-I${BOOSTROOT}/chrono/include +BOOSTFLAG+=-I${BOOSTROOT}/predef/include +BOOSTFLAG+=-I${BOOSTROOT}/ratio/include +BOOSTFLAG+=-I${BOOSTROOT}/function_types/include +BOOSTFLAG+=-I${BOOSTROOT}/tuple/include +BOOSTFLAG+=-I${BOOSTROOT}/lexical_cast/include +BOOSTFLAG+=-I${BOOSTROOT}/numeric/conversion/include +BOOSTFLAG+=-I${BOOSTROOT}/container/include +BOOSTFLAG+=-I${BOOSTROOT}/math/include +BOOSTFLAG+=-I${BOOSTROOT}/fusion/include +BOOSTFLAG+=-I${BOOSTROOT}/typeof/include +BOOSTFLAG+=-I${BOOSTROOT}/uuid/include +BOOSTFLAG+=-I${BOOSTROOT}/smart_ptr/include +BOOSTFLAG+=-I${BOOSTROOT}/proto/include +BOOSTFLAG+=-DBOOST_COMPUTE_USE_CPP11 +RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} 
+#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages +KOKKOSDIR=${HOME}/KOKKOS/install-cuda +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos -ldl +RAJADIR= +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +THRUSTDIR=/usr/local/cuda-10.1/targets/ppc64le-linux/include +THRUSTFLAG=-I${THRUSTDIR} +# +# CBLAS for C++ DGEMM +# +BLASFLAG=-DESSL +CBLASFLAG=-DESSL +# +# CUDA flags +# +# Linux w/ NVIDIA CUDA +NVCC=/usr/local/cuda-10.1/bin/nvcc -arch=sm_70 +CUDAFLAGS=-g -O3 -std=c++11 --expt-extended-lambda From c6aca6b603d3da5d35ed716234977067031fef3a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Thu, 10 Oct 2019 11:24:18 -0400 Subject: [PATCH 228/245] C++ nstream error check precision fix (#423) --- Cxx11/nstream-cublas.cu | 1 + Cxx11/nstream-cuda.cu | 1 + Cxx11/nstream-device-thrust.cu | 1 + Cxx11/nstream-host-thrust.cc | 1 + Cxx11/nstream-kokkos.cc | 1 + Cxx11/nstream-occa.cc | 1 + Cxx11/nstream-opencl.cc | 3 ++- Cxx11/nstream-openmp-target.cc | 1 + Cxx11/nstream-openmp.cc | 1 + Cxx11/nstream-raja.cc | 1 + Cxx11/nstream-sycl-explicit.cc | 3 ++- Cxx11/nstream-sycl-usm.cc | 3 ++- Cxx11/nstream-sycl.cc | 3 ++- Cxx11/nstream.cc | 1 + 14 files changed, 18 insertions(+), 4 deletions(-) diff --git a/Cxx11/nstream-cublas.cu b/Cxx11/nstream-cublas.cu index 65989a3af..ffd8fa0b0 100644 --- a/Cxx11/nstream-cublas.cu +++ b/Cxx11/nstream-cublas.cu @@ -199,6 +199,7 @@ int main(int argc, char * argv[]) double epsilon=1.e-8; if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-cuda.cu b/Cxx11/nstream-cuda.cu index 4597021bb..594d75369 100644 --- a/Cxx11/nstream-cuda.cu +++ b/Cxx11/nstream-cuda.cu @@ 
-207,6 +207,7 @@ int main(int argc, char * argv[]) double epsilon=1.e-8; if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-device-thrust.cu b/Cxx11/nstream-device-thrust.cu index 8ecbee9bf..7f2ea6168 100644 --- a/Cxx11/nstream-device-thrust.cu +++ b/Cxx11/nstream-device-thrust.cu @@ -162,6 +162,7 @@ int main(int argc, char * argv[]) double epsilon(1.e-8); if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-host-thrust.cc b/Cxx11/nstream-host-thrust.cc index ac82f33d3..7b5123c06 100644 --- a/Cxx11/nstream-host-thrust.cc +++ b/Cxx11/nstream-host-thrust.cc @@ -160,6 +160,7 @@ int main(int argc, char * argv[]) double epsilon(1.e-8); if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-kokkos.cc b/Cxx11/nstream-kokkos.cc index be425e75b..0d09d4079 100644 --- a/Cxx11/nstream-kokkos.cc +++ b/Cxx11/nstream-kokkos.cc @@ -177,6 +177,7 @@ int main(int argc, char * argv[]) double epsilon(1.e-8); if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-occa.cc b/Cxx11/nstream-occa.cc index 6d584e893..ee2e15e61 100644 --- 
a/Cxx11/nstream-occa.cc +++ b/Cxx11/nstream-occa.cc @@ -188,6 +188,7 @@ int main(int argc, char * argv[]) double epsilon=1.e-8; if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-opencl.cc b/Cxx11/nstream-opencl.cc index 4ef40bd64..103980dc6 100644 --- a/Cxx11/nstream-opencl.cc +++ b/Cxx11/nstream-opencl.cc @@ -117,7 +117,7 @@ void run(cl::Context context, int iterations, size_t length) /// Analyze and output results ////////////////////////////////////////////////////////////////////// - T ar(0); + double ar(0); T br(2); T cr(2); for (auto i=0; i<=iterations; i++) { @@ -134,6 +134,7 @@ void run(cl::Context context, int iterations, size_t length) const double epsilon = (precision==64) ? 1.0e-8 : 1.0e-4; if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-openmp-target.cc b/Cxx11/nstream-openmp-target.cc index 8715962a8..6eb800e3e 100644 --- a/Cxx11/nstream-openmp-target.cc +++ b/Cxx11/nstream-openmp-target.cc @@ -165,6 +165,7 @@ int main(int argc, char * argv[]) double epsilon=1.e-8; if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-openmp.cc b/Cxx11/nstream-openmp.cc index f3ea9bbd8..1eb24321a 100644 --- a/Cxx11/nstream-openmp.cc +++ b/Cxx11/nstream-openmp.cc @@ -172,6 +172,7 @@ int main(int argc, char * argv[]) double epsilon=1.e-8; if 
(std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-raja.cc b/Cxx11/nstream-raja.cc index dcba4cbf2..f86ebaf33 100644 --- a/Cxx11/nstream-raja.cc +++ b/Cxx11/nstream-raja.cc @@ -175,6 +175,7 @@ int main(int argc, char * argv[]) double epsilon=1.e-8; if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-sycl-explicit.cc b/Cxx11/nstream-sycl-explicit.cc index ef2a0392b..aee3d8167 100644 --- a/Cxx11/nstream-sycl-explicit.cc +++ b/Cxx11/nstream-sycl-explicit.cc @@ -159,7 +159,7 @@ void run(sycl::queue & q, int iterations, size_t length) /// Analyze and output results ////////////////////////////////////////////////////////////////////// - T ar(0); + double ar(0); T br(2); T cr(2); for (int i=0; i<=iterations; ++i) { @@ -176,6 +176,7 @@ void run(sycl::queue & q, int iterations, size_t length) const double epsilon(1.e-8); if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-sycl-usm.cc b/Cxx11/nstream-sycl-usm.cc index c92a52bc9..d56df67bf 100644 --- a/Cxx11/nstream-sycl-usm.cc +++ b/Cxx11/nstream-sycl-usm.cc @@ -147,7 +147,7 @@ void run(sycl::queue & q, int iterations, size_t length) /// Analyze and output results ////////////////////////////////////////////////////////////////////// - T ar(0); + double ar(0); T br(2); T cr(2); for (int i=0; i<=iterations; 
++i) { @@ -164,6 +164,7 @@ void run(sycl::queue & q, int iterations, size_t length) const double epsilon(1.e-8); if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index bc52e6649..f9a891407 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -140,7 +140,7 @@ void run(sycl::queue & q, int iterations, size_t length) /// Analyze and output results ////////////////////////////////////////////////////////////////////// - T ar(0); + double ar(0); T br(2); T cr(2); for (int i=0; i<=iterations; ++i) { @@ -157,6 +157,7 @@ void run(sycl::queue & q, int iterations, size_t length) const double epsilon(1.e-8); if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; diff --git a/Cxx11/nstream.cc b/Cxx11/nstream.cc index 5673d3cf6..d97765c48 100644 --- a/Cxx11/nstream.cc +++ b/Cxx11/nstream.cc @@ -148,6 +148,7 @@ int main(int argc, char * argv[]) double epsilon=1.e-8; if (std::fabs(ar-asum)/asum > epsilon) { std::cout << "Failed Validation on output array\n" + << std::setprecision(16) << " Expected checksum: " << ar << "\n" << " Observed checksum: " << asum << std::endl; std::cout << "ERROR: solution did not validate" << std::endl; From 12f34a29007cc353ce7420cdcadd1f78a6007dc1 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 19 Oct 2019 18:57:26 -0700 Subject: [PATCH 229/245] fix Kokkos CUDA issues on POWER9+V100 (#425) * add Kokkos README * update CUDA and Kokkos related things * add missing CPPFLAGS --- Cxx11/Makefile | 2 +- common/KOKKOS.md | 17 +++++++++++++++++ 
common/make.defs.ibmp9nv | 6 +++--- 3 files changed, 21 insertions(+), 4 deletions(-) create mode 100644 common/KOKKOS.md diff --git a/Cxx11/Makefile b/Cxx11/Makefile index 596c87793..e97c3fd2c 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -195,7 +195,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h ifeq ($(PRK_KOKKOS_BACKEND),Cuda) %-kokkos: %-kokkos.cc prk_util.h - ${KOKKOSDIR}/bin/nvcc_wrapper $(CUDAFLAGS) $< $(KOKKOSFLAG) -DUSE_KOKKOS -DPRK_KOKKOS_BACKEND=Cuda -o $@ + ${KOKKOSDIR}/bin/nvcc_wrapper $(CPPFLAGS) $(CUDAFLAGS) $< $(KOKKOSFLAG) -DUSE_KOKKOS -DPRK_KOKKOS_BACKEND=Cuda -o $@ else %-kokkos: %-kokkos.cc prk_util.h $(info PRK help: Set USE_PRK_KOKKOS_BACKEND={Threads,Serial,Cuda} when invoking make to not use OpenMP) diff --git a/common/KOKKOS.md b/common/KOKKOS.md new file mode 100644 index 000000000..4a069d0b4 --- /dev/null +++ b/common/KOKKOS.md @@ -0,0 +1,17 @@ +# Kokkos README + +## IBM POWER9 + NVIDIA V100 + +If you do not enable GPU arch >5, it fails at runtime. + +If you do not enable lambda support, `parallel_reduce` will not compile. + +``` +cmake .. 
-DKokkos_ENABLE_CUDA=True \ + -DCMAKE_CXX_COMPILER=$HOME/KOKKOS/git/bin/nvcc_wrapper \ + -DCMAKE_INSTALL_PREFIX=$HOME/KOKKOS/install-cuda \ + -DKokkos_ARCH_POWER9=ON \ + -DKokkos_ARCH_VOLTA70=ON \ + -DKokkos_ENABLE_CUDA_LAMBDA=ON \ + && make -j install +``` diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv index 3fd2badf8..c222c0ce3 100644 --- a/common/make.defs.ibmp9nv +++ b/common/make.defs.ibmp9nv @@ -103,7 +103,7 @@ RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} -DUSE_INTEL_PSTL -I./pstl/include ${RANGEFLAG} -Wno-\#pragma-messages KOKKOSDIR=${HOME}/KOKKOS/install-cuda -KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos -ldl +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib64 -lkokkoscore RAJADIR= RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/usr/local/cuda-10.1/targets/ppc64le-linux/include @@ -117,5 +117,5 @@ CBLASFLAG=-DESSL # CUDA flags # # Linux w/ NVIDIA CUDA -NVCC=/usr/local/cuda-10.1/bin/nvcc -arch=sm_70 -CUDAFLAGS=-g -O3 -std=c++11 --expt-extended-lambda +NVCC=/usr/local/cuda-10.1/bin/nvcc +CUDAFLAGS=-g -O3 -std=c++11 --expt-extended-lambda -arch=sm_70 From e1c5652520fb4077b5fbeb8cbb127c12d4930ef2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 19 Oct 2019 22:14:07 -0600 Subject: [PATCH 230/245] more NVCC fun --- common/make.defs.ibmp9nv | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv index c222c0ce3..0d59c4c3c 100644 --- a/common/make.defs.ibmp9nv +++ b/common/make.defs.ibmp9nv @@ -118,4 +118,7 @@ CBLASFLAG=-DESSL # # Linux w/ NVIDIA CUDA NVCC=/usr/local/cuda-10.1/bin/nvcc -CUDAFLAGS=-g -O3 -std=c++11 --expt-extended-lambda -arch=sm_70 +CUDAFLAGS=-g -O3 -std=c++11 +CUDAFLAGS+=--expt-extended-lambda +CUDAFLAGS+=-arch=sm_70 +CUDAFLAGS+=-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored From 
78d47c0be844b3a347e879b9365e6058ca6753c2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 19 Oct 2019 22:51:48 -0600 Subject: [PATCH 231/245] xlc suppress warning in OpenCL code 1500-029: (W) WARNING: subprogram cl::Platform::getDevices(cl_device_type, std::vector *) could not be inlined into cl::Context::Context(cl_device_type, cl_context_properties *, void (*)(const char *, const void *, ::size_t, void *), void *, cl_int *). --- common/make.defs.ibmp9nv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv index 0d59c4c3c..a9684645f 100644 --- a/common/make.defs.ibmp9nv +++ b/common/make.defs.ibmp9nv @@ -31,7 +31,7 @@ OFFLOADFLAG=-qoffload -qtgtarch=sm_70 #OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL # Linux OPENCLDIR=/usr -OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations +OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations -qsuppress=1500-029 # # SYCL flags # From 15a4cd6763cc683bb33cd3dae2083874706a1a93 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 19 Oct 2019 23:14:51 -0600 Subject: [PATCH 232/245] more IBM fixes, Boost stuff --- common/make.defs.ibmp9nv | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv index a9684645f..bd6d25d8c 100644 --- a/common/make.defs.ibmp9nv +++ b/common/make.defs.ibmp9nv @@ -15,7 +15,7 @@ CXX=xlc++_r -qlanglvl=extended1y # # Compiler flags # -DEFAULT_OPT_FLAGS=-O3 +DEFAULT_OPT_FLAGS=-O3 -qsuppress=1500-036 # # OpenMP flags # @@ -33,15 +33,6 @@ OFFLOADFLAG=-qoffload -qtgtarch=sm_70 OPENCLDIR=/usr OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations -qsuppress=1500-029 # -# SYCL flags -# -# triSYCL -# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... 
-SYCLDIR=./triSYCL -#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) -SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) -SYCLFLAG=-std=gnu++17 -I$(SYCLDIR)/include -DTRISYCL -# # OCCA # #OCCADIR=${HOME}/prk-repo/Cxx11/occa @@ -54,7 +45,7 @@ TBBFLAG=-DUSE_TBB -I${TBBDIR}/include -L${TBBLIBDIR} -ltbb # # Parallel STL, Boost, etc. # -BOOSTROOT=${HOME}/boost_1_71_0/include +BOOSTROOT=${HOME}/boost/libs BOOSTFLAG= BOOSTFLAG+=-I${BOOSTROOT}/circular_buffer/include BOOSTFLAG+=-I${BOOSTROOT}/compute/include @@ -109,6 +100,17 @@ RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} THRUSTDIR=/usr/local/cuda-10.1/targets/ppc64le-linux/include THRUSTFLAG=-I${THRUSTDIR} # +# SYCL flags +# +# triSYCL +# https://github.com/triSYCL/triSYCL is header-only so just clone in Cxx11 directory... +SYCLDIR=./triSYCL +#SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) +#SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) +SYCLCXX=g++ -O3 -std=c++17 +SYCLFLAG=-I$(SYCLDIR)/include -DTRISYCL +SYCLFLAG+=$(BOOSTFLAG) +# # CBLAS for C++ DGEMM # BLASFLAG=-DESSL From 70d04eb2706e7b1b5e418d1ec44abd01ae38ea1c Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 19 Oct 2019 23:25:27 -0600 Subject: [PATCH 233/245] change stdlib check from error to warning --- Cxx11/prk_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cxx11/prk_util.h b/Cxx11/prk_util.h index ed798b0b1..a6a9af3c2 100644 --- a/Cxx11/prk_util.h +++ b/Cxx11/prk_util.h @@ -41,7 +41,7 @@ // Test standard library _after_ standard headers have been included... #if !defined(__NVCC__) && !defined(__PGI) && !defined(__ibmxl__) && (defined(__GLIBCXX__) || defined(_GLIBCXX_RELEASE) ) && !defined(_GLIBCXX_USE_CXX11_ABI) -# error You are using an ancient version GNU libstdc++. Either upgrade your GCC or tell ICC to use a newer version via the -gxx-name= option. +# warning You are using an ancient version GNU libstdc++. Either upgrade your GCC or tell ICC to use a newer version via the -gxx-name= option. 
#endif #if !(defined(__cplusplus) && (__cplusplus >= 201103L)) From f4a174013ff70648e39268d404bfe69beaae4adb Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 19 Oct 2019 23:26:01 -0600 Subject: [PATCH 234/245] disable boost-compute by default; use CXX to build OpenCL --- Cxx11/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index e97c3fd2c..a40207f76 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -69,7 +69,7 @@ ifneq ($(findstring pgc++,$(CXX)),pgc++) EXTRA += tbb pstl endif -all: sequential vector valarray openmp taskloop stl rangefor opencl sycl boost-compute $(EXTRA) +all: sequential vector valarray openmp taskloop stl rangefor opencl sycl $(EXTRA) #p2p: p2p-vector p2p-doacross-openmp p2p-hyperplane-openmp p2p-tasks-openmp p2p-openmp-target \ p2p-innerloop-vector-tbb p2p-vector-raja p2p-vector-tbb p2p-innerloop-opencl p2p-hyperplane-vector-tbb \ @@ -154,7 +154,7 @@ nstream-opencl: nstream-opencl.cc nstream.cl prk_util.h prk_opencl.h $(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ %-opencl: %-opencl.cc prk_util.h prk_opencl.h - $(SYCLCXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ + $(CXX) $(CXXFLAGS) $< $(OPENCLFLAGS) -o $@ %-sycl: %-sycl.cc prk_util.h $(SYCLCXX) $(CPPFLAGS) $(SYCLFLAGS) $< -o $@ From d897d6a5f15bcc8e635565da02331a8d633acedb Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sat, 19 Oct 2019 23:27:43 -0600 Subject: [PATCH 235/245] change flags for OpenCL and SYCL --- common/make.defs.ibmp9nv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/make.defs.ibmp9nv b/common/make.defs.ibmp9nv index bd6d25d8c..e4ee52866 100644 --- a/common/make.defs.ibmp9nv +++ b/common/make.defs.ibmp9nv @@ -31,7 +31,7 @@ OFFLOADFLAG=-qoffload -qtgtarch=sm_70 #OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL # Linux OPENCLDIR=/usr -OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations 
-qsuppress=1500-029 +OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL -Wno-deprecated-declarations -qsuppress=1500-029 -qstrict # # OCCA # @@ -107,7 +107,7 @@ THRUSTFLAG=-I${THRUSTDIR} SYCLDIR=./triSYCL #SYCLCXX=${CXX} ${OPENMPFLAG} $(DEFAULT_OPT_FLAGS) #SYCLCXX=${CXX} $(DEFAULT_OPT_FLAGS) -SYCLCXX=g++ -O3 -std=c++17 +SYCLCXX=g++ -O3 -std=gnu++11 SYCLFLAG=-I$(SYCLDIR)/include -DTRISYCL SYCLFLAG+=$(BOOSTFLAG) # From 11dadb9f47cf80a2cd5a9ee2bdfe2df25995d83d Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 20 Oct 2019 10:52:48 -0700 Subject: [PATCH 236/245] RAJA docs (WIP) --- common/RAJA.md | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 common/RAJA.md diff --git a/common/RAJA.md b/common/RAJA.md new file mode 100644 index 000000000..3b5ce0e81 --- /dev/null +++ b/common/RAJA.md @@ -0,0 +1,11 @@ +# RAJA README + +## IBM POWER9 + NVIDIA V100 + +``` +cmake .. -DCMAKE_INSTALL_PREFIX=$HOME/RAJA/install-cuda \ + -DCMAKE_CXX_COMPILER=xlc++_r -DCMAKE_C_COMPILER=xlc_r \ + -DENABLE_OPENMP=On -DENABLE_TARGET_OPENMP=On -DOpenMP_CXX_FLAGS="-qsmp -qoffload" \ + -DENABLE_CUDA=On -DCUDA_ARCH=sm_70 + && make -j install +``` From ac48fbcede5b216d5d29b4dfcb175652ded1aacc Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 20 Oct 2019 10:56:42 -0700 Subject: [PATCH 237/245] Update RAJA.md --- common/RAJA.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/RAJA.md b/common/RAJA.md index 3b5ce0e81..abba1b578 100644 --- a/common/RAJA.md +++ b/common/RAJA.md @@ -9,3 +9,6 @@ cmake .. 
-DCMAKE_INSTALL_PREFIX=$HOME/RAJA/install-cuda \ -DENABLE_CUDA=On -DCUDA_ARCH=sm_70 && make -j install ``` + +Optional extras: `-qsuppress=1500-030` or `-qmaxmem=-1` + From e584b4a23d590709aca441bb0e97c1ffe258529e Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 17 Dec 2019 09:13:14 -0800 Subject: [PATCH 238/245] multi-GPU CUBLAS DGEMM (#430) * add single-threaded multi-GPU CUBLAS * added MPI helper API in prk_mpi.h * add build system support for MPI+CUBLAS * add option to use specific number of GPUs CUBLAS multi-GPU support is weird. CUBLAS handles do not capture GPU device id. Tested and working on a 2 V100 x86 system. --- Cxx11/Makefile | 5 +- Cxx11/dgemm-mpi-cublas.cu | 272 ++++++++++++++++++++++++++ Cxx11/dgemm-multigpu-cublas.cu | 335 +++++++++++++++++++++++++++++++++ Cxx11/prk_cuda.h | 17 ++ Cxx11/prk_mpi.h | 115 +++++++++++ common/make.defs.cuda | 1 + 6 files changed, 744 insertions(+), 1 deletion(-) create mode 100644 Cxx11/dgemm-mpi-cublas.cu create mode 100644 Cxx11/dgemm-multigpu-cublas.cu create mode 100644 Cxx11/prk_mpi.h diff --git a/Cxx11/Makefile b/Cxx11/Makefile index a40207f76..bc2c8cd32 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -132,7 +132,7 @@ thrust: nstream-host-thrust nstream-device-thrust \ cuda: transpose-cuda -cublas: transpose-cublas nstream-cublas dgemm-cublas +cublas: transpose-cublas nstream-cublas dgemm-cublas dgemm-multigpu-cublas dgemm-mpi-cublas cblas: transpose-cblas dgemm-cblas @@ -213,6 +213,9 @@ endif %-cuda: %-cuda.cu prk_util.h prk_cuda.h $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -o $@ +%-mpi-cublas: %-mpi-cublas.cu prk_util.h prk_cuda.h prk_mpi.h + $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -lcublas $(MPIFLAGS) -o $@ + %-cublas: %-cublas.cu prk_util.h prk_cuda.h $(NVCC) $(CUDAFLAGS) $(CPPFLAGS) $< -lcublas -o $@ diff --git a/Cxx11/dgemm-mpi-cublas.cu b/Cxx11/dgemm-mpi-cublas.cu new file mode 100644 index 000000000..c78da594a --- /dev/null +++ b/Cxx11/dgemm-mpi-cublas.cu @@ -0,0 +1,272 @@ +/// +/// Copyright (c) 2018, 
Intel Corporation +/// +/// Redistribution and use in source and binary forms, with or without +/// modification, are permitted provided that the following conditions +/// are met: +/// +/// * Redistributions of source code must retain the above copyright +/// notice, this list of conditions and the following disclaimer. +/// * Redistributions in binary form must reproduce the above +/// copyright notice, this list of conditions and the following +/// disclaimer in the documentation and/or other materials provided +/// with the distribution. +/// * Neither the name of Intel Corporation nor the names of its +/// contributors may be used to endorse or promote products +/// derived from this software without specific prior written +/// permission. +/// +/// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +/// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +/// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +/// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +/// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +/// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +/// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +/// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +/// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +/// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +/// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +/// POSSIBILITY OF SUCH DAMAGE. 
+ +////////////////////////////////////////////////////////////////////// +/// +/// NAME: dgemm +/// +/// PURPOSE: This program tests the efficiency with which a dense matrix +/// dense multiplication is carried out +/// +/// USAGE: The program takes as input the matrix order, +/// the number of times the matrix-matrix multiplication +/// is carried out, and, optionally, a tile size for matrix +/// blocking +/// +/// <# iterations> +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// cblasDgemm() +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_cuda.h" +#include "prk_mpi.h" + +__global__ void init(int order, double * A, double * B, double * C) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + if ((i "; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + if (me == 0) { + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + } + + cublasHandle_t h; + prk::CUDA::check( cublasCreate(&h) ); + + const int tile_size = 32; + dim3 dimGrid(prk::divceil(order,tile_size),prk::divceil(order,tile_size),1); + dim3 dimBlock(tile_size, tile_size, 1); + + cuda.checkDims(dimBlock, dimGrid); + + 
////////////////////////////////////////////////////////////////////// + // Allocate space for matrices + ////////////////////////////////////////////////////////////////////// + + double dgemm_time(0); + + const size_t nelems = (size_t)order * (size_t)order; + const size_t bytes = nelems * sizeof(double); + + // host buffers + double * h_c; + prk::CUDA::check( cudaMallocHost((void**)&h_c, bytes) ); + + // device buffers + double * d_a; + double * d_b; + double * d_c; + prk::CUDA::check( cudaMalloc((void**)&d_a, bytes) ); + prk::CUDA::check( cudaMalloc((void**)&d_b, bytes) ); + prk::CUDA::check( cudaMalloc((void**)&d_c, bytes) ); + + init<<>>(order, d_a, d_b, d_c); + + { + for (auto iter = 0; iter<=iterations; iter++) { + + if (iter==1) { + prk::MPI::barrier(); + dgemm_time = prk::wtime(); + } + + double alpha = 1.0; + double beta = 1.0; + prk::CUDA::check( cublasDgemm(h, + CUBLAS_OP_N, CUBLAS_OP_N, // opA, opB + order, order, order, // m, n, k + &alpha, // alpha + d_a, order, // A, lda + d_b, order, // B, ldb + &beta, // beta + d_c, order) ); // C, ldc + + prk::CUDA::check( cudaDeviceSynchronize() ); + } + prk::MPI::barrier(); + dgemm_time = prk::wtime() - dgemm_time; + } + + // copy output back to host + prk::CUDA::check( cudaMemcpyAsync(&(h_c[0]), d_c, bytes, cudaMemcpyDeviceToHost) ); + + prk::CUDA::check( cudaFree(d_c) ); + prk::CUDA::check( cudaFree(d_b) ); + prk::CUDA::check( cudaFree(d_a) ); + + prk::CUDA::check( cublasDestroy(h) ); + + prk::CUDA::check( cudaDeviceSynchronize() ); + + ////////////////////////////////////////////////////////////////////// + /// Analyze and output results + ////////////////////////////////////////////////////////////////////// + + const double epsilon = 1.0e-8; + const double forder = static_cast(order); + const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1); + double residuum(0); + const auto checksum = prk::reduce( &(h_c[0]), &(h_c[nelems]), 0.0); + residuum += 
std::abs(checksum-reference)/reference; + + // take the global max to make sure everyone passes... + residuum = prk::MPI::max(residuum); + +#ifndef VERBOSE + if (residuum >= epsilon) +#endif + { + for (int r=0; r <# iterations> [] +/// +/// The output consists of diagnostics to make sure the +/// algorithm worked, and of timing statistics. +/// +/// FUNCTIONS CALLED: +/// +/// Other than OpenMP or standard C functions, the following +/// functions are used in this program: +/// +/// cblasDgemm() +/// cublasDgemmStridedBatched() +/// +/// HISTORY: Written by Rob Van der Wijngaart, February 2009. +/// Converted to C++11 by Jeff Hammond, December, 2017. +/// +////////////////////////////////////////////////////////////////////// + +#include "prk_util.h" +#include "prk_cuda.h" + +__global__ void init(int order, const int matrices, double * A, double * B, double * C) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + for (int b=0; b [] []"; + } + + iterations = std::atoi(argv[1]); + if (iterations < 1) { + throw "ERROR: iterations must be >= 1"; + } + + order = std::atoi(argv[2]); + if (order <= 0) { + throw "ERROR: Matrix Order must be greater than 0"; + } else if (order > std::floor(std::sqrt(INT_MAX))) { + throw "ERROR: matrix dimension too large - overflow risk"; + } + + if (argc>3) { + batches = std::atoi(argv[3]); + } + + if (argc>4) { + use_ngpu = std::atoi(argv[4]); + } + } + catch (const char * e) { + std::cout << e << std::endl; + return 1; + } + + std::cout << "Number of iterations = " << iterations << std::endl; + std::cout << "Matrix order = " << order << std::endl; + if (batches == 0) { + std::cout << "No batching" << std::endl; + } else if (batches < 0) { + std::cout << "Batch size = " << -batches << " (loop over legacy BLAS)" << std::endl; + } else if (batches > 0) { + std::cout << "Batch size = " << batches << " (batched BLAS)" << std::endl; + } + std::cout << "Number of GPUs to use = " << use_ngpu 
<< std::endl; + + int haz_ngpu = info.num_gpus(); + std::cout << "Number of GPUs found = " << haz_ngpu << std::endl; + + if (use_ngpu > haz_ngpu) { + std::cout << "You cannot use more GPUs (" << use_ngpu << ") than you have (" << haz_ngpu << ")" << std::endl; + } + + int ngpus = use_ngpu; + + std::vector contexts(ngpus); + for (int i=0; i h_c(ngpus,nullptr); + for (int i=0; i d_a(ngpus,nullptr); + std::vector d_b(ngpus,nullptr); + std::vector d_c(ngpus,nullptr); + for (int i=0; i>>(order, matrices, d_a[i], d_b[i], d_c[i]); + } + for (int i=0; i 0) { + prk_bgemm(contexts[i], order, matrices, d_a[i], d_b[i], d_c[i]); + } + } + for (int i=0; i(order); + const double reference = 0.25 * std::pow(forder,3) * std::pow(forder-1.0,2) * (iterations+1); + + double residuum(0); + for (int i=0; i +#include +#include +#include + +namespace prk +{ + namespace MPI + { + void check(int errorcode) + { + if (errorcode==MPI_SUCCESS) { + return; + } else { + int resultlen; + + char errorcode_string[MPI_MAX_ERROR_STRING]; + char errorclass_string[MPI_MAX_ERROR_STRING]; + + int errorclass; + MPI_Error_class(errorcode, &errorclass); + + MPI_Error_string(errorclass, errorclass_string, &resultlen); + std::cerr << "MPI error: class " << errorclass << ", " << errorclass_string << std::endl; + + MPI_Error_string(errorcode, errorcode_string, &resultlen); + std::cerr << "MPI error: code " << errorcode << ", " << errorcode_string << std::endl; + + MPI_Abort(MPI_COMM_WORLD, errorcode); + std::abort(); // unreachable + } + } + + class state { + + public: + state(void) { + int is_init, is_final; + MPI_Initialized(&is_init); + MPI_Finalized(&is_final); + if (!is_init && !is_final) { + MPI_Init(NULL,NULL); + } + } + + state(int argc, char** argv) { + int is_init, is_final; + MPI_Initialized(&is_init); + MPI_Finalized(&is_final); + if (!is_init && !is_final) { + MPI_Init(&argc,&argv); + } + } + + ~state(void) { + int is_init, is_final; + MPI_Initialized(&is_init); + MPI_Finalized(&is_final); + if 
(is_init && !is_final) { + MPI_Finalize(); + } + } + + }; + + int rank(MPI_Comm comm = MPI_COMM_WORLD) { + int rank; + prk::MPI::check( MPI_Comm_rank(comm,&rank) ); + return rank; + } + + int size(MPI_Comm comm = MPI_COMM_WORLD) { + int size; + prk::MPI::check( MPI_Comm_size(comm,&size) ); + return size; + } + + void barrier(MPI_Comm comm = MPI_COMM_WORLD) { + prk::MPI::check( MPI_Barrier(comm) ); + } + + double min(double in, MPI_Comm comm = MPI_COMM_WORLD) { + double out; + prk::MPI::check( MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_MIN, comm) ); + return out; + } + + double max(double in, MPI_Comm comm = MPI_COMM_WORLD) { + double out; + prk::MPI::check( MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_MAX, comm) ); + return out; + } + + double avg(double in, MPI_Comm comm = MPI_COMM_WORLD) { + double out; + prk::MPI::check( MPI_Allreduce(&in, &out, 1, MPI_DOUBLE, MPI_SUM, comm) ); + out /= prk::MPI::size(comm); + return out; + } + + void stats(double in, double * min, double * max, double * avg, MPI_Comm comm = MPI_COMM_WORLD) { + prk::MPI::check( MPI_Allreduce(&in, min, 1, MPI_DOUBLE, MPI_MIN, comm) ); + prk::MPI::check( MPI_Allreduce(&in, max, 1, MPI_DOUBLE, MPI_MAX, comm) ); + prk::MPI::check( MPI_Allreduce(&in, avg, 1, MPI_DOUBLE, MPI_SUM, comm) ); + *avg /= prk::MPI::size(comm); + } + + } // MPI namespace + +} // prk namespace + +#endif // PRK_MPI_HPP diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 0f5fafb75..9b1188db9 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -132,6 +132,7 @@ CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED # # We assume you have installed an implementation of MPI-3 that is in your path. 
MPICC=mpicc +MPIFLAGS=-lmpi # # Fortran 2008 coarrays # From 6eb5b36f65fce2c54eaf9cbbb70e97e8aaf8a320 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 15 Jan 2020 09:26:41 -0800 Subject: [PATCH 239/245] more NVCC fun (#431) From 93d58c2abc28ad560dcb079140a1bfacd5b2ad66 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 22 Jan 2020 13:38:35 -0800 Subject: [PATCH 240/245] oneAPI make.defs (#432) --- common/make.defs.oneapi | 116 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100644 common/make.defs.oneapi diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi new file mode 100644 index 000000000..4786d23ce --- /dev/null +++ b/common/make.defs.oneapi @@ -0,0 +1,116 @@ +# +# This file shows the Intel toolchain options for PRKs using +# OpenMP, MPI and/or Fortran coarrays only. +# +# Base compilers and language options +# +# We assume you have Intel MPI and have setup your environment with e.g. +# . /opt/intel/compilers_and_libraries/linux/bin/compilervars.sh intel64 +# in your .bashrc. +# +# C99 is required in some implementations. +CC=icx -std=c11 -pthread +#EXTRA_CLIBS=-lrt +# All of the Fortran code is written for the 2008 standard and requires preprocessing. +FC=ifx -fpp +# C++11 may not be required but does no harm here. +CXX=icpx -std=gnu++17 -pthread +# +# Compiler flags +# +# -xHOST is appropriate for most cases. +DEFAULT_OPT_FLAGS=-g -O3 -xHOST +# +# If you are compiling for KNL on a Xeon login node, use the following: +# DEFAULT_OPT_FLAGS=-g -O3 -xMIC-AVX512 +# +#DEFAULT_OPT_FLAGS+=-qopt-report=5 +# +# OpenMP flags +# +OPENMPFLAG=-fiopenmp +OPENMPSIMDFLAG=-fiopenmp +OFFLOADFLAG=-fopenmp-targets=spir64 +# +# OpenCL flags +# +# MacOS +#OPENCLFLAG=-framework OpenCL +# POCL +# http://portablecl.org/docs/html/using.html#linking-your-program-directly-with-pocl is not correct... 
+#OPENCLFLAG=-I/opt/pocl/latest/include -L/opt/pocl/latest/lib -lpoclu -I/opt/pocl/latest/share/pocl/include -lOpenCL +# Linux +OPENCLDIR=/etc/alternatives/opencl-intel-tools +OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL +# +# SYCL flags +# +SYCLCXX=dpcpp +SYCLFLAG=-fsycl -fsycl-unnamed-lambda +SYCLFLAG+=-std=c++17 -O3 +# +# +# OCCA +# +#OCCADIR=${HOME}/prk-repo/Cxx11/occa +# +# TBB +# +TBBFLAG=-tbb +#TBBFLAG=-tbb_preview -DTBB_PREVIEW_FLOW_GRAPH_TRACE +# +# Parallel STL, Boost, etc. +# +BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include +RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} +#RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include +PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG} +KOKKOSDIR=/opt/kokkos/intel +KOKKOSFLAG=-I${KOKKOSDIR}/include -L${KOKKOSDIR}/lib -lkokkos ${OPENMPFLAG} -ldl +RAJADIR=/opt/raja/intel +RAJAFLAG=-I${RAJADIR}/include -L${RAJADIR}/lib -lRAJA ${OPENMPFLAG} ${TBBFLAG} +THRUSTDIR=/opt/nvidia/thrust +THRUSTFLAG=-I${THRUSTDIR} ${RANGEFLAG} +# +# CBLAS for C++ DGEMM +# +#CBLASFLAG=-DACCELERATE -framework Accelerate -flax-vector-conversions +CBLASFLAG=-DMKL -mkl +# +# CUDA flags +# +# Mac w/ CUDA emulation via https://github.com/hughperkins/coriander +#NVCC=/opt/llvm/cocl/bin/cocl +# Linux w/ NVIDIA CUDA +NVCC=nvcc +CUDAFLAGS=-g -O3 -std=c++11 +CUDAFLAGS+=-arch=sm_50 +# https://github.com/tensorflow/tensorflow/issues/1066#issuecomment-200574233 +CUDAFLAGS+=-D_MWAITXINTRIN_H_INCLUDED +# +# ISPC +# +ISPC=ispc +ISPCFLAG=-O3 --target=host --opt=fast-math +# +# MPI +# +# We assume you have Intel MPI and have setup your environment with e.g. +# . /opt/intel/compilers_and_libraries/linux/mpi/intel64/bin/mpivars.sh +# in your .bashrc. +# +# mpiicc wraps icc. mpicc and mpigcc wrap gcc. 
+MPICC=mpiicc -std=c99 +# +# Fortran 2008 coarrays +# +# see https://github.com/ParRes/Kernels/blob/master/FORTRAN/README.md for details +# single-node +COARRAYFLAG=-coarray +# multi-node +# COARRAYFLAG=-coarray=distributed +# +# MEMKIND (used in C1z) +# +MEMKINDDIR=/home/parallels/PRK/deps +MEMKINDFLAGS=-I${MEMKINDDIR}/include -L${MEMKINDDIR}/lib -lmemkind -Wl,-rpath=${MEMKINDDIR}/lib From f33b188bfbc69c03c7b6823bf8523637063b9187 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 22 Jan 2020 13:47:02 -0800 Subject: [PATCH 241/245] disable prebuilt SYCL kernels w/ DPC++ (#433) * disable use of pre-build kernels for DPC++ /tmp/nstream-sycl-b825e1.o: In function `cl::sycl::detail::program_impl::build(std::string const&)': nstream-sycl.cc:(.text._ZN2cl4sycl6detail12program_impl5buildERKSs[_ZN2cl4sycl6detail12program_impl5buildERKSs]+0x3bc): undefined reference to `cl::sycl::detail::ProgramManager::getProgramBuildLog(_pi_program* const&)' clang++: error: linker command failed with exit code 1 (use -v to see invocation) --- Cxx11/prk_sycl.h | 2 +- common/make.defs.oneapi | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Cxx11/prk_sycl.h b/Cxx11/prk_sycl.h index cdd18d211..6e54a35c0 100644 --- a/Cxx11/prk_sycl.h +++ b/Cxx11/prk_sycl.h @@ -9,7 +9,7 @@ namespace sycl = cl::sycl; // prebuilt kernels are not required/not fully supported on hipSYCL and triSYCL -#if defined(TRISYCL) || defined(__HIPSYCL__) +#if defined(TRISYCL) || defined(__HIPSYCL__) || defined(DPCPP) #define PREBUILD_KERNEL 0 #else #define PREBUILD_KERNEL 1 diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index 4786d23ce..b736557d3 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -48,6 +48,7 @@ OPENCLFLAG=-I${OPENCLDIR} -L${OPENCLDIR}/lib64 -lOpenCL SYCLCXX=dpcpp SYCLFLAG=-fsycl -fsycl-unnamed-lambda SYCLFLAG+=-std=c++17 -O3 +SYCLFLAG+=-DDPCPP # # # OCCA From 0c6957ea933a2c4b2c2072514f7ae619e5ea69f2 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Fri, 
7 Feb 2020 18:45:49 -0600 Subject: [PATCH 242/245] improve ISPC w/ help from Jeff Amstutz (#434) Signed-off-by: Jeff Hammond --- C1z/transpose-ispc.c | 4 ++-- C1z/transpose.ispc | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/C1z/transpose-ispc.c b/C1z/transpose-ispc.c index 442a4f1e9..a659bad83 100644 --- a/C1z/transpose-ispc.c +++ b/C1z/transpose-ispc.c @@ -55,7 +55,7 @@ #include "prk_util.h" -int ispc_num_threads(void); +//int ispc_num_threads(void); void initialize(const int order, double A[], double B[]); void transpose(const int order, double A[], double B[]); void transpose_tiled(const int order, double A[], double B[], const int tile_size); @@ -93,7 +93,7 @@ int main(int argc, char * argv[]) // a negative tile size means no tiling of the local transpose if (tile_size <= 0) tile_size = order; - printf("ISPC threads = %d\n", ispc_num_threads()); + //printf("ISPC threads = %d\n", ispc_num_threads()); printf("Number of iterations = %d\n", iterations); printf("Matrix order = %d\n", order); printf("Tile size = %d\n", tile_size); diff --git a/C1z/transpose.ispc b/C1z/transpose.ispc index 086709d57..d2ff472fb 100644 --- a/C1z/transpose.ispc +++ b/C1z/transpose.ispc @@ -19,16 +19,14 @@ export void initialize(uniform const int order, } } -#if 0 +#if 1 export void transpose(uniform const int order, uniform double A[], uniform double B[]) { - foreach (i = 0 ... order) { - for (uniform int j=0;j Date: Sun, 9 Feb 2020 17:44:26 -0800 Subject: [PATCH 243/245] Create SYCL.md --- common/SYCL.md | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 common/SYCL.md diff --git a/common/SYCL.md b/common/SYCL.md new file mode 100644 index 000000000..f0edf26f1 --- /dev/null +++ b/common/SYCL.md @@ -0,0 +1,70 @@ +# How to Install SYCL + +## triSYCL + +See https://github.com/triSYCL/triSYCL. This is a header-only implementation, so you can use +any C++17 compiler (C++14 might be sufficient). 
You need Boost, while OpenMP or TBB are optional +for threaded parallelism on the CPU. + +## CodePlay ComputeCpp + +See https://www.codeplay.com/products/computesuite/computecpp. + +## Intel Data Parallel C++ + +This comes in two flavors. You can compile the open-source version on GitHub and use `clang++ -fsycl`, +or you can install oneAPI and use the `dpcpp` driver, which is a wrapper around `clang++ -fsycl`. + +### oneAPI Download + +See https://software.intel.com/en-us/articles/installation-guide-for-intel-oneapi-toolkits. + +### Linux packages + +See https://software.intel.com/en-us/articles/oneapi-repo-instructions. + +### Build from source + +See https://github.com/intel/llvm/blob/sycl/sycl/doc/GetStartedWithSYCLCompiler.md for details. + +The following is my automation once the repo is cloned. + +```sh +#!/bin/bash + +export SYCL_HOME=$HOME/ISYCL + +#cd $SYCL_HOME/llvm && time git checkout usmapi && time git pull +cd $SYCL_HOME/llvm && time git checkout sycl && time git pull + +rm -rf $SYCL_HOME/build + +mkdir -p $SYCL_HOME/build && \ + cd $SYCL_HOME/build && \ + time cmake \ + -DCMAKE_INSTALL_PREFIX=/opt/isycl \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_PROJECTS="clang;llvm-spirv;sycl" \ + -DLLVM_EXTERNAL_PROJECTS="llvm-spirv;sycl" \ + -DLLVM_EXTERNAL_SYCL_SOURCE_DIR=$SYCL_HOME/llvm/sycl \ + -DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR=$SYCL_HOME/llvm/llvm-spirv \ + -DLLVM_TOOL_SYCL_BUILD=ON \ + -DLLVM_TOOL_LLVM_SPIRV_BUILD=ON \ + $SYCL_HOME/llvm/llvm && \ + +time make -j4 sycl-toolchain + +time make -j4 sycl-toolchain install #DESTDIR=/opt/isycl +``` + +## hipSYCL + +See https://github.com/illuhad/hipSYCL/tree/master/doc for other options. + +### Spack + +https://github.com/spack/spack/pull/14051 is not merged yet but this works if you grab the PR. 
+ +```sh +./bin/spack install hipsycl +cuda +``` From bdaa73871f8be5e69f7e2ef9fbd5bd20145749f5 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Sun, 9 Feb 2020 17:57:45 -0800 Subject: [PATCH 244/245] update docs --- README.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7214b0a9e..02170261d 100644 --- a/README.md +++ b/README.md @@ -89,10 +89,10 @@ f = see footnotes | OpenMP tasks | y | y | y | y | | | | OpenMP target | y | y | y | y | | | | OpenCL 1.x | i | y | y | y | | | -| SYCL | | y | y | y | | | +| SYCL | i | y | y | y | | | | Boost.Compute | | | | y | | | | Parallel STL | y | y | y | y | | | -| Thrust | | | | y | | | +| Thrust | | | i | y | | | | TBB | y | y | y | y | | | | Kokkos | y | y | y | y | | | | RAJA | y | y | y | y | | | @@ -111,13 +111,19 @@ f = see footnotes | Parallelism | p2p | stencil | transpose | nstream | sparse | |----------------------|-----|---------|-----------|---------|--------| -| None | y | y | y | | | +| None | y | y | y | y | | | C11 threads | | | y | | | -| OpenMP | y | y | y | | | -| OpenMP tasks | y | y | y | | | -| OpenMP target | y | y | y | | | +| OpenMP | y | y | y | y | | +| OpenMP tasks | y | y | y | y | | +| OpenMP target | y | y | y | y | | | Cilk | | y | y | | | | ISPC | | | y | | | +| MPI | | | | y | | + +There are versions of nstream with OpenMP that support memory allocation +using [mmap](http://man7.org/linux/man-pages/man2/mmap.2.html) +and [memkind](https://github.com/memkind/memkind), which can be used +for testing novel memory systems, including persistent memory. 
* [ISPC](https://ispc.github.io/) From 465169dee3bc15a05c9d1efa394755b2b4d62749 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 10 Feb 2020 13:53:08 -0800 Subject: [PATCH 245/245] SYCL soft fail (#435) * remove MacOS Homebrew Boost path Signed-off-by: Jeff Hammond * SYCL codes shouldn't need ranges Signed-off-by: Jeff Hammond * do not exit on exceptions for device attempts Signed-off-by: Jeff Hammond --- Cxx11/Makefile | 2 +- Cxx11/nstream-sycl.cc | 31 ++++++++++++++++++++++++++----- common/make.defs.oneapi | 2 +- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/Cxx11/Makefile b/Cxx11/Makefile index bc2c8cd32..f40e0b8ac 100644 --- a/Cxx11/Makefile +++ b/Cxx11/Makefile @@ -46,7 +46,7 @@ PSTLFLAGS = $(PSTLFLAG) $(RANGEFLAGS) -DUSE_PSTL RAJAFLAGS = $(RAJAFLAG) -DUSE_RAJA THRUSTFLAGS = $(THRUSTFLAG) $(RANGEFLAGS) -DUSE_THRUST KOKKOSFLAGS = $(KOKKOSFLAG) $(KOKKOS_BACKEND_FLAG) $(RANGEFLAGS) -DUSE_KOKKOS -SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 $(RANGEFLAGS) +SYCLFLAGS = $(SYCLFLAG) -DUSE_SYCL -DUSE_2D_INDEXING=0 ORNLACCFLAGS = $(ORNLACCFLAG) ifdef OCCADIR diff --git a/Cxx11/nstream-sycl.cc b/Cxx11/nstream-sycl.cc index f9a891407..096a8f948 100644 --- a/Cxx11/nstream-sycl.cc +++ b/Cxx11/nstream-sycl.cc @@ -219,8 +219,8 @@ int main(int argc, char * argv[]) prk::opencl::listPlatforms(); #endif - try { #if SYCL_TRY_CPU_QUEUE + try { if (length<100000) { sycl::queue q(sycl::host_selector{}); prk::SYCL::print_device_platform(q); @@ -229,10 +229,22 @@ int main(int argc, char * argv[]) } else { std::cout << "Skipping host device since it is too slow for large problems" << std::endl; } + } + catch (sycl::exception & e) { + std::cout << e.what() << std::endl; + prk::SYCL::print_exception_details(e); + } + catch (std::exception & e) { + std::cout << e.what() << std::endl; + } + catch (const char * e) { + std::cout << e << std::endl; + } #endif // CPU requires spir64 target #if SYCL_TRY_CPU_QUEUE + try { if (1) { sycl::queue 
q(sycl::cpu_selector{}); prk::SYCL::print_device_platform(q); @@ -242,10 +254,22 @@ int main(int argc, char * argv[]) run(q, iterations, length); } } + } + catch (sycl::exception & e) { + std::cout << e.what() << std::endl; + prk::SYCL::print_exception_details(e); + } + catch (std::exception & e) { + std::cout << e.what() << std::endl; + } + catch (const char * e) { + std::cout << e << std::endl; + } #endif // NVIDIA GPU requires ptx64 target #if SYCL_TRY_GPU_QUEUE + try { if (1) { sycl::queue q(sycl::gpu_selector{}); prk::SYCL::print_device_platform(q); @@ -262,21 +286,18 @@ int main(int argc, char * argv[]) } } } -#endif } catch (sycl::exception & e) { std::cout << e.what() << std::endl; prk::SYCL::print_exception_details(e); - return 1; } catch (std::exception & e) { std::cout << e.what() << std::endl; - return 1; } catch (const char * e) { std::cout << e << std::endl; - return 1; } +#endif return 0; } diff --git a/common/make.defs.oneapi b/common/make.defs.oneapi index b736557d3..edb4a8274 100644 --- a/common/make.defs.oneapi +++ b/common/make.defs.oneapi @@ -62,7 +62,7 @@ TBBFLAG=-tbb # # Parallel STL, Boost, etc. # -BOOSTFLAG=-I/usr/local/Cellar/boost/1.65.1/include +BOOSTFLAG= RANGEFLAG=-DUSE_BOOST_IRANGE ${BOOSTFLAG} #RANGEFLAG=-DUSE_RANGES_TS -I./range-v3/include PSTLFLAG=${OPENMPSIMDFLAG} ${TBBFLAG} ${RANGEFLAG}