New Chapel #439

Open
wants to merge 14 commits into base: default
5 changes: 2 additions & 3 deletions .travis.yml
@@ -27,9 +27,8 @@ env:
# Optional: We do not need to test BUPC this thoroughly every time
#- PRK_TARGET=allupc UPC_IMPL=bupc GASNET_CONDUIT=mpi PRK_FLAGS="-Wc,-O3"
#- PRK_TARGET=allupc UPC_IMPL=bupc GASNET_CONDUIT=ofi PRK_FLAGS="-Wc,-O3"
-# Chapel kernels are not merged yet. Activate these when they are.
-#- PRK_TARGET=allchapel CHPL_COMM=none
-#- PRK_TARGET=allchapel CHPL_COMM=gasnet
+- PRK_TARGET=allchapel CHPL_COMM=none
+- PRK_TARGET=allchapel CHPL_COMM=gasnet
# HPX-3 kernels are not merged yet. Activate these when they are.
#- PRK_TARGET=allhpx3
# HPX-5 kernels are not merged yet. Activate these when they are.
48 changes: 48 additions & 0 deletions CHAPEL/Makefile
@@ -0,0 +1,48 @@
include ../common/Chapel.defs

CHPL = ${CHAPEL_PATH}chpl
CHPLFLAGS = --fast

# Pass --llvm to the compiler when an LLVM back-end is selected via CHPL_LLVM
ifeq ($(CHPL_LLVM),llvm)
$(info CHPL_LLVM=llvm)
CHPLFLAGS += --llvm
else
ifeq ($(CHPL_LLVM),system)
$(info CHPL_LLVM=system)
CHPLFLAGS += --llvm
endif
endif


.PHONY: serial bigthree allstencil all clean

all: bigthree dgemm nstream sparse

serial: p2p-serial-fast stencil-serial transpose-serial

allstencil: stencil-serial stencil-opt stencil-defaultdist stencil-blockdist stencil-stencildist

bigthree: p2p-serial-fast allstencil transpose-serial transpose

pic: pic.chpl random_draw.h random_draw.c
	$(CHPL) $(CHPLFLAGS) $< -o $@

stencil-defaultdist: stencil.chpl
	$(CHPL) $(CHPLFLAGS) $< -o $@

stencil-blockdist: stencil.chpl
	$(CHPL) $(CHPLFLAGS) $< -o $@

stencil-stencildist: stencil.chpl
	$(CHPL) $(CHPLFLAGS) $< -o $@

%: %.chpl
	$(CHPL) $(CHPLFLAGS) $< -o $@

clean:
	-rm -f p2p-serial-fast
	-rm -f stencil-opt stencil-serial
	-rm -f stencil-defaultdist stencil-blockdist stencil-stencildist
	-rm -f transpose-serial transpose
	-rm -f dgemm nstream pic sparse

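For reference, the generic %: %.chpl rule above builds each kernel with a plain "make <kernel>". Compile-time variants are selected through Chapel config params such as useBlockDist (declared in dgemm.chpl and nstream.chpl below), which the chpl compiler sets with its -s flag; the exact flags used for the stencil-*dist targets are not shown in this diff, so the following is only a minimal sketch:

    # Sketch: build nstream with the Block distribution enabled
    # (useBlockDist is a config param in nstream.chpl; -s sets it at compile time)
    make nstream CHPLFLAGS="--fast -suseBlockDist=true"
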
147 changes: 147 additions & 0 deletions CHAPEL/dgemm.chpl
@@ -0,0 +1,147 @@
/*
Chapel's parallel DGEMM implementation

Contributed by Engin Kayraklioglu (GWU)
*/
use Time;
use BlockDist;
use RangeChunk;

param PRKVERSION = "2.17";

config type dtype = real;

config param useBlockDist = false;

config const order = 10,
epsilon = 1e-8,
iterations = 100,
blockSize = 0,
debug = false,
validate = true,
correctness = false; // being run in start_test


// TODO current logic assumes order is divisible by blockSize. add that
// check

const vecRange = 0..#order;

const matrixSpace = {vecRange, vecRange};
const matrixDom = matrixSpace dmapped if useBlockDist then
new dmap(new Block(boundingBox=matrixSpace)) else
defaultDist;

var A: [matrixDom] dtype,
B: [matrixDom] dtype,
C: [matrixDom] dtype;

forall (i,j) in matrixDom {
A[i,j] = j;
B[i,j] = j;
C[i,j] = 0;
}

const nTasksPerLocale = here.maxTaskPar;

if !correctness {
writeln("Chapel Dense matrix-matrix multiplication");
writeln("Max parallelism = ", nTasksPerLocale);
writeln("Matrix order = ", order);
writeln("Blocking factor = ", if blockSize>0 then blockSize:string else "N/A");
writeln("Number of iterations = ", iterations);
writeln();
}

const refChecksum = (iterations+1) *
(0.25*order*order*order*(order-1.0)*(order-1.0));

var t = new Timer();

if blockSize == 0 {
for niter in 0..iterations {
if niter==1 then t.start();

forall (i,j) in matrixSpace do
for k in vecRange do
C[i,j] += A[i,k] * B[k,j];

}
t.stop();
}
else {
// Task-local arrays are necessary for the blocked implementation, so we
// use explicit coforalls over locales and tasks.
coforall l in Locales with (ref t) {
on l {
const bVecRange = 0..#blockSize;
const blockDom = {bVecRange, bVecRange};
const localDom = matrixDom.localSubdomain();

coforall tid in 0..#nTasksPerLocale with (ref t) {
const myChunk = chunk(localDom.dim(2), nTasksPerLocale, tid);

var AA: [blockDom] dtype,
BB: [blockDom] dtype,
CC: [blockDom] dtype;

for niter in 0..iterations {
if tid==0 && niter==1 then t.start();

for (jj,kk) in {myChunk by blockSize, vecRange by blockSize} {
const jMax = min(jj+blockSize-1, myChunk.high);
const kMax = min(kk+blockSize-1, vecRange.high);
const jRange = 0..jMax-jj;
const kRange = 0..kMax-kk;

// Instead of the unbounded ranges (0..) in the zips below, earlier
// perf tests used 0..#blockSize. That ran fine with --fast, but caused
// unequal iteration length errors without --fast if the order is not
// divisible by blockSize*numLocales*nTasksPerLocale.
for (jB, j) in zip(jj..jMax, 0..) do
for (kB, k) in zip(kk..kMax, 0..) do
BB[j,k] = B[kB,jB];

for ii in localDom.dim(1) by blockSize {
const iMax = min(ii+blockSize-1, localDom.dim(1).high);
const iRange = 0..iMax-ii;

for (iB, i) in zip(ii..iMax, 0..) do
for (kB, k) in zip(kk..kMax, 0..) do
AA[i,k] = A[iB, kB];

local {
for cc in CC do
cc = 0.0;

for (k,j,i) in {kRange, jRange, iRange} do
CC[i,j] += AA[i,k] * BB[j,k];

for (iB, i) in zip(ii..iMax, 0..) do
for (jB, j) in zip(jj..jMax, 0..) do
C[iB,jB] += CC[i,j];
}
}
}
}
}
}
}
t.stop();
}

if validate {
const checksum = + reduce C;
if abs(checksum-refChecksum)/refChecksum > epsilon then
halt("VALIDATION FAILED! Reference checksum = ", refChecksum,
" Checksum = ", checksum);
else
writeln("Validation successful");
}

if !correctness {
const nflops = 2.0*(order**3);
const avgTime = t.elapsed()/iterations;
writeln("Rate(MFlop/s) = ", 1e-6*nflops/avgTime, " Time : ", avgTime);
}
95 changes: 95 additions & 0 deletions CHAPEL/nstream.chpl
@@ -0,0 +1,95 @@
/*
Chapel's parallel Nstream implementation

Contributed by Engin Kayraklioglu (GWU)
*/

use Time;
use BlockDist;

param PRKVERSION = "2.17";

config param useBlockDist = false;

config const iterations = 100,
length = 100,
correctness = false, // being run in start_test
validate = true;

config var SCALAR = 3.0;

//
// Process and test input configs
//
if iterations < 1 then
halt("ERROR: iterations must be >= 1: ", iterations);

if length < 1 then
halt("ERROR: vector length must be >= 1: ", length);

// Domains
const space = {0..#length};
const vectorDom = space dmapped if useBlockDist then
new dmap(new Block(boundingBox=space)) else
defaultDist;

var A: [vectorDom] real,
B: [vectorDom] real,
C: [vectorDom] real;

if !correctness {
writeln("Parallel Research Kernels version ", PRKVERSION);
writeln("Serial stream triad: A = B + SCALAR*C");
writeln("Max parallelism = ", here.maxTaskPar);
writeln("Vector length = ", length);
writeln("Number of iterations = ", iterations);
}

// initialization
A = 0.0;
B = 2.0;
C = 2.0;

var timer = new Timer();

//
// Main loop
//
for iteration in 0..iterations {
if iteration == 1 then
timer.start(); //Start timer after a warmup lap

A += B+SCALAR*C;
}

// Timings
timer.stop();
var avgTime = timer.elapsed() / iterations;
timer.clear();

//
// Analyze and output results
//
if validate {
config const epsilon = 1.e-8;

var aj=0.0, bj=2.0, cj=2.0;
for 0..iterations do
aj += bj+SCALAR*cj;

aj = aj * length:real;

var asum = + reduce A;

if abs(aj-asum)/asum <= epsilon then
writeln("Validation successful");
else
halt("VALIDATION FAILED! Reference checksum = ", aj,
" Checksum = ", asum);
}

if !correctness {
const nbytes = 4 * 8 * length;
writeln("Rate (MB/s): ", 1.0E-06*nbytes/avgTime,
" Avg time (s): ",avgTime);
}
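
As a usage sketch, the config consts declared above (length, iterations, validate, correctness) and the config var SCALAR can be overridden on the compiled binary's command line, as is standard for Chapel programs; the sizes below are illustrative only:

    # Timed run with a larger vector
    ./nstream --length=50000000 --iterations=10
    # Quiet run, as used under start_test
    ./nstream --correctness=true
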