New Chapel #439

Open
wants to merge 14 commits into base: default
5 changes: 2 additions & 3 deletions .travis.yml
@@ -27,9 +27,8 @@ env:
# Optional: We do not need to test BUPC this thoroughly every time
#- PRK_TARGET=allupc UPC_IMPL=bupc GASNET_CONDUIT=mpi PRK_FLAGS="-Wc,-O3"
#- PRK_TARGET=allupc UPC_IMPL=bupc GASNET_CONDUIT=ofi PRK_FLAGS="-Wc,-O3"
-# Chapel kernels are not merged yet. Activate these when they are.
-#- PRK_TARGET=allchapel CHPL_COMM=none
-#- PRK_TARGET=allchapel CHPL_COMM=gasnet
+- PRK_TARGET=allchapel CHPL_COMM=none
+- PRK_TARGET=allchapel CHPL_COMM=gasnet
# HPX-3 kernels are not merged yet. Activate these when they are.
#- PRK_TARGET=allhpx3
# HPX-5 kernels are not merged yet. Activate these when they are.
48 changes: 48 additions & 0 deletions CHAPEL/Makefile
@@ -0,0 +1,48 @@
include ../common/Chapel.defs

CHPL = ${CHAPEL_PATH}chpl
CHPLFLAGS = --fast

# Pass --llvm to the compiler when an LLVM back-end is selected via CHPL_LLVM
ifeq ($(CHPL_LLVM),llvm)
$(info CHPL_LLVM=llvm)
CHPLFLAGS += --llvm
else
ifeq ($(CHPL_LLVM),system)
$(info CHPL_LLVM=system)
CHPLFLAGS += --llvm
endif
endif


.PHONY: serial bigthree allstencil all clean

all: bigthree dgemm nstream sparse

serial: p2p-serial-fast stencil-serial transpose-serial

allstencil: stencil-serial stencil-opt stencil-defaultdist stencil-blockdist stencil-stencildist

bigthree: p2p-serial-fast allstencil transpose-serial transpose

pic: pic.chpl random_draw.h random_draw.c
	$(CHPL) $(CHPLFLAGS) $< -o $@

stencil-defaultdist: stencil.chpl
	$(CHPL) $(CHPLFLAGS) $< -o $@

stencil-blockdist: stencil.chpl
	$(CHPL) $(CHPLFLAGS) $< -o $@

stencil-stencildist: stencil.chpl
	$(CHPL) $(CHPLFLAGS) $< -o $@

%: %.chpl
	$(CHPL) $(CHPLFLAGS) $< -o $@

clean:
	-rm -f p2p-serial-fast
	-rm -f stencil-opt stencil-serial
	-rm -f stencil-defaultdist stencil-blockdist stencil-stencildist
	-rm -f transpose-serial transpose
	-rm -f dgemm nstream pic sparse

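For reference, the generic %: %.chpl rule above builds each kernel with a plain "make <kernel>". Compile-time variants are selected through Chapel config params such as useBlockDist (declared in dgemm.chpl and nstream.chpl below), which the chpl compiler sets with its -s flag; the exact flags used for the stencil-*dist targets are not shown in this diff, so the following is only a minimal sketch:

    # Sketch: build nstream with the Block distribution enabled
    # (useBlockDist is a config param in nstream.chpl; -s sets it at compile time)
    make nstream CHPLFLAGS="--fast -suseBlockDist=true"
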
147 changes: 147 additions & 0 deletions CHAPEL/dgemm.chpl
@@ -0,0 +1,147 @@
/*
Chapel's parallel DGEMM implementation

Contributed by Engin Kayraklioglu (GWU)
*/
use Time;
use BlockDist;
use RangeChunk;

param PRKVERSION = "2.17";

config type dtype = real;

config param useBlockDist = false;

config const order = 10,
epsilon = 1e-8,
iterations = 100,
blockSize = 0,
debug = false,
validate = true,
correctness = false; // being run in start_test


// TODO current logic assumes order is divisible by blockSize. add that
// check

const vecRange = 0..#order;

const matrixSpace = {vecRange, vecRange};
const matrixDom = matrixSpace dmapped if useBlockDist then
new dmap(new Block(boundingBox=matrixSpace)) else
defaultDist;

var A: [matrixDom] dtype,
B: [matrixDom] dtype,
C: [matrixDom] dtype;

forall (i,j) in matrixDom {
A[i,j] = j;
B[i,j] = j;
C[i,j] = 0;
}

const nTasksPerLocale = here.maxTaskPar;

if !correctness {
writeln("Chapel Dense matrix-matrix multiplication");
writeln("Max parallelism = ", nTasksPerLocale);
writeln("Matrix order = ", order);
writeln("Blocking factor = ", if blockSize>0 then blockSize:string else "N/A");
writeln("Number of iterations = ", iterations);
writeln();
}

const refChecksum = (iterations+1) *
(0.25*order*order*order*(order-1.0)*(order-1.0));

var t = new Timer();

if blockSize == 0 {
for niter in 0..iterations {
if niter==1 then t.start();

forall (i,j) in matrixSpace do
for k in vecRange do
C[i,j] += A[i,k] * B[k,j];

}
t.stop();
}
else {
// Task-local arrays are necessary for the blocked implementation, so we
// use explicit coforalls over locales and tasks.
coforall l in Locales with (ref t) {
on l {
const bVecRange = 0..#blockSize;
const blockDom = {bVecRange, bVecRange};
const localDom = matrixDom.localSubdomain();

coforall tid in 0..#nTasksPerLocale with (ref t) {
const myChunk = chunk(localDom.dim(2), nTasksPerLocale, tid);

var AA: [blockDom] dtype,
BB: [blockDom] dtype,
CC: [blockDom] dtype;

for niter in 0..iterations {
if tid==0 && niter==1 then t.start();

for (jj,kk) in {myChunk by blockSize, vecRange by blockSize} {
const jMax = min(jj+blockSize-1, myChunk.high);
const kMax = min(kk+blockSize-1, vecRange.high);
const jRange = 0..jMax-jj;
const kRange = 0..kMax-kk;

// Instead of the unbounded ranges (0..) in the zips below, earlier
// perf tests used 0..#blockSize. That ran fine with --fast, but caused
// unequal iteration length errors without --fast if the order is not
// divisible by blockSize*numLocales*nTasksPerLocale.
for (jB, j) in zip(jj..jMax, 0..) do
for (kB, k) in zip(kk..kMax, 0..) do
BB[j,k] = B[kB,jB];

for ii in localDom.dim(1) by blockSize {
const iMax = min(ii+blockSize-1, localDom.dim(1).high);
const iRange = 0..iMax-ii;

for (iB, i) in zip(ii..iMax, 0..) do
for (kB, k) in zip(kk..kMax, 0..) do
AA[i,k] = A[iB, kB];

local {
for cc in CC do
cc = 0.0;

for (k,j,i) in {kRange, jRange, iRange} do
CC[i,j] += AA[i,k] * BB[j,k];

for (iB, i) in zip(ii..iMax, 0..) do
for (jB, j) in zip(jj..jMax, 0..) do
C[iB,jB] += CC[i,j];
}
}
}
}
}
}
}
t.stop();
}

if validate {
const checksum = + reduce C;
if abs(checksum-refChecksum)/refChecksum > epsilon then
halt("VALIDATION FAILED! Reference checksum = ", refChecksum,
" Checksum = ", checksum);
else
writeln("Validation successful");
}

if !correctness {
const nflops = 2.0*(order**3);
const avgTime = t.elapsed()/iterations;
writeln("Rate(MFlop/s) = ", 1e-6*nflops/avgTime, " Time : ", avgTime);
}
95 changes: 95 additions & 0 deletions CHAPEL/nstream.chpl
@@ -0,0 +1,95 @@
/*
Chapel's parallel Nstream implementation

Contributed by Engin Kayraklioglu (GWU)
*/

use Time;
use BlockDist;

param PRKVERSION = "2.17";

config param useBlockDist = false;

config const iterations = 100,
length = 100,
correctness = false, // being run in start_test
validate = true;

config var SCALAR = 3.0;

//
// Process and test input configs
//
if iterations < 1 then
halt("ERROR: iterations must be >= 1: ", iterations);

if length < 1 then
halt("ERROR: vector length must be >= 1: ", length);

// Domains
const space = {0..#length};
const vectorDom = space dmapped if useBlockDist then
new dmap(new Block(boundingBox=space)) else
defaultDist;

var A: [vectorDom] real,
B: [vectorDom] real,
C: [vectorDom] real;

if !correctness {
writeln("Parallel Research Kernels version ", PRKVERSION);
writeln("Serial stream triad: A = B + SCALAR*C");
writeln("Max parallelism = ", here.maxTaskPar);
writeln("Vector length = ", length);
writeln("Number of iterations = ", iterations);
}

// initialization
A = 0.0;
B = 2.0;
C = 2.0;

var timer = new Timer();

//
// Main loop
//
for iteration in 0..iterations {
if iteration == 1 then
timer.start(); //Start timer after a warmup lap

A += B+SCALAR*C;
}

// Timings
timer.stop();
var avgTime = timer.elapsed() / iterations;
timer.clear();

//
// Analyze and output results
//
if validate {
config const epsilon = 1.e-8;

var aj=0.0, bj=2.0, cj=2.0;
for 0..iterations do
aj += bj+SCALAR*cj;

aj = aj * length:real;

var asum = + reduce A;

if abs(aj-asum)/asum <= epsilon then
writeln("Validation successful");
else
halt("VALIDATION FAILED! Reference checksum = ", aj,
" Checksum = ", asum);
}

if !correctness {
const nbytes = 4 * 8 * length;
writeln("Rate (MB/s): ", 1.0E-06*nbytes/avgTime,
" Avg time (s): ",avgTime);
}
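
As a usage sketch, the config consts declared above (length, iterations, validate, correctness) and the config var SCALAR can be overridden on the compiled binary's command line, as is standard for Chapel programs; the sizes below are illustrative only:

    # Timed run with a larger vector
    ./nstream --length=50000000 --iterations=10
    # Quiet run, as used under start_test
    ./nstream --correctness=true
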