
Parallel CSF construction (#31)

The `ccp/` sources are moved to `thread_partition.c`/`thread_partition.h`, and CSF construction is parallelized with OpenMP.
ShadenSmith committed Nov 25, 2017
1 parent 97a07a4 commit 99fb5f8a27b75005fc7b9bd0d4aaa0ff5159dec3
Showing with 411 additions and 230 deletions.
  1. +0 −58 src/ccp/ccp.h
  2. +152 −85 src/csf.c
  3. +72 −43 src/{ccp/ccp.c → thread_partition.c}
  4. +83 −0 src/thread_partition.h
  5. +104 −44 tests/{ccp_test.c → thread_partition_test.c}
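The hunks below call `partition_simple()`, which presumably lives in the new `thread_partition.c`/`.h` and is not shown in the excerpt on this page. A minimal sketch of the contract the call sites rely on, namely `nthreads+1` boundaries that split `nnz` items into contiguous, nearly equal ranges (hypothetical code, not the committed implementation):

```c
#include <stdlib.h>

typedef size_t idx_t; /* stand-in for SPLATT's idx_t */

/* Hypothetical even split: returns nparts+1 boundaries with
 * parts[0] == 0 and parts[nparts] == nitems; part p owns the
 * half-open range [parts[p], parts[p+1]). */
idx_t * partition_simple_sketch(idx_t const nitems, int const nparts)
{
  idx_t * parts = malloc((nparts + 1) * sizeof(*parts));
  for(int p = 0; p <= nparts; ++p) {
    /* multiply before dividing so the shares differ by at most one item */
    parts[p] = (nitems * (idx_t) p) / (idx_t) nparts;
  }
  return parts;
}
```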
src/ccp/ccp.h
This file was deleted (contents not shown).

src/csf.c (237 changed lines)
@@ -5,7 +5,8 @@
 #include "csf.h"
 #include "sort.h"
 #include "tile.h"
-#include "ccp/ccp.h"
+#include "util.h"
+#include "thread_partition.h"
 #include "io.h"
@@ -252,54 +253,91 @@ static void p_mk_outerptr(
 {
   idx_t const nnzstart = nnztile_ptr[tile_id];
   idx_t const nnzend = nnztile_ptr[tile_id+1];
+  assert(nnzstart < nnzend);
   idx_t const nnz = nnzend - nnzstart;
-  assert(nnzstart < nnzend);
+
+  /* grab sparsity pattern */
+  csf_sparsity * const pt = ct->pt + tile_id;

   /* grap top-level indices */
   idx_t const * const restrict ttind =
       nnzstart + tt->ind[csf_depth_to_mode(ct, 0)];

-  /* count fibers */
-  idx_t nfibs = 1;
-  for(idx_t x=1; x < nnz; ++x) {
-    assert(ttind[x-1] <= ttind[x]);
-    if(ttind[x] != ttind[x-1]) {
-      ++nfibs;
-    }
-  }
-  ct->pt[tile_id].nfibs[0] = nfibs;
-  assert(nfibs <= ct->dims[csf_depth_to_mode(ct, 0)]);
+  /* partition among threads */
+  int const nthreads = splatt_omp_get_max_threads();
+  idx_t * thread_parts = partition_simple(nnz, nthreads);
+  idx_t * thread_nfibs = splatt_malloc((nthreads+1) * sizeof(*thread_nfibs));

-  /* grab sparsity pattern */
-  csf_sparsity * const pt = ct->pt + tile_id;
+  /* Fibers are counted by differing indices -- count at least one fiber */
+  thread_nfibs[0] = 1;

-  pt->fptr[0] = splatt_malloc((nfibs+1) * sizeof(**(pt->fptr)));
-  if(ct->ntiles > 1) {
-    pt->fids[0] = splatt_malloc(nfibs * sizeof(**(pt->fids)));
-  } else {
-    pt->fids[0] = NULL;
-  }
+  #pragma omp parallel
+  {
+    int const tid = splatt_omp_get_thread_num();
+    idx_t const nnz_start = SS_MAX(thread_parts[tid], 1); /* skip first nz */
+    idx_t const nnz_end = thread_parts[tid+1];

-  idx_t * const restrict fp = pt->fptr[0];
-  idx_t * const restrict fi = pt->fids[0];
-  fp[0] = 0;
-  if(fi != NULL) {
-    fi[0] = ttind[0];
-  }
+    /* count fibers in each thread's partition */
+    idx_t local_nfibs = 0;
+    for(idx_t x=nnz_start; x < nnz_end; ++x) {
+      assert(ttind[x-1] <= ttind[x]);
+      if(ttind[x] != ttind[x-1]) {
+        ++local_nfibs;
+      }
+    }
+    thread_nfibs[tid+1] = local_nfibs; /* +1 for prefix sum */

-  idx_t nfound = 1;
-  for(idx_t n=1; n < nnz; ++n) {
-    /* check for end of outer index */
-    if(ttind[n] != ttind[n-1]) {
-      if(fi != NULL) {
-        fi[nfound] = ttind[n];
-      }
-      fp[nfound++] = n;
-    }
-  }
-  fp[nfibs] = nnz;
+    #pragma omp barrier
+    #pragma omp single
+    {
+      /* prefix sum on # fibers */
+      for(int t=0; t < nthreads; ++t) {
+        thread_nfibs[t+1] += thread_nfibs[t];
+      }
+      idx_t const nfibs = thread_nfibs[nthreads];
+
+      ct->pt[tile_id].nfibs[0] = nfibs;
+      assert(nfibs <= ct->dims[csf_depth_to_mode(ct, 0)]);
+
+      pt->fptr[0] = splatt_malloc((nfibs+1) * sizeof(**(pt->fptr)));
+      /* only store top-level fids if we are tiling or there are gaps */
+      if((ct->ntiles > 1) || (tt->dims[csf_depth_to_mode(ct, 0)] != nfibs)) {
+        pt->fids[0] = splatt_malloc(nfibs * sizeof(**(pt->fids)));
+        pt->fids[0][0] = ttind[0];
+      } else {
+        pt->fids[0] = NULL;
+      }
+
+      pt->fptr[0][0] = 0;
+      pt->fptr[0][nfibs] = nnz;
+    } /* implied barrier */
+
+    idx_t * const restrict fp = pt->fptr[0];
+    idx_t * const restrict fi = pt->fids[0];
+
+    /* go back over non-zeros and mark fptr and fids */
+    idx_t nfound = thread_nfibs[tid];
+    if(fi == NULL) {
+      for(idx_t n=nnz_start; n < nnz_end; ++n) {
+        /* check for end of outer index */
+        if(ttind[n] != ttind[n-1]) {
+          fp[nfound++] = n;
+        }
+      }
+    } else {
+      for(idx_t n=nnz_start; n < nnz_end; ++n) {
+        /* check for end of outer index */
+        if(ttind[n] != ttind[n-1]) {
+          fi[nfound] = ttind[n];
+          fp[nfound++] = n;
+        }
+      }
+    }
+  } /* end omp parallel */
+
+  splatt_free(thread_parts);
+  splatt_free(thread_nfibs);
 }
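The parallelized `p_mk_outerptr()` above follows a two-pass scheme: each thread counts the fiber starts in its nonzero range, an `omp single` turns the per-thread counts into prefix-sum offsets and does the allocations, and a second pass lets every thread write its portion of `fptr`/`fids` with no further synchronization. The self-contained sketch below illustrates that pattern on a plain sorted index array; the variable names and the even nonzero split are illustrative, not the committed code:

```c
/* Sketch of the count -> prefix-sum -> fill pattern.  Build with: cc -fopenmp */
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
  /* sorted top-level indices of 10 nonzeros (4 distinct "fibers") */
  size_t const ind[] = {0,0,0,1,1,4,4,4,4,7};
  size_t const nnz = sizeof(ind) / sizeof(*ind);

  int const nthreads = omp_get_max_threads();
  size_t * counts = calloc(nthreads + 1, sizeof(*counts));
  size_t * fptr = NULL;
  size_t nfibs = 0;

  counts[0] = 1; /* the first nonzero always starts a fiber */

  #pragma omp parallel
  {
    int const tid = omp_get_thread_num();
    /* even split of the nonzeros; skip x=0, it is counted above */
    size_t start = (nnz * (size_t) tid) / nthreads;
    size_t const end = (nnz * (size_t) (tid+1)) / nthreads;
    if(start == 0) start = 1;

    /* pass 1: count fiber starts in my range */
    size_t local = 0;
    for(size_t x = start; x < end; ++x) {
      if(ind[x] != ind[x-1]) ++local;
    }
    counts[tid+1] = local;

    #pragma omp barrier
    #pragma omp single
    {
      for(int t=0; t < nthreads; ++t) {   /* prefix sum -> write offsets */
        counts[t+1] += counts[t];
      }
      nfibs = counts[nthreads];
      fptr = malloc((nfibs + 1) * sizeof(*fptr));
      fptr[0] = 0;
      fptr[nfibs] = nnz;
    } /* implied barrier: fptr is allocated before any thread writes it */

    /* pass 2: fill my slots, starting at my prefix-sum offset */
    size_t slot = counts[tid];
    for(size_t x = start; x < end; ++x) {
      if(ind[x] != ind[x-1]) fptr[slot++] = x;
    }
  }

  for(size_t f=0; f < nfibs; ++f) {
    printf("fiber %zu: nonzeros [%zu, %zu)\n", f, fptr[f], fptr[f+1]);
  }
  free(fptr);
  free(counts);
  return 0;
}
```

The explicit barrier before the `single` guarantees every per-thread count is written before the prefix sum reads it, and the `single`'s implied barrier guarantees the allocation is visible before any thread begins the fill pass.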
@@ -337,55 +375,86 @@ static void p_mk_fptr(
   idx_t const * const restrict ttind =
       nnzstart + tt->ind[csf_depth_to_mode(ct, mode)];

   /* grab sparsity pattern */
   csf_sparsity * const pt = ct->pt + tile_id;

   /* we will edit this to point to the new fiber idxs instead of nnz */
   idx_t * const restrict fprev = pt->fptr[mode-1];

-  /* first count nfibers */
-  idx_t nfibs = 0;
-  /* foreach 'slice' in the previous dimension */
-  for(idx_t s=0; s < pt->nfibs[mode-1]; ++s) {
-    ++nfibs; /* one by default per 'slice' */
-    /* count fibers in current hyperplane*/
-    for(idx_t f=fprev[s]+1; f < fprev[s+1]; ++f) {
-      if(ttind[f] != ttind[f-1]) {
-        ++nfibs;
-      }
-    }
-  }
+  /* partition among threads */
+  int const nthreads = splatt_omp_get_max_threads();
+  idx_t * thread_parts = partition_simple(pt->nfibs[mode-1], nthreads);
+  idx_t * thread_nfibs = splatt_malloc((nthreads+1) * sizeof(*thread_nfibs));
+  thread_nfibs[0] = 0;

-  pt->nfibs[mode] = nfibs;
-  pt->fptr[mode] = splatt_malloc((nfibs+1) * sizeof(**(pt->fptr)));
-  pt->fids[mode] = splatt_malloc(nfibs * sizeof(**(pt->fids)));
+  #pragma omp parallel
+  {
+    int const tid = splatt_omp_get_thread_num();
+    idx_t const slice_start = thread_parts[tid];
+    idx_t const slice_end = thread_parts[tid+1];

-  idx_t * const restrict fp = pt->fptr[mode];
-  idx_t * const restrict fi = pt->fids[mode];
-  fp[0] = 0;
+    /* first count nfibers */
+    /* foreach 'slice' in the previous dimension */
+    idx_t local_nfibs = 0;
+    for(idx_t s=slice_start; s < slice_end; ++s) {
+      ++local_nfibs; /* one by default per 'slice' */
+      /* count fibers in current hyperplane*/
+      for(idx_t f=fprev[s]+1; f < fprev[s+1]; ++f) {
+        if(ttind[f] != ttind[f-1]) {
+          ++local_nfibs;
+        }
+      }
+    }
+    thread_nfibs[tid+1] = local_nfibs; /* +1 for prefix sum */
+
+    idx_t const fprev_end = fprev[slice_end];

-  /* now fill in fiber info */
-  idx_t nfound = 0;
-  for(idx_t s=0; s < pt->nfibs[mode-1]; ++s) {
-    idx_t const start = fprev[s]+1;
-    idx_t const end = fprev[s+1];
-
-    /* mark start of subtree */
-    fprev[s] = nfound;
-    fi[nfound] = ttind[start-1];
-    fp[nfound++] = start-1;
-
-    /* mark fibers in current hyperplane */
-    for(idx_t f=start; f < end; ++f) {
-      if(ttind[f] != ttind[f-1]) {
-        fi[nfound] = ttind[f];
-        fp[nfound++] = f;
-      }
-    }
-  }
+    #pragma omp barrier
+    #pragma omp single
+    {
+      /* prefix sum on # fibers */
+      for(int t=0; t < nthreads; ++t) {
+        thread_nfibs[t+1] += thread_nfibs[t];
+      }
+      idx_t const nfibs = thread_nfibs[nthreads];
+
+      pt->nfibs[mode] = nfibs;
+      pt->fptr[mode] = splatt_malloc((nfibs+1) * sizeof(**(pt->fptr)));
+      pt->fptr[mode][0] = 0;
+      pt->fids[mode] = splatt_malloc(nfibs * sizeof(**(pt->fids)));
+    } /* implied barrier */
+
+    idx_t * const restrict fp = pt->fptr[mode];
+    idx_t * const restrict fi = pt->fids[mode];
+
+    /* now fill in fiber info */
+    idx_t nfound = thread_nfibs[tid];
+    for(idx_t s=slice_start; s < slice_end; ++s) {
+      idx_t const start = fprev[s]+1;
+      idx_t const end = (s == slice_end - 1) ? fprev_end : fprev[s+1];
+
+      /* mark start of subtree */
+      fprev[s] = nfound;
+      fi[nfound] = ttind[start-1];
+      fp[nfound++] = start-1;
+
+      /* mark fibers in current hyperplane */
+      for(idx_t f=start; f < end; ++f) {
+        if(ttind[f] != ttind[f-1]) {
+          fi[nfound] = ttind[f];
+          fp[nfound++] = f;
+        }
+      }
+    }

-  /* mark end of last hyperplane */
-  fprev[pt->nfibs[mode-1]] = nfibs;
-  fp[nfibs] = nnz;
+    /* mark end of last hyperplane */
+    if(tid == nthreads - 1) {
+      fprev[pt->nfibs[mode-1]] = thread_nfibs[nthreads];
+      fp[thread_nfibs[nthreads]] = nnz;
+    }
+  } /* end omp parallel */
+
+  splatt_free(thread_parts);
+  splatt_free(thread_nfibs);
 }
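A subtle point in the hunk above: `fprev` (the previous level's `fptr`) is rewritten in place during the fill pass, and each thread's first `fprev[s] = nfound` store lands on the boundary slice that the previous thread still needs as its exclusive end. The new code therefore caches `fprev[slice_end]` into `fprev_end` before the `omp barrier`, so the read happens strictly before any fill-phase write, which only begins after the `omp single`'s implied barrier. A small sketch of that read-before-overwrite pattern (illustrative names, not the committed code):

```c
/* Sketch of the boundary hazard handled by fprev_end.  Each thread rewrites
 * prev[] in place, so the entry shared between two neighboring threads must
 * be read before any thread starts writing. */
#include <omp.h>
#include <stddef.h>

/* prev has bounds[nthreads]+1 entries; bounds has nthreads+1 entries */
void fill_in_place(size_t * const prev, size_t const * const bounds)
{
  #pragma omp parallel
  {
    int const tid = omp_get_thread_num();
    size_t const s_begin = bounds[tid];
    size_t const s_end = bounds[tid+1];

    /* read my exclusive end BEFORE the barrier: the next thread's first
     * write in the fill phase targets exactly prev[s_end] */
    size_t const my_end_val = prev[s_end];

    #pragma omp barrier

    /* fill phase: overwrite prev[] with new values */
    for(size_t s = s_begin; s < s_end; ++s) {
      size_t const old_start = prev[s];
      size_t const old_end = (s == s_end - 1) ? my_end_val : prev[s+1];
      prev[s] = old_start + old_end; /* placeholder for the real update */
    }
  }
}
```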
@@ -416,9 +485,9 @@ static void p_csf_alloc_untiled(
   pt->nfibs[nmodes-1] = ct->nnz;
   pt->fids[nmodes-1] = splatt_malloc(ct->nnz * sizeof(**(pt->fids)));
   pt->vals = splatt_malloc(ct->nnz * sizeof(*(pt->vals)));
-  memcpy(pt->fids[nmodes-1], tt->ind[csf_depth_to_mode(ct, nmodes-1)],
+  par_memcpy(pt->fids[nmodes-1], tt->ind[csf_depth_to_mode(ct, nmodes-1)],
       ct->nnz * sizeof(**(pt->fids)));
-  memcpy(pt->vals, tt->vals, ct->nnz * sizeof(*(pt->vals)));
+  par_memcpy(pt->vals, tt->vals, ct->nnz * sizeof(*(pt->vals)));

   /* setup a basic tile ptr for one tile */
   idx_t nnz_ptr[2];
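`par_memcpy()` replaces the serial `memcpy()` for the value and leaf-index copies; it is presumably declared in the newly included `util.h` and its body is not part of this excerpt. A minimal OpenMP sketch of what such a helper could look like, assuming a plain chunked copy (hypothetical, not the committed implementation):

```c
#include <omp.h>
#include <string.h>

/* Hypothetical parallel memcpy: split the buffer across threads and let
 * each thread copy one contiguous chunk with the normal memcpy(). */
void par_memcpy_sketch(void * const dst, void const * const src, size_t const bytes)
{
  #pragma omp parallel
  {
    int const tid = omp_get_thread_num();
    int const nthreads = omp_get_num_threads();

    size_t const start = (bytes * (size_t) tid) / nthreads;
    size_t const end = (bytes * (size_t) (tid+1)) / nthreads;

    memcpy((char *) dst + start, (char const *) src + start, end - start);
  }
}
```

Splitting one large copy across threads mainly pays off when a single core cannot saturate the machine's memory bandwidth, for example on multi-socket nodes.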
@@ -502,19 +571,19 @@ static void p_csf_alloc_densetile(
     pt->nfibs[leaves] = ptnnz;
     pt->fids[leaves] = splatt_malloc(ptnnz * sizeof(**(pt->fids)));
-    memcpy(pt->fids[leaves], tt->ind[csf_depth_to_mode(ct, leaves)] + startnnz,
+    par_memcpy(pt->fids[leaves], tt->ind[csf_depth_to_mode(ct, leaves)] + startnnz,
         ptnnz * sizeof(**(pt->fids)));
     pt->vals = splatt_malloc(ptnnz * sizeof(*(pt->vals)));
-    memcpy(pt->vals, tt->vals + startnnz, ptnnz * sizeof(*(pt->vals)));
+    par_memcpy(pt->vals, tt->vals + startnnz, ptnnz * sizeof(*(pt->vals)));

     /* create fptr entries for the rest of the modes */
     for(idx_t m=0; m < leaves; ++m) {
       p_mk_fptr(ct, tt, t, nnz_ptr, m);
     }
   }

-  free(nnz_ptr);
+  splatt_free(nnz_ptr);
 }
@@ -787,8 +856,6 @@ idx_t * csf_partition_1d(
   idx_t const tile_id,
   idx_t const nparts)
 {
-  idx_t * parts = splatt_malloc((nparts+1) * sizeof(*parts));
   idx_t const nslices = csf->pt[tile_id].nfibs[0];
   idx_t * weights = splatt_malloc(nslices * sizeof(*weights));
@@ -797,7 +864,8 @@ idx_t * csf_partition_1d(
     weights[i] = p_csf_count_nnz(csf->pt[tile_id].fptr, csf->nmodes, 0, i);
   }
-  partition_1d(weights, nslices, parts, nparts);
+  idx_t bneck;
+  idx_t * parts = partition_weighted(weights, nslices, nparts, &bneck);
   splatt_free(weights);

   return parts;
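`csf_partition_1d()` now delegates both the allocation of `parts` and the partitioning itself to `partition_weighted()`, which also reports a bottleneck value through `bneck` (presumably the weight of the heaviest part). The committed routine lives in `thread_partition.c` and is not shown on this page; the greedy sketch below only illustrates the calling convention and is not the actual algorithm:

```c
#include <stdlib.h>

typedef size_t idx_t; /* stand-in for SPLATT's idx_t */

/* Greedy sketch with the same calling convention as partition_weighted():
 * returns nparts+1 boundaries and stores the heaviest part's weight in
 * *bneck.  Illustrative only; the committed algorithm may differ. */
idx_t * partition_weighted_sketch(
    idx_t const * const weights,
    idx_t const nitems,
    idx_t const nparts,
    idx_t * const bneck)
{
  idx_t * parts = malloc((nparts + 1) * sizeof(*parts));

  idx_t total = 0;
  for(idx_t i = 0; i < nitems; ++i) {
    total += weights[i];
  }

  parts[0] = 0;
  *bneck = 0;

  idx_t i = 0;
  idx_t running = 0;
  for(idx_t p = 0; p < nparts; ++p) {
    /* advance until the running weight reaches this part's cumulative target */
    idx_t const cutoff = (total * (p + 1)) / nparts;
    idx_t part_weight = 0;
    while(i < nitems && running < cutoff) {
      running += weights[i];
      part_weight += weights[i];
      ++i;
    }
    parts[p + 1] = i;
    if(part_weight > *bneck) {
      *bneck = part_weight;
    }
  }
  parts[nparts] = nitems; /* the last part absorbs any zero-weight tail */

  return parts;
}
```

With weights taken from per-slice nonzero counts, as in the hunk above, the returned boundaries give each thread a contiguous slice range with roughly equal work.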
@@ -808,8 +876,6 @@ idx_t * csf_partition_tiles_1d(
   splatt_csf const * const csf,
   idx_t const nparts)
 {
-  idx_t * parts = splatt_malloc((nparts+1) * sizeof(*parts));
   idx_t const nmodes = csf->nmodes;
   idx_t const ntiles = csf->ntiles;
   idx_t * weights = splatt_malloc(ntiles * sizeof(*weights));
@@ -819,7 +885,8 @@ idx_t * csf_partition_tiles_1d(
     weights[i] = csf->pt[i].nfibs[nmodes-1];
   }
-  partition_1d(weights, ntiles, parts, nparts);
+  idx_t bneck;
+  idx_t * parts = partition_weighted(weights, ntiles, nparts, &bneck);
   splatt_free(weights);

   return parts;
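Both partitioners produce the same kind of boundary array that the construction code above consumes: entries `tid` and `tid+1` delimit the half-open range owned by thread `tid`. A minimal usage sketch with a hand-picked boundary array (illustrative values only):

```c
#include <omp.h>
#include <stdio.h>

int main(void)
{
  /* per-slice work estimates, e.g. nonzero counts per slice */
  size_t const weights[] = {5, 1, 1, 8, 2, 2, 4, 9};

  /* boundaries as a weighted partitioner might return them for 4 parts:
   * part p owns the half-open slice range [parts[p], parts[p+1]) */
  size_t const parts[] = {0, 3, 4, 7, 8};
  omp_set_num_threads(4);

  #pragma omp parallel
  {
    int const tid = omp_get_thread_num();
    size_t local = 0;
    for(size_t s = parts[tid]; s < parts[tid+1]; ++s) {
      local += weights[s];
    }
    printf("thread %d: slices [%zu, %zu), %zu units of work\n",
        tid, parts[tid], parts[tid+1], local);
  }
  return 0;
}
```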
