
static tiling (from ipdps17) ported. good?

ShadenSmith committed Feb 22, 2017
1 parent 5584c09 commit aeec200e563c52ff2ff97c55e4938acd47bb1faa
Showing with 449 additions and 68 deletions.
  1. +1 −1 LICENSE
  2. +26 −1 include/splatt/api_kernels.h
  3. +7 −1 src/bench.c
  4. +22 −6 src/ccp/ccp.c
  5. +5 −1 src/cpd.c
  6. +80 −0 src/csf.c
  7. +50 −0 src/csf.h
  8. +251 −55 src/mttkrp.c
  9. +2 −0 src/mttkrp.h
  10. +5 −3 tests/mttkrp_test.c
LICENSE
@@ -1,6 +1,6 @@
 The MIT License (MIT)

-Copyright (c) 2014-2016, Shaden Smith
+Copyright (c) 2014-2017, Shaden Smith

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
include/splatt/api_kernels.h
@@ -3,14 +3,23 @@
  * @brief Functions for performing tensor kernels (e.g., norm, MTTKRP, TTMc).
  * @author Shaden Smith <shaden@cs.umn.edu>
  * @version 2.0.0
- * @date 2016-05-10
+ * @date 2017-02-22
  */

 #ifndef SPLATT_SPLATT_KERNELS_H
 #define SPLATT_SPLATT_KERNELS_H

+typedef struct
+{
+  splatt_idx_t num_csf;
+  splatt_idx_t mode_csf_map[SPLATT_MAX_NMODES];
+  splatt_idx_t * tree_partition[SPLATT_MAX_NMODES];
+} splatt_mttkrp_ws;

 /*
  * KERNEL API
@@ -48,6 +57,22 @@ int splatt_mttkrp(
     splatt_val_t * const matout,
     double const * const options);

+/**
+ * @brief Allocate and fill a workspace to be used during MTTKRP.
+ *
+ * @param tensors The CSF tensor(s) to operate on.
+ * @param ncolumns The number of columns in the factor matrices.
+ * @param options The options array.
+ *
+ * @return A newly-allocated MTTKRP workspace.
+ */
+splatt_mttkrp_ws * splatt_mttkrp_alloc_ws(
+    splatt_csf const * const tensors,
+    splatt_idx_t const ncolumns,
+    double const * const options);
+
+/**
+ * @brief Free the memory allocated for an MTTKRP workspace.
+ *
+ * @param ws The workspace to free.
+ */
+void splatt_mttkrp_free_ws(
+    splatt_mttkrp_ws * const ws);

 /** @} */
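Judging by the field names, the new workspace records how many CSF representations were allocated (num_csf), which representation serves each mode (mode_csf_map), and a per-mode partition of the root-level trees among threads (tree_partition). The alloc/use/free pattern it implies is sketched below; this mirrors the bench.c and cpd.c call sites further down, with csf, mats, thds, and opts standing in for caller state the diff does not show.

/* Sketch only: assumes an existing CSF tensor, factor matrices, and
 * thread structures, set up as in the call sites below. */
splatt_mttkrp_ws * ws = splatt_mttkrp_alloc_ws(csf, nfactors, opts);

for(idx_t m = 0; m < csf->nmodes; ++m) {
  mttkrp_csf(csf, mats, m, thds, ws, opts);  /* ws is now a required argument */
}

splatt_mttkrp_free_ws(ws);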
src/bench.c
@@ -179,12 +179,16 @@ void bench_csf(
printf("## THREADS %" SPLATT_PF_IDX "\n", nthreads);
}
cpd_opts[SPLATT_OPTION_NTHREADS] = nthreads;
splatt_mttkrp_ws * ws = splatt_mttkrp_alloc_ws(cs, nfactors, cpd_opts);
for(idx_t i=0; i < niters; ++i) {
timer_fstart(&itertime);
/* time each mode */
for(idx_t m=0; m < tt->nmodes; ++m) {
timer_fstart(&modetime);
mttkrp_csf(cs, mats, m, thds, cpd_opts);
mttkrp_csf(cs, mats, m, thds, ws, cpd_opts);
timer_stop(&modetime);
printf(" mode %" SPLATT_PF_IDX " %0.3fs\n", m+1, modetime.seconds);
if(opts->write && t == nruns-1 && i == 0) {
@@ -199,6 +203,8 @@ void bench_csf(
printf(" its = %3"SPLATT_PF_IDX" (%0.3fs)\n", i+1, itertime.seconds);
}
splatt_mttkrp_free_ws(ws);
/* output load balance info */
if(nruns > 1 || nthreads > 1) {
thd_times(thds, threads[nruns-1]);
src/ccp/ccp.c
@@ -1,5 +1,6 @@
 /******************************************************************************
  * INCLUDES
  *****************************************************************************/
@@ -117,12 +118,27 @@ idx_t partition_1d(
   nprobes = 0;

-  /* use recursive bisectioning with 0 tolerance to get exact solution */
-  idx_t bottleneck = p_eps_rb_partition_1d(weights, nitems, parts, nparts, 0);
-
-  /* apply partitioning that we found */
-  bool success = lprobe(weights, nitems, parts, nparts, bottleneck);
-  assert(success == true);
+  idx_t bottleneck = 0;
+
+  /* actual partitioning */
+  if(nitems > nparts) {
+    /* use recursive bisectioning with 0 tolerance to get exact solution */
+    bottleneck = p_eps_rb_partition_1d(weights, nitems, parts, nparts, 0);
+
+    /* apply partitioning that we found */
+    bool success = lprobe(weights, nitems, parts, nparts, bottleneck);
+    assert(success == true);
+
+  /* Do a trivial partitioning. Silly, but this happens when tensors have
+   * short modes. */
+  } else {
+    for(idx_t p=0; p < nitems; ++p) {
+      parts[p] = p;
+      bottleneck = SS_MAX(bottleneck, weights[p]);
+    }
+    for(idx_t p=nitems; p <= nparts; ++p) {
+      parts[p] = nitems;
+    }
+  }

   timer_stop(&timers[TIMER_PART]);
   return bottleneck;
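For reference, parts here is a prefix array: part p owns items [parts[p], parts[p+1]). The new else-branch therefore gives each of the nitems items its own part and marks the remaining parts empty. A standalone toy program exercising just that branch (idx_t and SS_MAX are stubbed locally; the data is made up):

#include <stdio.h>

#define SS_MAX(a,b) ((a) > (b) ? (a) : (b))
typedef unsigned long idx_t;

int main(void)
{
  idx_t const nitems = 3, nparts = 4;
  idx_t const weights[] = {5, 2, 9};
  idx_t parts[4 + 1];
  idx_t bottleneck = 0;

  /* one item per part; track the heaviest item as the bottleneck */
  for(idx_t p=0; p < nitems; ++p) {
    parts[p] = p;
    bottleneck = SS_MAX(bottleneck, weights[p]);
  }
  /* remaining parts are empty: they start (and end) at nitems */
  for(idx_t p=nitems; p <= nparts; ++p) {
    parts[p] = nitems;
  }

  for(idx_t p=0; p < nparts; ++p) {
    printf("part %lu: items [%lu, %lu)\n", p, parts[p], parts[p+1]);
  }
  printf("bottleneck: %lu\n", bottleneck); /* prints 9 */
  return 0;
}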
src/cpd.c
@@ -300,6 +300,9 @@ double cpd_als_iterate(
   /* used as buffer space */
   aTa[MAX_NMODES] = mat_alloc(nfactors, nfactors);

+  /* mttkrp workspace */
+  splatt_mttkrp_ws * mttkrp_ws = splatt_mttkrp_alloc_ws(tensors, nfactors, opts);

   /* Compute input tensor norm */
   double oldfit = 0;
   double fit = 0;
@@ -321,7 +324,7 @@ double cpd_als_iterate(
       /* M1 = X * (C o B) */
       timer_start(&timers[TIMER_MTTKRP]);
-      mttkrp_csf(tensors, mats, m, thds, opts);
+      mttkrp_csf(tensors, mats, m, thds, mttkrp_ws, opts);
       timer_stop(&timers[TIMER_MTTKRP]);

 #if 0
@@ -373,6 +376,7 @@ double cpd_als_iterate(
   cpd_post_process(nfactors, nmodes, mats, lambda, thds, nthreads, rinfo);

   /* CLEAN UP */
+  splatt_mttkrp_free_ws(mttkrp_ws);
   for(idx_t m=0; m < nmodes; ++m) {
     mat_free(aTa[m]);
   }
src/csf.c
@@ -5,6 +5,7 @@
#include "csf.h"
#include "sort.h"
#include "tile.h"
#include "ccp/ccp.h"
#include "io.h"
@@ -66,6 +67,39 @@ void splatt_free_csf(
 /******************************************************************************
  * PRIVATE FUNCTIONS
  *****************************************************************************/

+/**
+ * @brief Count the nonzeros below a given node in a CSF tensor.
+ *
+ * @param fptr The adjacency pointer of the CSF tensor.
+ * @param nmodes The number of modes in the tensor.
+ * @param depth The depth of the node.
+ * @param fiber The id of the node.
+ *
+ * @return The nonzeros below fptr[depth][fiber].
+ */
+idx_t p_csf_count_nnz(
+    idx_t * * fptr,
+    idx_t const nmodes,
+    idx_t depth,
+    idx_t const fiber)
+{
+  if(depth == nmodes-1) {
+    return 1;
+  }
+
+  idx_t left = fptr[depth][fiber];
+  idx_t right = fptr[depth][fiber+1];
+  ++depth;
+
+  for(; depth < nmodes-1; ++depth) {
+    left = fptr[depth][left];
+    right = fptr[depth][right];
+  }
+
+  return right - left;
+}
+
+
 /**
  * @brief Find a permutation of modes that results in non-increasing mode size.
  *
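The chase works because fptr[d][f] stores the index of node f's first child one level down, so [fptr[d][f], fptr[d][f+1]) spans all of f's children; following both endpoints to the leaf level converts that range into a nonzero count in O(nmodes) steps, independent of subtree size. A toy 3-mode illustration with made-up adjacency data (not from SPLATT):

#include <stdio.h>

int main(void)
{
  /* 2 slices, 3 fibers, 5 nonzeros: slice 0 owns fibers [0,2),
   * fiber 0 owns nonzeros [0,2), fiber 1 owns [2,4), fiber 2 owns [4,5) */
  unsigned long fptr0[] = {0, 2, 3};
  unsigned long fptr1[] = {0, 2, 4, 5};
  unsigned long * fptr[] = {fptr0, fptr1};

  unsigned long const nmodes = 3;
  unsigned long depth = 0, fiber = 0;   /* count below slice 0 */

  unsigned long left  = fptr[depth][fiber];
  unsigned long right = fptr[depth][fiber+1];
  for(++depth; depth < nmodes-1; ++depth) {
    left  = fptr[depth][left];
    right = fptr[depth][right];
  }
  printf("nnz below slice 0: %lu\n", right - left); /* prints 4 */
  return 0;
}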
@@ -747,3 +781,49 @@ val_t csf_frobsq(
   return (val_t) norm;
 }

+
+idx_t * csf_partition_1d(
+    splatt_csf const * const csf,
+    idx_t const tile_id,
+    idx_t const nparts)
+{
+  idx_t * parts = splatt_malloc((nparts+1) * sizeof(*parts));
+
+  idx_t const nslices = csf->pt[tile_id].nfibs[0];
+  idx_t * weights = splatt_malloc(nslices * sizeof(*weights));
+
+  #pragma omp parallel for schedule(static)
+  for(idx_t i=0; i < nslices; ++i) {
+    weights[i] = p_csf_count_nnz(csf->pt[tile_id].fptr, csf->nmodes, 0, i);
+  }
+
+  partition_1d(weights, nslices, parts, nparts);
+
+  splatt_free(weights);
+  return parts;
+}
+
+
+idx_t * csf_partition_tiles_1d(
+    splatt_csf const * const csf,
+    idx_t const nparts)
+{
+  idx_t * parts = splatt_malloc((nparts+1) * sizeof(*parts));
+
+  idx_t const nmodes = csf->nmodes;
+  idx_t const ntiles = csf->ntiles;
+  idx_t * weights = splatt_malloc(ntiles * sizeof(*weights));
+
+  /* weight of a tile = its nonzero count (nfibs at the leaf level);
+   * index pt[i] rather than pt so each tile gets its own weight */
+  #pragma omp parallel for schedule(static)
+  for(idx_t i=0; i < ntiles; ++i) {
+    weights[i] = csf->pt[i].nfibs[nmodes-1];
+  }
+
+  partition_1d(weights, ntiles, parts, nparts);
+
+  splatt_free(weights);
+  return parts;
+}
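Callers of csf_partition_1d hand thread tid the contiguous slice range [parts[tid], parts[tid+1]). A hedged sketch of such a consumer (process_tile_slices is hypothetical; the real consumers live in the mttkrp.c changes, which this page leaves collapsed):

#include <omp.h>
#include "csf.h"   /* SPLATT-internal header; assumes the library's build tree */

/* Hypothetical consumer: each thread walks its own slice range of tile 0. */
static void process_tile_slices(splatt_csf const * const csf)
{
  idx_t * parts = csf_partition_1d(csf, 0, (idx_t) omp_get_max_threads());

  #pragma omp parallel
  {
    idx_t const tid = (idx_t) omp_get_thread_num();
    for(idx_t s = parts[tid]; s < parts[tid+1]; ++s) {
      /* ... process slice s ... */
    }
  }

  splatt_free(parts);
}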
src/csf.h
@@ -182,4 +182,54 @@ static inline idx_t csf_depth_to_mode(
+#define csf_partition_1d splatt_csf_partition_1d
+/**
+ * @brief Split the root nodes of a CSF tensor into 'nparts' partitions.
+ *
+ * @param csf The CSF tensor to partition.
+ * @param tile_id The tile whose root nodes are partitioned.
+ * @param nparts The number of partitions.
+ *
+ * @return An array of length (nparts+1) specifying the starts of each part.
+ */
+idx_t * csf_partition_1d(
+    splatt_csf const * const csf,
+    idx_t const tile_id,
+    idx_t const nparts);
+
+
+#define csf_partition_tiles_1d splatt_csf_partition_tiles_1d
+/**
+ * @brief Split the tiles of csf into 'nparts' partitions.
+ *
+ *        NOTE: This does not account for any mode-ordering of the tiles, and
+ *        instead treats them as a 1D resource.
+ *
+ * @param csf The tiled tensor to partition.
+ * @param nparts The number of partitions to compute.
+ *
+ * @return An array of length (nparts+1) specifying the starts of each part.
+ */
+idx_t * csf_partition_tiles_1d(
+    splatt_csf const * const csf,
+    idx_t const nparts);
+
+
+#define csf_count_nnz splatt_csf_count_nnz
+/**
+ * @brief Count the nonzeros below a given node in a CSF tensor.
+ *
+ * @param fptr The adjacency pointer of the CSF tensor.
+ * @param nmodes The number of modes in the tensor.
+ * @param depth The depth of the node.
+ * @param fiber The id of the node.
+ *
+ * @return The nonzeros below fptr[depth][fiber].
+ */
+idx_t csf_count_nnz(
+    idx_t * * fptr,
+    idx_t const nmodes,
+    idx_t depth,
+    idx_t const fiber);
+
 #endif