
static tiling (from ipdps17) ported. good?

ShadenSmith committed Apr 20, 2017
1 parent 5584c09 commit aeec200e563c52ff2ff97c55e4938acd47bb1faa
Showing with 449 additions and 68 deletions.
  1. +1 −1 LICENSE
  2. +26 −1 include/splatt/api_kernels.h
  3. +7 −1 src/bench.c
  4. +22 −6 src/ccp/ccp.c
  5. +5 −1 src/cpd.c
  6. +80 −0 src/csf.c
  7. +50 −0 src/csf.h
  8. +251 −55 src/mttkrp.c
  9. +2 −0 src/mttkrp.h
  10. +5 −3 tests/mttkrp_test.c

LICENSE
@@ -1,6 +1,6 @@
The MIT License (MIT)

-Copyright (c) 2014-2016, Shaden Smith
+Copyright (c) 2014-2017, Shaden Smith

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in

include/splatt/api_kernels.h
@@ -3,14 +3,23 @@
* @brief Functions for performing tensor kernels (e.g., norm, MTTKRP, TTMc).
* @author Shaden Smith <shaden@cs.umn.edu>
* @version 2.0.0
- * @date 2016-05-10
+ * @date 2017-02-22
*/



#ifndef SPLATT_SPLATT_KERNELS_H
#define SPLATT_SPLATT_KERNELS_H

+typedef struct
+{
+splatt_idx_t num_csf;
+
+splatt_idx_t mode_csf_map[SPLATT_MAX_NMODES];
+
+splatt_idx_t * tree_partition[SPLATT_MAX_NMODES];
+} splatt_mttkrp_ws;


/*
* KERNEL API
@@ -48,6 +57,22 @@ int splatt_mttkrp(
splatt_val_t * const matout,
double const * const options);


+splatt_mttkrp_ws * splatt_mttkrp_alloc_ws(
+splatt_csf const * const tensors,
+splatt_idx_t const ncolumns,
+double const * const options);
+
+
+/**
+* @brief Free the memory allocated for an MTTKRP workspace.
+*
+* @param ws The workspace to free.
+*/
+void splatt_mttkrp_free_ws(
+splatt_mttkrp_ws * const ws);
+
+
/** @} */
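Taken together with the src/csf.c changes below, the new splatt_mttkrp_ws bundles state that the tiled MTTKRP reuses across calls: num_csf CSF representations, a mode_csf_map that selects which representation serves each mode, and per-CSF tree_partition arrays (filled by csf_partition_1d) that assign root-level subtrees to threads. A minimal lifecycle sketch, mirroring the bench.c and cpd.c call sites in this commit; csf, mats, thds, nfactors, and opts are assumed to be set up as elsewhere in SPLATT, and mttkrp_csf is the internal kernel declared in src/mttkrp.h:

  /* Sketch only: allocate the workspace once, reuse it across modes
   * and iterations, then free it. Error handling omitted. */
  splatt_mttkrp_ws * ws = splatt_mttkrp_alloc_ws(csf, nfactors, opts);
  for(idx_t m = 0; m < csf->nmodes; ++m) {
    mttkrp_csf(csf, mats, m, thds, ws, opts);
  }
  splatt_mttkrp_free_ws(ws);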



src/bench.c
@@ -179,12 +179,16 @@ void bench_csf(
printf("## THREADS %" SPLATT_PF_IDX "\n", nthreads);
}

+cpd_opts[SPLATT_OPTION_NTHREADS] = nthreads;
+
+splatt_mttkrp_ws * ws = splatt_mttkrp_alloc_ws(cs, nfactors, cpd_opts);
+
for(idx_t i=0; i < niters; ++i) {
timer_fstart(&itertime);
/* time each mode */
for(idx_t m=0; m < tt->nmodes; ++m) {
timer_fstart(&modetime);
-mttkrp_csf(cs, mats, m, thds, cpd_opts);
+mttkrp_csf(cs, mats, m, thds, ws, cpd_opts);
timer_stop(&modetime);
printf(" mode %" SPLATT_PF_IDX " %0.3fs\n", m+1, modetime.seconds);
if(opts->write && t == nruns-1 && i == 0) {
@@ -199,6 +203,8 @@ void bench_csf(
printf(" its = %3"SPLATT_PF_IDX" (%0.3fs)\n", i+1, itertime.seconds);
}

+splatt_mttkrp_free_ws(ws);

/* output load balance info */
if(nruns > 1 || nthreads > 1) {
thd_times(thds, threads[nruns-1]);

src/ccp/ccp.c
@@ -1,5 +1,6 @@



/******************************************************************************
* INCLUDES
*****************************************************************************/
@@ -117,12 +118,27 @@ idx_t partition_1d(

nprobes = 0;

-/* use recursive bisectioning with 0 tolerance to get exact solution */
-idx_t bottleneck = p_eps_rb_partition_1d(weights, nitems, parts, nparts, 0);
-
-/* apply partitioning that we found */
-bool success = lprobe(weights, nitems, parts, nparts, bottleneck);
-assert(success == true);
+idx_t bottleneck = 0;
+
+/* actual partitioning */
+if(nitems > nparts) {
+/* use recursive bisectioning with 0 tolerance to get exact solution */
+bottleneck = p_eps_rb_partition_1d(weights, nitems, parts, nparts, 0);
+/* apply partitioning that we found */
+bool success = lprobe(weights, nitems, parts, nparts, bottleneck);
+assert(success == true);
+
+/* Do a trivial partitioning. Silly, but this happens when tensors have
+* short modes. */
+} else {
+for(idx_t p=0; p < nitems; ++p) {
+parts[p] = p;
+bottleneck = SS_MAX(bottleneck, weights[p]);
+}
+for(idx_t p=nitems; p <= nparts; ++p) {
+parts[p] = nitems;
+}
+}

timer_stop(&timers[TIMER_PART]);
return bottleneck;
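For example, with nitems = 3, nparts = 5, and weights = {4, 2, 7}, the trivial branch yields parts = {0, 1, 2, 3, 3, 3}: each of the first three parts gets exactly one item, parts 3 and 4 are empty, and the returned bottleneck is max(4, 2, 7) = 7.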

src/cpd.c
@@ -300,6 +300,9 @@ double cpd_als_iterate(
/* used as buffer space */
aTa[MAX_NMODES] = mat_alloc(nfactors, nfactors);

+/* mttkrp workspace */
+splatt_mttkrp_ws * mttkrp_ws = splatt_mttkrp_alloc_ws(tensors,nfactors,opts);

/* Compute input tensor norm */
double oldfit = 0;
double fit = 0;
@@ -321,7 +324,7 @@ double cpd_als_iterate(

/* M1 = X * (C o B) */
timer_start(&timers[TIMER_MTTKRP]);
-mttkrp_csf(tensors, mats, m, thds, opts);
+mttkrp_csf(tensors, mats, m, thds, mttkrp_ws, opts);
timer_stop(&timers[TIMER_MTTKRP]);

#if 0
@@ -373,6 +376,7 @@ double cpd_als_iterate(
cpd_post_process(nfactors, nmodes, mats, lambda, thds, nthreads, rinfo);

/* CLEAN UP */
+splatt_mttkrp_free_ws(mttkrp_ws);
for(idx_t m=0; m < nmodes; ++m) {
mat_free(aTa[m]);
}

src/csf.c
@@ -5,6 +5,7 @@
#include "csf.h"
#include "sort.h"
#include "tile.h"
#include "ccp/ccp.h"

#include "io.h"

@@ -66,6 +67,39 @@ void splatt_free_csf(
* PRIVATE FUNCTIONS
*****************************************************************************/

+/**
+* @brief Count the nonzeros below a given node in a CSF tensor.
+*
+* @param fptr The adjacency pointer of the CSF tensor.
+* @param nmodes The number of modes in the tensor.
+* @param depth The depth of the node
+* @param fiber The id of the node.
+*
+* @return The nonzeros below fptr[depth][fiber].
+*/
+idx_t csf_count_nnz(
+idx_t * * fptr,
+idx_t const nmodes,
+idx_t depth,
+idx_t const fiber)
+{
+if(depth == nmodes-1) {
+return 1;
+}
+
+idx_t left = fptr[depth][fiber];
+idx_t right = fptr[depth][fiber+1];
+++depth;
+
+for(; depth < nmodes-1; ++depth) {
+left = fptr[depth][left];
+right = fptr[depth][right];
+}
+
+return right - left;
+}
+
+
/**
* @brief Find a permutation of modes that results in non-increasing mode size.
*
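To see how csf_count_nnz walks the structure, take a 3-mode tensor: for root slice s at depth 0, left and right start as fptr[0][s] and fptr[0][s+1], the range of depth-1 fibers owned by that slice, and one further hop through fptr[1] turns the fiber range into a nonzero range, so right - left is the slice's nonzero count. A toy illustration (these fptr arrays are invented for the example, not taken from the commit):

  /* Toy 3-mode CSF: two root slices, three fibers, nine nonzeros. */
  idx_t fptr0[] = { 0, 2, 3 };    /* slice s owns fibers [fptr0[s], fptr0[s+1]) */
  idx_t fptr1[] = { 0, 3, 5, 9 }; /* fiber f owns nonzeros [fptr1[f], fptr1[f+1]) */
  idx_t * fptr[] = { fptr0, fptr1 };

  csf_count_nnz(fptr, 3, 0, 0);   /* fptr1[2] - fptr1[0] == 5 */
  csf_count_nnz(fptr, 3, 0, 1);   /* fptr1[3] - fptr1[2] == 4 */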
@@ -747,3 +781,49 @@ val_t csf_frobsq(
return (val_t) norm;
}


+idx_t * csf_partition_1d(
+splatt_csf const * const csf,
+idx_t const tile_id,
+idx_t const nparts)
+{
+idx_t * parts = splatt_malloc((nparts+1) * sizeof(*parts));
+
+idx_t const nslices = csf->pt[tile_id].nfibs[0];
+idx_t * weights = splatt_malloc(nslices * sizeof(*weights));
+
+#pragma omp parallel for schedule(static)
+for(idx_t i=0; i < nslices; ++i) {
+weights[i] = csf_count_nnz(csf->pt[tile_id].fptr, csf->nmodes, 0, i);
+}
+
+partition_1d(weights, nslices, parts, nparts);
+splatt_free(weights);
+
+return parts;
+}
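The result is a standard (nparts+1)-entry prefix array: part p owns root slices [parts[p], parts[p+1]). A hedged sketch of how a caller might consume it, one part per OpenMP thread (the thread mapping and loop body are illustrative, not from this commit):

  idx_t * parts = csf_partition_1d(csf, tile_id, nthreads);
  #pragma omp parallel
  {
    int const tid = omp_get_thread_num();
    for(idx_t s = parts[tid]; s < parts[tid+1]; ++s) {
      /* ... process root slice s of tile tile_id ... */
    }
  }
  splatt_free(parts);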


+idx_t * csf_partition_tiles_1d(
+splatt_csf const * const csf,
+idx_t const nparts)
+{
+idx_t * parts = splatt_malloc((nparts+1) * sizeof(*parts));
+
+idx_t const nmodes = csf->nmodes;
+idx_t const ntiles = csf->ntiles;
+idx_t * weights = splatt_malloc(ntiles * sizeof(*weights));
+
+#pragma omp parallel for schedule(static)
+for(idx_t i=0; i < ntiles; ++i) {
+/* weight = nonzeros in tile i */
+weights[i] = csf->pt[i].nfibs[nmodes-1];
+}
+
+partition_1d(weights, ntiles, parts, nparts);
+splatt_free(weights);
+
+return parts;
+}
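Both helpers reduce to the same chains-on-chains primitive: csf_partition_1d weights each root slice of one tile by the nonzeros beneath it, while csf_partition_tiles_1d weights whole tiles by their nonzero counts, so in both cases partition_1d balances nonzeros rather than item counts.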




src/csf.h
@@ -182,4 +182,54 @@ static inline idx_t csf_depth_to_mode(



+#define csf_partition_1d splatt_csf_partition_1d
+/**
+* @brief Split the root nodes of a CSF tensor into 'nparts' partitions.
+*
+* @param csf The CSF tensor to partition.
+* @param tile_id Which tile of the CSF to partition.
+* @param nparts The number of partitions.
+*
+* @return An array of length (nparts+1) specifying the starts of each part.
+*/
+idx_t * csf_partition_1d(
+splatt_csf const * const csf,
+idx_t const tile_id,
+idx_t const nparts);


+#define csf_partition_tiles_1d splatt_csf_partition_tiles_1d
+/**
+* @brief Split the tiles of csf into 'nparts' partitions.
+* NOTE: This does not account for any mode-ordering of the tiles, and
+* instead treats them as a 1D resource.
+*
+* @param csf The tiled tensor to partition.
+* @param nparts The number of partitions to compute.
+*
+* @return An array of length (nparts+1) specifying the starts of each part.
+*/
+idx_t * csf_partition_tiles_1d(
+splatt_csf const * const csf,
+idx_t const nparts);


+#define csf_count_nnz splatt_csf_count_nnz
+/**
+* @brief Count the nonzeros below a given node in a CSF tensor.
+*
+* @param fptr The adjacency pointer of the CSF tensor.
+* @param nmodes The number of modes in the tensor.
+* @param depth The depth of the node
+* @param fiber The id of the node.
+*
+* @return The nonzeros below fptr[depth][fiber].
+*/
+idx_t csf_count_nnz(
+idx_t * * fptr,
+idx_t const nmodes,
+idx_t depth,
+idx_t const fiber);



#endif
