Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 72 additions & 38 deletions examples/cuda/ta_dense_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,23 +137,31 @@ template <typename Storage>
void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
const long Nn, const long Bn, const long Nk, const long Bk,
const long nrepeat) {
using Real = typename Storage::value_type;
using T = TiledArray::detail::numeric_t<Storage>;
using RT = TiledArray::detail::scalar_t<Storage>;
constexpr auto complex_T = TiledArray::detail::is_complex_v<T>;

const std::size_t Tm = Nm / Bm;
const std::size_t Tn = Nn / Bn;
const std::size_t Tk = Nk / Bk;

const std::int64_t nflops =
(complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real
// 1 add takes 2/1 flops for complex/real
* static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nm) *
static_cast<std::int64_t>(Nk);

if (world.rank() == 0)
std::cout << "TiledArray: dense matrix multiply test...\n"
<< "Number of nodes = " << world.size()
<< "\nSize of A = " << Nm << "x" << Nk << " ("
<< double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)"
<< double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)"
<< "\nSize of A block = " << Bm << "x" << Bk
<< "\nSize of B = " << Nk << "x" << Nn << " ("
<< double(Nk * Nn * sizeof(double)) / 1.0e9 << " GB)"
<< double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)"
<< "\nSize of B block = " << Bk << "x" << Bn
<< "\nSize of C = " << Nm << "x" << Nn << " ("
<< double(Nm * Nn * sizeof(double)) / 1.0e9 << " GB)"
<< double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)"
<< "\nSize of C block = " << Bm << "x" << Bn
<< "\n# of blocks of C = " << Tm * Tn
<< "\nAverage # of blocks of C/node = "
Expand Down Expand Up @@ -205,14 +213,13 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
TiledArray::TiledRange // TRange for b
trange_b(blocking_B.begin(), blocking_B.end());

using value_type = typename Storage::value_type;
using CUDATile = btas::Tensor<Real, TA::Range, Storage>;
using CUDATile = btas::Tensor<T, TA::Range, Storage>;
using CUDAMatrix = TA::DistArray<TA::Tile<CUDATile>>;
using TAMatrix = TA::DistArray<TA::Tensor<value_type>>;
using TAMatrix = TA::DistArray<TA::Tensor<T>>;

CUDAMatrix c(world, trange_c);
value_type val_a = 0.03;
value_type val_b = 0.02;
auto val_a = 0.03;
auto val_b = 0.02;

{
// Construct and initialize arrays
Expand All @@ -235,19 +242,26 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
// start profiler
cudaProfilerStart();

// Start clock
const double wall_time_start = madness::wall_time();
double total_time = 0.0;
double total_gflop_rate = 0.0;

// Do matrix multiplication
for (int i = 0; i < nrepeat; ++i) {
double iter_time_start = madness::wall_time();
// c("m,n") = a("m,k") * b("k,n") + a("m,n") - b("m,n");
c("m,n") = a("m,k") * b("k,n");
c.world().gop.fence(); // fence since GEMM can return early
double iter_time_stop = madness::wall_time();
const double iter_time = iter_time_stop - iter_time_start;
total_time += iter_time;
const double gflop_rate = double(nflops) / (iter_time * 1.e9);
total_gflop_rate += gflop_rate;
if (world.rank() == 0)
std::cout << "Iteration " << i + 1
<< " wall time: " << (iter_time_stop - iter_time_start)
std::cout << "Iteration " << i + 1 << " wall time: " << iter_time
<< "\n";
if (world.rank() == 0)
std::cout << "Iteration " << i + 1 << " time=" << time
<< " GFLOPS=" << gflop_rate << "\n";
}
// Stop clock
const double wall_time_stop = madness::wall_time();
Expand All @@ -256,32 +270,43 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
cudaProfilerStop();

if (world.rank() == 0)
std::cout << "Average wall time = "
<< (wall_time_stop - wall_time_start) / double(nrepeat)
std::cout << "Average wall time = " << total_time / double(nrepeat)
<< " sec\nAverage GFLOPS = "
<< double(nrepeat) * 2.0 * double(Nn * Nm * Nm) /
(wall_time_stop - wall_time_start) / 1.0e9
<< "\n";
<< total_gflop_rate / double(nrepeat) << "\n";
}

double threshold =
std::numeric_limits<typename Storage::value_type>::epsilon();
double threshold = std::numeric_limits<RT>::epsilon();
auto dot_length = Nk;
// auto result = dot_length * val_a * val_b + val_a - val_b;
auto result = dot_length * val_a * val_b;
T result;
if constexpr (complex_T) {
result = T(dot_length * val_a * val_b, 0.);
} else
result = dot_length * val_a * val_b;

auto verify = [&world, &threshold, &result,
&dot_length](TA::Tile<CUDATile> &tile) {
auto n_elements = tile.size();
for (std::size_t i = 0; i < n_elements; i++) {
double abs_err = fabs(tile[i] - result);
double abs_err = std::abs(tile[i] - result);
// double abs_val = fabs(tile[i]);
double rel_err = abs_err / result / dot_length;
double rel_err = abs_err / std::abs(result) / dot_length;
if (rel_err > threshold) {
auto to_string = [](const auto &v) {
constexpr bool complex_T =
TiledArray::detail::is_complex_v<std::decay_t<decltype(v)>>;
if constexpr (complex_T) {
std::string result;
result = "{" + std::to_string(v.real()) + "," +
std::to_string(v.imag()) + "}";
return result;
} else
return std::to_string(v);
};
std::cout << "Node: " << world.rank() << " Tile: " << tile.range()
<< " id: " << i
<< std::string(" gpu: " + std::to_string(tile[i]) +
" cpu: " + std::to_string(result) + "\n");
<< std::string(" gpu: " + to_string(tile[i]) +
" cpu: " + to_string(result) + "\n");
break;
}
}
Expand All @@ -308,7 +333,7 @@ int try_main(int argc, char **argv) {
"blocked by Bm, Bn, and Bk, respectively"
<< std::endl
<< "Usage: " << argv[0]
<< " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [real = double] "
<< " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] "
"[storage type = cuda_um_btas_varray]\n";
return 0;
}
Expand Down Expand Up @@ -337,13 +362,13 @@ int try_main(int argc, char **argv) {
return 1;
}

const auto real_type_str =
(argc >= 9) ? std::string(argv[8]) : std::string("double");

if (real_type_str != "float" && real_type_str != "double") {
std::cerr << "Error: invalid real type: " << real_type_str
<< "\n Valid option includes: float or "
"double. \n";
const std::string scalar_type_str = (argc >= 9 ? argv[8] : "double");
if (scalar_type_str != "double" && scalar_type_str != "float" &&
scalar_type_str != "zdouble" && scalar_type_str != "zfloat") {
std::cerr << "Error: invalid real type " << scalar_type_str << ".\n";
std::cerr << " valid real types are \"double\", \"float\", "
"\"zdouble\", and \"zfloat\".\n";
return 1;
}

const auto storage_type =
Expand All @@ -357,7 +382,7 @@ int try_main(int argc, char **argv) {
"cuda_um_btas_varray or cuda_um_thrust_vector "
"or cpu_cuda_vector. \n";
}
std::cout << "Storage type: " << storage_type << "<" << real_type_str << ">"
std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">"
<< std::endl;
// auto to_bool = [](const std::string &str) {
// return (str == "true" || str == "True" || str == "TRUE" || str == "1" ||
Expand Down Expand Up @@ -424,7 +449,7 @@ int try_main(int argc, char **argv) {
} // print device properties

// if (storage_type == "cpu_cuda_vector") {
// if (real_type_str == "double")
// if (scalar_type_str == "double")
// do_main_body<TiledArray::cpu_cuda_vector<double>>(world, Nm, Bm, Nn,
// Bn,
// Nk, Bk, nrepeat);
Expand All @@ -434,15 +459,24 @@ int try_main(int argc, char **argv) {
// Nk, Bk, nrepeat);
// } else if (storage_type == "cuda_um_btas_varray") {
if (storage_type == "cuda_um_btas_varray") {
if (real_type_str == "double")
if (scalar_type_str == "double")
do_main_body<TiledArray::cuda_um_btas_varray<double>>(
world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
else
else if (scalar_type_str == "float")
do_main_body<TiledArray::cuda_um_btas_varray<float>>(world, Nm, Bm, Nn,
Bn, Nk, Bk, nrepeat);
else if (scalar_type_str == "zdouble")
do_main_body<TiledArray::cuda_um_btas_varray<std::complex<double>>>(
world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
else if (scalar_type_str == "zfloat")
do_main_body<TiledArray::cuda_um_btas_varray<std::complex<float>>>(
world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
else {
abort(); // unreachable
}
}
// else if (storage_type == "cuda_um_thrust_vector") {
// if (real_type_str == "double")
// if (scalar_type_str == "double")
// do_main_body<TiledArray::cuda_um_thrust_vector<double>>(
// world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
// else
Expand Down
8 changes: 5 additions & 3 deletions examples/demo/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

# Create the ta_fock_build executable

# Add the demo executable
# Standard TA demo to accompany the keynote slides
add_ta_executable(demo "demo.cpp" "tiledarray")
add_dependencies(examples-tiledarray demo)

# TA demo snippets for the paper
add_ta_executable(demo2 "demo2.cpp" "tiledarray")
add_dependencies(examples-tiledarray demo2)
3 changes: 1 addition & 2 deletions examples/demo/demo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ auto make_tile(const TA::Range &range) {
int main(int argc, char *argv[]) {
using namespace std;

std::srand(2017);
TA::srand(2017);
TA::World &world = TA::initialize(argc, argv);

using namespace TA;
Expand Down Expand Up @@ -88,7 +88,6 @@ int main(int argc, char *argv[]) {
SparseShape<float> shape(shape_tensor, TR);
TSpArrayD a1(world, TR, shape);
a1.fill_random(); // for deterministic fill:
// TA::srand(seed);
// a1.fill_random<HostExecutor::Thread>();
cout << "a1:\n" << a1 << endl;
world.gop.fence();
Expand Down
115 changes: 115 additions & 0 deletions examples/demo/demo2.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* This file is a part of TiledArray.
* Copyright (C) 2023 Virginia Tech
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/

#ifndef EXAMPLES_DEMO_DEMO2_CPP_
#define EXAMPLES_DEMO_DEMO2_CPP_

#include <tiledarray.h>
#include <random>

#include <TiledArray/expressions/einsum.h>
#include <TiledArray/math/solvers/cp.h>

int main(int argc, char* argv[]) {
using namespace std;

TA::srand(2017);
TA::World& world = TA::initialize(argc, argv);

using namespace TA;

// requires compiler new enough to support unicode characters in variable
// names
#ifndef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES
#ifdef TILEDARRAY_CXX_COMPILER_IS_GCC
#if __GNUC__ >= 10
#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1
#endif
#elif !defined(TILEDARRAY_CXX_COMPILER_IS_ICC)
#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1
#endif
#endif // !defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES)

#ifdef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES

// $\rho \equiv \mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
Range ρ{{1, 11}, {-1, 9}};
// lower bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
assert((ρ.lobound() == Index{1, -1}));
// upper bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
assert((ρ.upbound() == Index{11, 9}));
// extent of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
assert((ρ.extent() == Index{10, 10}));
// the number of elements in $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
assert(ρ.volume() == 100);
// row-major order
assert((ρ.stride() == Index{10, 1}));
assert((ρ.ordinal({1, -1}) == 0));
assert((ρ.ordinal({1, 0}) == 1));
assert((ρ.ordinal({10, 8}) + 1 == ρ.volume()));
// prints "[1,-1] [1,0] .. [1,8] [2,-1] .. [10,8] "
for (auto&& idx : ρ) cout << idx << " ";

// $\mathbb{Z}_{1,11}$ tiled into $\mathbb{Z}_{1,5}$, $\mathbb{Z}_{5,8}$, and
// $\mathbb{Z}_{8,11}$
TiledRange1 τ0{1, 5, 8, 11}; // hashmarks
assert(τ0.extent() == 10); // there are 10 elements in τ0
assert((τ0.elements_range() ==
Range1{1, 11})); // elements indexed by $\mathbb{Z}_{1,11}$
assert(τ0.tile_extent() == 3); // there are 3 tiles in τ0
assert((τ0.tiles_range() ==
Range1{0, 3})); // tiles indexed by $\mathbb{Z}_{0,3}$
assert((τ0.tile(1) == Range1{5, 8})); // 1st tile of τ0 is $\mathbb{Z}_{5,8}$

// $\mathbb{Z}_{-1,9}$ tiled into $\mathbb{Z}_{-1,5}$ and $\mathbb{Z}_{5,9}$
TiledRange1 τ1{-1, 5, 9};

// 2nd tile of $\code{tau0}$ is $\mathbb{Z}_{5,8}$
assert((τ0.tile(1) == Range1{5, 8}));
// 1st tile of $\code{tau1}$ is $\mathbb{Z}_{-1,5}$
assert((τ1.tile(0) == Range1{-1, 5}));

// prints "-1 0 1 2 3 4 "
for (auto&& i : τ1.tile(0)) cout << i << " ";

// tiling of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ by tensor product
// of
// $\code{τ0}$ and $\code{τ1}$
TiledRange τ{τ0, τ1};
// shortcut
TiledRange same_as_τ{{1, 5, 8, 11}, {-1, 5, 9}};

// tile index {0,0} refers to tile $\mathbb{Z}_{1,5} \otimes
// \mathbb{Z}_{-1,5}$
auto tile_0_0 = τ.tile({0, 0});
assert((tile_0_0 == Range{{1, 5}, {-1, 5}}));

// default instance of $\code{DistArray}$ is a dense array of $\code{double}$s
DistArray array0(world, τ);
array0.fill(1.0); // fill $\code{array0}$ with 1's

// grab a tile NB this returns a ${\bf future}$ to a tile; see Section 3.2.
auto t00 = array0.find({0, 0});

#endif // defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES)

return 0;
}

#endif /* EXAMPLES_DEMO_DEMO2_CPP_ */
Loading