Skip to content

Commit

Permalink
Merge pull request #15 from Quantco/benchmark
Browse files Browse the repository at this point in the history
Performance metrics
  • Loading branch information
ElizabethSantorellaQC committed Jul 22, 2020
2 parents 2e8504a + a41c4c0 commit 33eaf43
Show file tree
Hide file tree
Showing 10 changed files with 403 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Project-specific
benchmark_output/
benchmark/*.csv

# Files created by templating
dense.cpp
Expand Down
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,25 @@ for k in range(n_rows):
res[X.indices[k], j] += d[k] * Y[k, j]
```
This is `ext/split/sandwich_cat_dense`


## Performance
Dense matrix, 100k x 1k:

![dense_bm](benchmark/dense_times.png)

One-hot encoded categorical variable, 1M x 100k:

![cat_bm](benchmark/one_cat_times.png)

Sparse matrix, 1M x 1k:

![sparse_bm](benchmark/sparse_times.png)

Two categorical matrices, 1M x 2k:

![two_cat_bm](benchmark/two_cat_times.png)

Two categorical matrices plus a dense matrix, 1M x 2,005 (2k one-hot columns plus 5 dense columns):

![two_cat_plus_dense_bm](benchmark/dense_cat_times.png)
Binary file added benchmark/dense_cat_times.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added benchmark/dense_times.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added benchmark/one_cat_times.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
165 changes: 165 additions & 0 deletions benchmark/run_benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
import functools
import time
import tracemalloc
from typing import Union

import numpy as np
import pandas as pd
from scipy import sparse as sps

import quantcore.matrix as mx


def track_peak_mem(f):
    """Decorate ``f`` so that calling it returns its peak traced memory.

    The wrapper starts ``tracemalloc``, calls ``f`` with the given arguments,
    and returns the peak size reported by ``tracemalloc.get_traced_memory()``.
    The return value of ``f`` itself is discarded.

    Tracing is always stopped before the wrapper exits, including when ``f``
    raises; otherwise a failed call would leave tracing switched on and skew
    every later measurement in the same process.
    """

    @functools.wraps(f)
    def g(*args, **kwargs):
        tracemalloc.start()
        try:
            f(*args, **kwargs)
            _, peak = tracemalloc.get_traced_memory()
        finally:
            # Runs on both the success and the error path.
            tracemalloc.stop()
        return peak

    return g


@track_peak_mem
def sandwich(mat: Union[mx.MatrixBase, np.ndarray, sps.csc_matrix], vec: np.ndarray):
    """Run the sandwich product ``mat.T @ diag(vec) @ mat`` for benchmarking.

    The product itself is thrown away; because of the ``track_peak_mem``
    decorator, callers receive the peak traced memory of the computation.

    Dispatch by storage type:

    * ``mx.MatrixBase``: delegate to the matrix's own ``sandwich`` method.
    * ``np.ndarray``: scale the rows by ``vec`` and multiply.
    * anything else: treated as a scipy sparse matrix.
    """
    if isinstance(mat, mx.MatrixBase):
        mat.sandwich(vec)
        return

    if isinstance(mat, np.ndarray):
        # Kept as a single expression so the temporary's lifetime (and thus
        # the measured peak) is not affected by extra local references.
        (mat * vec[:, None]).T @ mat
        return

    mat.T @ sps.diags(vec) @ mat


@track_peak_mem
def transpose_dot(
    mat: Union[mx.MatrixBase, np.ndarray, sps.csc_matrix], vec: np.ndarray
):
    """Run the transposed matrix-vector product ``mat.T @ vec`` for benchmarking.

    ``mx.MatrixBase`` instances use their own ``transpose_dot`` method; every
    other input goes through the ``.T @`` operator. The ``track_peak_mem``
    decorator replaces the product with the peak traced memory.
    """
    return (
        mat.transpose_dot(vec) if isinstance(mat, mx.MatrixBase) else mat.T @ vec
    )


@track_peak_mem
def dot(mat, vec):
    """Run the matrix-vector product ``mat.dot(vec)`` for benchmarking.

    The ``track_peak_mem`` decorator replaces the product with the peak
    traced memory of the call.
    """
    product = mat.dot(vec)
    return product


def run_benchmarks(matrices: dict) -> pd.DataFrame:
    """Time and memory-profile three operations on every matrix in ``matrices``.

    Parameters
    ----------
    matrices
        Non-empty mapping from a storage label to a matrix object. The shape
        of the first value determines the length of the random vectors, so
        all values are expected to share that shape.

    Returns
    -------
    pd.DataFrame
        One row per (operation, storage) pair with the columns ``operation``,
        ``storage``, ``memory`` (peak traced memory returned by the benchmark
        function) and ``time`` (elapsed seconds). Rows are ordered by
        operation first ("matrix-vector", "sandwich",
        "matrix-transpose-vector"), then by the iteration order of
        ``matrices``.

    Raises
    ------
    TypeError
        If ``matrices`` is not a dict.
    ValueError
        If ``matrices`` is empty.

    Notes
    -----
    The benchmark functions run under ``tracemalloc`` (via
    ``track_peak_mem``), so the recorded times include tracing overhead.
    """
    if not isinstance(matrices, dict):
        raise TypeError(
            f"matrices must be a dict, got {type(matrices).__name__}"
        )
    if not matrices:
        raise ValueError("matrices must contain at least one matrix")

    first = next(iter(matrices.values()))
    vec = np.random.random(first.shape[1])
    vec2 = np.random.random(first.shape[0])

    # Operation label -> (benchmark function, vector operand).
    benchmarks = {
        "matrix-vector": (dot, vec),
        "sandwich": (sandwich, vec2),
        "matrix-transpose-vector": (transpose_dot, vec2),
    }

    # Results are collected as plain records and turned into a frame once.
    # Writing via ``times["time"].iloc[i] = ...`` is chained assignment, which
    # pandas does not guarantee to propagate to the parent frame.
    records = []
    for operation, (bench, operand) in benchmarks.items():
        for storage, mat_ in matrices.items():
            # perf_counter is monotonic and intended for interval timing.
            start = time.perf_counter()
            peak_mem = bench(mat_, operand)
            elapsed = time.perf_counter() - start
            records.append(
                {
                    "operation": operation,
                    "storage": storage,
                    "memory": peak_mem,
                    "time": elapsed,
                }
            )

    return pd.DataFrame(
        records, columns=["operation", "storage", "memory", "time"]
    )


def make_dense_matrices(n_rows: int, n_cols: int) -> dict:
    """Build one random dense matrix in three storage variants.

    Returns a dict with a C-ordered numpy array (``"numpy_C"``), a
    Fortran-ordered copy of it (``"numpy_F"``), and an ``mx.DenseMatrix``
    built from the C-ordered array (``"quantcore.matrix"``).
    """
    c_ordered = np.random.random((n_rows, n_cols))
    f_ordered = c_ordered.copy(order="F")
    assert f_ordered.flags["F_CONTIGUOUS"]
    return {
        "numpy_C": c_ordered,
        "numpy_F": f_ordered,
        "quantcore.matrix": mx.DenseMatrix(c_ordered),
    }


def make_cat_matrix(n_rows: int, n_cats: int) -> mx.CategoricalMatrix:
    """Build an ``mx.CategoricalMatrix`` from ``n_rows`` random category codes.

    Codes are sampled with ``np.random.choice`` from ``0 .. n_cats - 1``.
    """
    categories = np.arange(n_cats, dtype=int)
    codes = np.random.choice(categories, n_rows)
    return mx.CategoricalMatrix(codes)


def make_cat_matrix_all_formats(n_rows: int, n_cats: int) -> dict:
    """Build one random categorical matrix in three storage variants.

    Returns a dict holding the ``mx.CategoricalMatrix`` itself
    (``"quantcore.matrix"``), the result of its ``tocsr()``
    (``"scipy.sparse csr"``), and that result converted with ``tocsc()``
    (``"scipy.sparse csc"``).
    """
    categorical = make_cat_matrix(n_rows, n_cats)
    as_csr = categorical.tocsr()
    return {
        "quantcore.matrix": categorical,
        "scipy.sparse csr": as_csr,
        "scipy.sparse csc": as_csr.tocsc(),
    }


def make_cat_matrices(n_rows: int, n_cat_cols_1: int, n_cat_cols_2: int) -> dict:
    """Build two random categorical matrices, side by side, in three variants.

    Returns a dict with an ``mx.SplitMatrix`` of the two categorical matrices
    (``"quantcore.matrix"``), the horizontal stack of their ``tocsr()``
    results (``"scipy.sparse csr"``), and that stack converted with
    ``tocsc()`` (``"scipy.sparse csc"``).
    """
    two_cat_matrices = {
        "quantcore.matrix": mx.SplitMatrix(
            [
                make_cat_matrix(n_rows, n_cat_cols_1),
                make_cat_matrix(n_rows, n_cat_cols_2),
            ]
        )
    }
    # ``format="csr"`` is required: without it ``sps.hstack`` picks the result
    # format itself, so the entry labelled "csr" is not guaranteed to be CSR.
    two_cat_matrices["scipy.sparse csr"] = sps.hstack(
        [elt.tocsr() for elt in two_cat_matrices["quantcore.matrix"].matrices],
        format="csr",
    )
    two_cat_matrices["scipy.sparse csc"] = two_cat_matrices["scipy.sparse csr"].tocsc()
    return two_cat_matrices


def make_dense_cat_matrices(
    n_rows: int, n_dense_cols: int, n_cats_1: int, n_cats_2: int
) -> dict:
    """Build two random categorical matrices plus a dense block in three variants.

    Returns a dict with an ``mx.SplitMatrix`` of the two categorical matrices
    and an ``mx.DenseMatrix`` (``"quantcore.matrix"``), the horizontal stack
    of the same pieces as scipy sparse matrices (``"scipy.sparse csr"``), and
    that stack converted with ``tocsc()`` (``"scipy.sparse csc"``).
    """
    dense_block = np.random.random((n_rows, n_dense_cols))
    two_cat_matrices = [
        make_cat_matrix(n_rows, n_cats_1),
        make_cat_matrix(n_rows, n_cats_2),
    ]
    dense_cat_matrices = {
        "quantcore.matrix": mx.SplitMatrix(
            two_cat_matrices + [mx.DenseMatrix(dense_block)]
        ),
        # ``format="csr"`` is required: without it ``sps.hstack`` picks the
        # result format itself, so the entry labelled "csr" is not
        # guaranteed to be CSR.
        "scipy.sparse csr": sps.hstack(
            [elt.tocsr() for elt in two_cat_matrices] + [sps.csr_matrix(dense_block)],
            format="csr",
        ),
    }
    dense_cat_matrices["scipy.sparse csc"] = dense_cat_matrices[
        "scipy.sparse csr"
    ].tocsc()
    return dense_cat_matrices


def make_sparse_matrices(n_rows: int, n_cols: int) -> dict:
    """Build one random sparse matrix in three storage variants.

    The matrix comes from ``sps.random`` with its default density and is
    returned as CSC (``"scipy.sparse csc"``), CSR (``"scipy.sparse csr"``),
    and an ``mx.SparseMatrix`` built from the CSC matrix
    (``"quantcore.matrix"``).
    """
    as_csc = sps.random(n_rows, n_cols).tocsc()
    as_csr = as_csc.tocsr()
    wrapped = mx.SparseMatrix(as_csc)
    return {
        "scipy.sparse csc": as_csc,
        "scipy.sparse csr": as_csr,
        "quantcore.matrix": wrapped,
    }


def main():
    """Run every benchmark scenario and write one CSV of results per scenario.

    Matrices are built lazily, one scenario at a time, immediately before
    they are benchmarked. Results go to ``benchmark/<scenario>_times.csv``,
    relative to the current working directory.
    """
    n_rows = int(1e6)
    # (scenario name, zero-argument factory for that scenario's matrices)
    scenarios = [
        ("dense", lambda: make_dense_matrices(int(1e5), 1000)),
        ("one_cat", lambda: make_cat_matrix_all_formats(n_rows, int(1e5))),
        ("sparse", lambda: make_sparse_matrices(n_rows, int(1e3))),
        ("two_cat", lambda: make_cat_matrices(n_rows, int(1e3), int(1e3))),
        (
            "dense_cat",
            lambda: make_dense_cat_matrices(n_rows, 5, int(1e3), int(1e3)),
        ),
    ]
    for name, build in scenarios:
        results = run_benchmarks(build())
        results.to_csv(f"benchmark/{name}_times.csv", index=False)


if __name__ == "__main__":
    main()
Binary file added benchmark/sparse_times.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added benchmark/two_cat_times.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
213 changes: 213 additions & 0 deletions benchmark/visualize_benchmarks.ipynb

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ dependencies:
- setuptools_scm
- sparse_dot_mkl>=0.4.0
- xsimd
- matplotlib
- seaborn

0 comments on commit 33eaf43

Please sign in to comment.