In [1]:
using LinearAlgebra
using BenchmarkTools
using Random

# Set the number of threads used by BLAS to 8.
# NOTE(review): the value is hard-coded — confirm it suits the host machine.
BLAS.set_num_threads(8)

# Views (on the output storage) still call BLAS!!

In [2]:
# Seed the global RNG before drawing the random operands A and B.
Random.seed!(2020)
A, B = rand(5000, 5000), rand(5000, 5000)
# C is 5001×5001: one row and one column larger than the 5000×5000 result D.
C = zeros(Float64, 5001, 5001)
D = zeros(Float64, 5000, 5000);

In [3]:
# Check correctness: multiplying into a 5000×5000 view of the larger matrix C
# must give exactly the same entries as multiplying into the plain matrix D.
let Cblock = view(C, 1:5000, 1:5000)
    mul!(D, A, B)
    mul!(Cblock, A, B)
    # Array `==` compares element-wise without materializing the temporary
    # 5000×5000 BitMatrix that `all(x .== y)` would allocate.
    Cblock == D
end

true

In [4]:
# Baseline: time the in-place product of A and B written into the matrix D.
# The globals are passed into the benchmark with `$` interpolation.
@btime mul!($D, $A, $B);

  856.525 ms (0 allocations: 0 bytes)


In [5]:
# Same product, but the destination E is a view of rows/columns 1:5000 of C,
# created in the `setup` expression rather than in the benchmarked call.
@btime mul!(E, $A, $B) setup=(E = view(C, 1:5000, 1:5000));

  877.694 ms (0 allocations: 0 bytes)


In [6]:
# Call the BLAS gemm! wrapper directly on the dense destination D:
# 'N', 'N' = neither A nor B transposed, alpha = 1.0, beta = 0.0.
@btime BLAS.gemm!('N', 'N', 1.0, $A, $B, 0.0, $D);

  869.369 ms (0 allocations: 0 bytes)


In [7]:
# Direct BLAS gemm! call again, this time with the destination E being a view
# of rows/columns 1:5000 of C (built in the `setup` expression).
@btime BLAS.gemm!('N', 'N', 1.0, $A, $B, 0.0, E) setup=(E = view(C, 1:5000, 1:5000));

  926.243 ms (0 allocations: 0 bytes)


# Viewing a small chunk of a big matrix also works

In [8]:
# Seed the global RNG before drawing the 1000×1000 operands A and B.
Random.seed!(2020)
A, B = rand(1000, 1000), rand(1000, 1000)
# C (10000×10000) is much larger than the 1000×1000 product stored in D.
C = zeros(Float64, 10000, 10000)
D = zeros(Float64, 1000, 1000);

In [9]:
# Check correctness: multiplying into a 1000×1000 view of the big matrix C
# must give exactly the same entries as multiplying into the plain matrix D.
let Cblock = view(C, 1:1000, 1:1000)
    mul!(D, A, B)
    mul!(Cblock, A, B)
    # Array `==` compares element-wise without materializing the temporary
    # BitMatrix that `all(x .== y)` would allocate.
    Cblock == D
end

true

In [10]:
# Baseline: time the in-place product of A and B written into the matrix D.
@btime mul!($D, $A, $B);

  5.979 ms (0 allocations: 0 bytes)


In [11]:
# Same product, but the destination E is a view of rows/columns 1:1000 of C,
# created in the `setup` expression.
@btime mul!(E, $A, $B) setup=(E = view(C, 1:1000, 1:1000));

  6.782 ms (0 allocations: 0 bytes)


# Views on 2 or more matrices also work

In [2]:
# Seed the global RNG, then draw the dense operands and allocate the dense destination.
Random.seed!(2020)
A, B = rand(1000, 1000), rand(1000, 1000)
C = zeros(Float64, 1000, 1000)

# Two large 10000×10000 random matrices, drawn after A and B in this order.
Av, Cv = rand(10000, 10000), rand(10000, 10000);

In [3]:
# Baseline: time the in-place product of A and B written into the matrix C.
@btime mul!($C, $A, $B);

  6.176 ms (0 allocations: 0 bytes)


In [5]:
# Both the destination E and the left operand F are views of rows/columns
# 1:1000 of Cv and Av respectively; only B is passed as a plain matrix.
@btime mul!(E, F, $B) setup=(E=view(Cv, 1:1000, 1:1000);F=view(Av, 1:1000, 1:1000));

  6.247 ms (0 allocations: 0 bytes)


# Nested views are fine too

In [3]:
# Seed the global RNG before drawing the 1000×1000 operands A and B.
Random.seed!(2020)
A, B = rand(1000, 1000), rand(1000, 1000)
# C is 1002×1002, two rows and two columns larger than the 1000×1000 result D.
C = zeros(Float64, 1002, 1002)
D = zeros(Float64, 1000, 1000);

In [4]:
# Baseline: time the in-place product of A and B written into the matrix D.
@btime mul!($D, $A, $B);

  5.902 ms (0 allocations: 0 bytes)


In [5]:
# Cv is itself a view (rows/columns 1:1001) of C; the benchmarked destination E
# is a view of that view, created in the `setup` expression.
Cv = view(C, 1:1001, 1:1001)
@btime mul!(E, $A, $B) setup=(E=view(Cv, 1:1000, 1:1000));

  6.395 ms (0 allocations: 0 bytes)
