In [1]:
using CUDA, LinearAlgebra, CUDA.CUSPARSE, CUDA.CUBLAS, SparseArrays, BenchmarkTools

In [2]:
function proj_CPU(p₀, u, β)
    return  p₀ .- ((dot(u, p₀)- β)/dot(u, u)).*u
end

proj_CPU (generic function with 1 method)

In [3]:
n = Int32(2^20)
X = CUDA.rand(n)
Y = CUDA.rand(n)
x = Array(X)
y = Array(Y)
β = Float32(1.0)


1.0f0

In [4]:
@benchmark proj_CPU(x, y, β) #chama a função com vetores na CPU

BenchmarkTools.Trial: 3768 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.092 ms[22m[39m … [35m  3.270 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 27.78%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.113 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m1.312 ms[22m[39m ± [32m513.154 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m7.14% ± 11.89%

  [39m█[34m▅[39m[39m▂[39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m▁[39m [39m▂[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m [39m 
  [39m█[34m█[39m[39m█[39m▇[39m▄[3

In [5]:
@benchmark proj_CPU(X,Y, β) #Chama a função com vetores na GPU
#Neste caso não vale a pena chamar o CUBLAS.dot() pois algumas funções de algebra inear são mais rápidas como o dot


BenchmarkTools.Trial: 10000 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m334.490 μs[22m[39m … [35m  8.941 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m393.567 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m428.869 μs[22m[39m ± [32m480.226 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.79% ± 3.05%

  [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▅[39m▇[39m█[34m█[39m[39m▆[39m▁[39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▂
  [39m▃[39m▁[39m▁[3

In [6]:
#using Cthulhu #Este pacote ajuda a entender os erros nas funções em GPU
#@device_code_warntype interactive=true @cuda proj_GPU(X,Y, β, n)

In [7]:
#CUDA.reclaim() #LIMPA A MEMORIA DA GPU
#CUDA.memory_status()  #DIZ A QUANTIDADE DE MEMÓRIA LIVRE NA GPU

In [8]:
function reflexao(p₀, u, β)
    return  2 .*proj_CPU(p₀, u, β) .- p₀
end

reflexao (generic function with 1 method)

In [9]:
@benchmark reflexao(x, y, β) #chama a função com vetores na CPU

BenchmarkTools.Trial: 1982 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m1.885 ms[22m[39m … [35m  5.324 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m 0.00% … 25.13%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m1.941 ms               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m 0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m2.503 ms[22m[39m ± [32m939.398 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m11.13% ± 15.45%

  [39m█[34m▇[39m[39m▄[39m▁[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [32m [39m[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▃[39m▃[39m [39m [39m [39m [39m [39m [39m▃[39m▂[39m▁[39m [39m [39m [39m [39m [39m▁[39m▄[39m▂[39m [39m 
  [39m█[34m█[39m[39m█[39m█[39m▇

In [10]:
@benchmark reflexao(X,Y, β) #Chama a função com vetores na GPU

BenchmarkTools.Trial: 7812 samples with 1 evaluation.
 Range [90m([39m[36m[1mmin[22m[39m … [35mmax[39m[90m):  [39m[36m[1m347.705 μs[22m[39m … [35m  9.101 ms[39m  [90m┊[39m GC [90m([39mmin … max[90m): [39m0.00% … 0.00%
 Time  [90m([39m[34m[1mmedian[22m[39m[90m):     [39m[34m[1m573.795 μs               [22m[39m[90m┊[39m GC [90m([39mmedian[90m):    [39m0.00%
 Time  [90m([39m[32m[1mmean[22m[39m ± [32mσ[39m[90m):   [39m[32m[1m637.487 μs[22m[39m ± [32m672.807 μs[39m  [90m┊[39m GC [90m([39mmean ± σ[90m):  [39m0.98% ± 3.82%

  [39m [39m [39m [39m [39m▃[39m▂[39m [39m [39m▇[34m█[39m[39m [39m [32m [39m[39m▄[39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m [39m▁
  [39m▃[39m▁[39m▇[39

In [11]:
function reflexao_simultanea(xₖ, A, b, n, r)
    rₖ = CUDA.zeros(r)
      for i=1:n
         rₖ .+= reflexao(xₖ, A[i,:], b[i])
    end
    return rₖ./n
end

reflexao_simultanea (generic function with 1 method)

In [12]:
#rₖ = Vector{Float32}(undef, 1_000) #Aloca uma memória para um vetor de tamanho 1_000 na GPU

In [13]:
#CuArray{Int}(undef, 2) - cria um array em pé de 2 entradas

In [14]:
#CuArray{Int}(undef, (1,2))- cria um array deitado de 2 entradas

In [15]:
#fill!(rₖ, 0.) #Atribui a memória alocada um vetor cujas etradas são todas nulas

In [16]:
#@sync - pausa as tarefas da CPU até as tarefas da GPU dentro do Bloco serem concluídas

In [17]:
#@btime nome da função - mede o tempo como benchmarktools

In [18]:
#@cuprintln("thread $index, block $stride") - imprime

In [19]:
#synchronize() - sincroniza a GPU, necessário usar com o @cuprint()

In [20]:
#broadcast - Faz operções com elementos que não possuem a mesma dimensão como somar um vetor 
#nas colunas de uma matriz por exemplo, com strings faz concatenação

In [21]:
#map(f, c) -> coleção -Transformar a colecção c através da aplicação de f a cada elemento. Para múltiplos
#argumentos de recolha, aplicar f elemento a elemento. Ex: aplica uma função nas entradas de um vetor,
#opera com vetores de mesmo tamanho.

In [22]:
#a = reshape(Vector(1:16), (4,4)) # cria o vetor de 1 até 16 depois transforma numa matriz 4x4 por colunas

#reduce(max, a, dims=2) -verifica qual o vetor de maior valores na matriz e devolve somente este vetor 
#em forma de matriz em pé

#reduce(max, a, dims=1)- verifica qual o vetor de maior valores na matriz e devolve somente este vetor 
#em forma de matriz deitada

In [23]:
#reduce(*, [2; 3; 4]) devolve a multiplicação das entradas do vetor considerando o elemento neutro da 
#multiplicação como sendo 1

#reduce(*, [2; 3; 4]; init=-1)devolve a multiplicação das entradas do vetor considerando o elemento neutro da 
#multiplicação como sendo -1

In [24]:
#tamanho = length(a)/1024 - em que 1024 é o numero de threads
#@cuda threads=length(a)/tamanho função(a)  - Faz a divisão para as threads

In [25]:
#a = CuArray([1,2]) - array na GPU
#b = Array(a) - array na CPU
#copyto!(b, a) - aloca b na GPU no lugar de a

In [26]:
#SparseArrays em cuda
A1 = sprand(10,10,0.2) #cria matriz espasa 10x10 com esparcidade 0.8 e distribuição normal

10×10 SparseMatrixCSC{Float64, Int64} with 17 stored entries:
  ⋅         ⋅          ⋅         ⋅   …   ⋅       0.98923   ⋅        ⋅ 
  ⋅         ⋅         0.853237   ⋅       ⋅        ⋅        ⋅        ⋅ 
  ⋅         ⋅          ⋅         ⋅       ⋅        ⋅       0.23843   ⋅ 
  ⋅        0.615586    ⋅         ⋅       ⋅        ⋅        ⋅        ⋅ 
 0.942506  0.0956749  0.903835   ⋅       ⋅        ⋅        ⋅        ⋅ 
  ⋅        0.812322    ⋅         ⋅   …   ⋅        ⋅        ⋅        ⋅ 
  ⋅         ⋅          ⋅         ⋅      0.95418   ⋅        ⋅        ⋅ 
  ⋅         ⋅          ⋅         ⋅       ⋅        ⋅        ⋅        ⋅ 
  ⋅         ⋅          ⋅         ⋅       ⋅        ⋅        ⋅        ⋅ 
  ⋅        0.693423   0.113387   ⋅       ⋅        ⋅        ⋅       0.743462

In [27]:
x1 = sprand(10,0.2) #cria vetor esparço 10x1 com distribuição normal e esparsidade 0.8

10-element SparseVector{Float64, Int64} with 2 stored entries:
  [2 ]  =  0.76296
  [4 ]  =  0.61587

In [28]:
 x(x)

LoadError: MethodError: objects of type Vector{Float32} are not callable
Use square brackets [] for indexing an Array.