## sigmoid on GPU

In [5]:
function sigmoid_kernel(a)
    ix = (blockIdx().x-1) * blockDim().x + threadIdx().x
    iy = (blockIdx().y-1) * blockDim().y + threadIdx().y
#     @cuprintln("ix $ix, iy $iy")
    a[ix, iy] = 1 / (1 + CuArrays.exp(-a[ix, iy]))
    return nothing
end

sigmoid_kernel (generic function with 1 method)

In [6]:
n = 10
m = 20
a_orig = Array{Float32}(rand(m, n))
a = CuArray(a_orig)
CuArrays.@sync @cuda threads=(m, n) sigmoid_kernel(a)

└ @ CUDAnative C:\Users\SarnayakPC\.julia\packages\CUDAnative\hfulr\src\compiler\irgen.jl:111


In [7]:
a

20×10 CuArray{Float32,2,Nothing}:
 0.577335  0.611561  0.61671   0.504644  …  0.553029  0.536169  0.664532
 0.66917   0.502518  0.728631  0.695894     0.685867  0.692108  0.531931
 0.548272  0.721593  0.71833   0.627631     0.686035  0.633802  0.538681
 0.727454  0.662204  0.516642  0.727517     0.534225  0.625324  0.533274
 0.724289  0.698561  0.583965  0.71587      0.564436  0.676977  0.631454
 0.575716  0.609302  0.587959  0.573978  …  0.588043  0.510213  0.6104  
 0.555853  0.59587   0.581312  0.508104     0.608823  0.503202  0.636837
 0.559386  0.524754  0.701984  0.588567     0.526298  0.507381  0.52118 
 0.547387  0.701003  0.526006  0.589232     0.550075  0.565935  0.616425
 0.689187  0.656939  0.655967  0.53345      0.598436  0.588492  0.723944
 0.562735  0.595525  0.548314  0.517616  …  0.535533  0.603272  0.591052
 0.723045  0.602947  0.563794  0.527265     0.596384  0.635612  0.571881
 0.522088  0.64529   0.708212  0.726714     0.579691  0.715405  0.520773
 0.668336  0.5019

In [8]:
1 / (1 + exp(a_orig[1, 1]))

0.4226654f0

In [9]:
a_orig

20×10 Array{Float32,2}:
 0.311841   0.453877    0.475606   …  0.212916    0.144927   0.683558 
 0.704434   0.0100735   0.987689      0.780866    0.809994   0.127896 
 0.193692   0.952376    0.936195      0.781649    0.548561   0.155035 
 0.981744   0.673133    0.0665921     0.137115    0.512207   0.133293 
 0.965836   0.840457    0.339072      0.259184    0.739915   0.53846  
 0.30521    0.444379    0.355535   …  0.355879    0.0408578  0.448994 
 0.22435    0.388286    0.328162      0.442367    0.0128068  0.561662 
 0.23867    0.0990982   0.856764      0.10529     0.0295262  0.0847726
 0.190118   0.852079    0.10412       0.200975    0.265283   0.474403 
 0.796323   0.649684    0.645372      0.398951    0.357735   0.964112 
 0.252268   0.386853    0.193862   …  0.142373    0.419117   0.368315 
 0.959616   0.417758    0.256573      0.39042     0.556368   0.289528 
 0.0884109  0.5984      0.886714      0.321506    0.921782   0.0831406
 0.700668   0.00768358  0.0643719     0.630926    0.6

In [10]:
σ(x) = 1 / (1 + exp(-x))
σ.(a_orig)

20×10 Array{Float32,2}:
 0.577335  0.611561  0.61671   0.504644  …  0.553029  0.536169  0.664532
 0.66917   0.502518  0.728631  0.695894     0.685867  0.692108  0.531931
 0.548272  0.721593  0.71833   0.627631     0.686035  0.633802  0.538681
 0.727454  0.662204  0.516642  0.727517     0.534225  0.625324  0.533274
 0.724289  0.698561  0.583965  0.71587      0.564436  0.676977  0.631454
 0.575716  0.609302  0.587959  0.573978  …  0.588043  0.510213  0.6104  
 0.555853  0.59587   0.581312  0.508104     0.608823  0.503202  0.636837
 0.559386  0.524754  0.701984  0.588567     0.526298  0.507381  0.52118 
 0.547387  0.701003  0.526006  0.589232     0.550075  0.565935  0.616425
 0.689187  0.656939  0.655967  0.53345      0.598436  0.588492  0.723944
 0.562735  0.595525  0.548314  0.517616  …  0.535533  0.603272  0.591052
 0.723045  0.602947  0.563794  0.527265     0.596384  0.635612  0.571881
 0.522088  0.64529   0.708212  0.726714     0.579691  0.715405  0.520773
 0.668336  0.501921  0.5160

In [13]:
@test a == sigmoid.(a_orig)

└ @ GPUArrays C:\Users\SarnayakPC\.julia\packages\GPUArrays\1wgPO\src\indexing.jl:16


[32m[1mTest Passed[22m[39m

In [22]:
n = 10
m = 20
a_orig = Array{Float32}(rand(m, n))
a = CuArray(a_orig)
function sigmoid_kernel_bench(a)
    CuArrays.@sync @cuda threads=(m, n) sigmoid_kernel(a)
end

sigmoid_kernel_bench (generic function with 1 method)

In [23]:
@btime sigmoid_kernel_bench($a)

  40.099 μs (36 allocations: 1.02 KiB)


In [26]:
a_orig = Array{Float32}(rand(m, n))
a = CuArray(a_orig)

20×10 CuArray{Float32,2,Nothing}:
 0.451252  0.694854   0.347139   …  0.242751   0.983149   0.487018 
 0.419234  0.601714   0.54183       0.376485   0.171601   0.548751 
 0.483025  0.161965   0.12343       0.0503669  0.883799   0.632338 
 0.749187  0.0560541  0.428402      0.466877   0.334011   0.0312829
 0.120904  0.709205   0.789291      0.242255   0.409874   0.611941 
 0.893695  0.557556   0.0934925  …  0.824731   0.249475   0.0055082
 0.843193  0.131229   0.207676      0.966639   0.392722   0.115598 
 0.101835  0.598037   0.693269      0.761445   0.273201   0.755158 
 0.751037  0.554508   0.770957      0.632452   0.604236   0.468061 
 0.915562  0.944139   0.617021      0.133088   0.323256   0.258699 
 0.362407  0.905768   0.887267   …  0.356134   0.755054   0.35554  
 0.198362  0.401095   0.159584      0.916552   0.908547   0.835168 
 0.476702  0.717044   0.718182      0.902185   0.0981334  0.440905 
 0.552425  0.205861   0.550955      0.895428   0.917851   0.575483 
 0.272971  0.8

In [25]:
@btime sigmoid.($a)

  7.567 μs (66 allocations: 2.45 KiB)


20×10 CuArray{Float32,2,Nothing}:
 0.531027  0.686346  0.630196  0.718108  …  0.602327  0.633657  0.510283
 0.66417   0.533093  0.725353  0.55908      0.681128  0.627392  0.623829
 0.632699  0.710772  0.66465   0.657485     0.685617  0.538802  0.546992
 0.709101  0.578238  0.61455   0.611689     0.701638  0.645052  0.513292
 0.595642  0.689796  0.625495  0.593311     0.540178  0.676415  0.686023
 0.61614   0.641338  0.515895  0.569663  …  0.71107   0.673949  0.500933
 0.726677  0.610259  0.669671  0.711399     0.621569  0.554837  0.670262
 0.626354  0.534322  0.577365  0.642854     0.596658  0.723433  0.51527 
 0.673336  0.729208  0.600502  0.556382     0.713148  0.641887  0.714463
 0.679888  0.512374  0.605984  0.607683     0.519441  0.660035  0.624494
 0.691909  0.569034  0.570159  0.619744  …  0.702552  0.575545  0.675735
 0.535573  0.630249  0.630825  0.666385     0.624029  0.723916  0.541694
 0.678649  0.69659   0.588314  0.720815     0.637203  0.566784  0.596924
 0.565039  0.5530

In [34]:
a_orig = Array{Float32}(rand(m, n))
a = CuArray(a_orig)
@btime relu.($a)

  7.633 μs (64 allocations: 2.41 KiB)


20×10 CuArray{Float32,2,Nothing}:
 0.379313   0.173979   0.592108    …  0.317937   0.793038   0.708511 
 0.413172   0.424013   0.338778       0.795655   0.146667   0.643705 
 0.936054   0.275679   0.9076         0.822408   0.830122   0.504997 
 0.996698   0.77327    0.327726       0.870411   0.571522   0.49797  
 0.279842   0.790206   0.865146       0.67974    0.97151    0.0970397
 0.638858   0.869738   0.0882066   …  0.605294   0.534533   0.359098 
 0.758577   0.238537   0.0566972      0.732929   0.669155   0.197593 
 0.58328    0.853425   0.446947       0.300386   0.955105   0.234626 
 0.0787462  0.395602   0.682974       0.442433   0.834596   0.656271 
 0.735329   0.0115308  0.618093       0.786229   0.551193   0.532615 
 0.330023   0.376273   0.00508564  …  0.600083   0.0414349  0.760851 
 0.31874    0.183207   0.106          0.06639    0.786127   0.815907 
 0.369561   0.558079   0.571732       0.0897157  0.414474   0.603149 
 0.607397   0.817073   0.760501       0.417553   0.87416

In [35]:
relu_2(x) = x>0 ? x : 0

relu_2 (generic function with 1 method)

In [36]:
a_orig = Array{Float32}(rand(m, n))
a = CuArray(a_orig)
@btime relu_2.($a)

  10.212 ms (1215 allocations: 50.41 KiB)


20×10 CuArray{Float32,2,Nothing}:
 0.0344329  0.509736   0.196674   …  0.523784   0.985308   0.469104
 0.429339   0.754018   0.579372      0.554148   0.950919   0.290882
 0.731354   0.225431   0.794155      0.74394    0.0262328  0.225765
 0.404615   0.472202   0.588401      0.0635131  0.10272    0.361871
 0.406868   0.487186   0.162648      0.198437   0.40577    0.271235
 0.842463   0.728695   0.763687   …  0.301082   0.0505012  0.390648
 0.471216   0.247592   0.587221      0.977824   0.200717   0.766334
 0.380655   0.0168906  0.337841      0.547154   0.986558   0.674468
 0.830283   0.753895   0.256911      0.405306   0.420468   0.700182
 0.25447    0.557292   0.479535      0.48189    0.779671   0.656069
 0.540372   0.0619335  0.384973   …  0.838241   0.740944   0.494139
 0.0820777  0.230617   0.343929      0.711321   0.371431   0.593207
 0.864271   0.983504   0.480843      0.429459   0.922232   0.799007
 0.112808   0.431099   0.609578      0.905895   0.0604937  0.285854
 0.538479   0.

In [38]:
@test relu.(a) == relu_2.(a)

[32m[1mTest Passed[22m[39m

In [39]:
### Why is ternary operator so much slower in GPU?