In [1]:
%matplotlib inline
from __future__ import print_function

import pycuda.autoinit
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import numpy as np
import time

import skcuda.linalg as culinalg
cudot=culinalg.dot
to_gpu=gpuarray.to_gpu
import skcuda.misc as cumisc
culinalg.init()

Can't load cusolver --ds


In [None]:
# Double precision is only supported by devices with compute
# capability >= 1.3:
import string
demo_types = [np.float32, np.complex64]
if cumisc.get_compute_capability(pycuda.autoinit.device) >= 1.3:
    demo_types.extend([np.float64, np.complex128])

for t in demo_types:
    print('Testing matrix multiplication for type ' + str(np.dtype(t)))
    if np.iscomplexobj(t()):
        a = np.asarray(np.random.rand(10, 5) + 1j * np.random.rand(10, 5), t)
        b = np.asarray(np.random.rand(5, 5) + 1j * np.random.rand(5, 5), t)
        c = np.asarray(np.random.rand(5, 5) + 1j * np.random.rand(5, 5), t)
    else:
        a = np.asarray(np.random.rand(10, 5), t)
        b = np.asarray(np.random.rand(5, 5), t)
        c = np.asarray(np.random.rand(5, 5), t)

    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = gpuarray.to_gpu(c)

    temp_gpu = culinalg.dot(a_gpu, b_gpu)
    d_gpu = culinalg.dot(temp_gpu, c_gpu)
    temp_gpu.gpudata.free()
    del(temp_gpu)
    print('Success status: ', np.allclose(np.dot(np.dot(a, b), c), d_gpu.get()))

    print('Testing vector multiplication for type ' + str(np.dtype(t)))
    if np.iscomplexobj(t()):
        d = np.asarray(np.random.rand(5) + 1j * np.random.rand(5), t)
        e = np.asarray(np.random.rand(5) + 1j * np.random.rand(5), t)
    else:
        d = np.asarray(np.random.rand(5), t)
        e = np.asarray(np.random.rand(5), t)

    d_gpu = gpuarray.to_gpu(d)
    e_gpu = gpuarray.to_gpu(e)

    temp = culinalg.dot(d_gpu, e_gpu)
    print('Success status: ', np.allclose(np.dot(d, e), temp))

In [3]:

m_sizes = [10,20,40,80,150,300,600,1200,2500,5000,10000,15000]

ctimes=[]
gtimes=[]

for m_size in m_sizes:
    smem= drv.mem_get_info()
    a= np.asarray(np.random.rand(m_size,m_size) + 1j * np.random.rand(m_size,m_size), np.complex64)
    b= np.asarray(np.random.rand(m_size,m_size) + 1j * np.random.rand(m_size,m_size), np.complex64)

    gst=time.time()
    ga=to_gpu(a)
    gb=to_gpu(b)
    tg=cudot(ga,gb)
    gft=time.time()
    cst=time.time()
    m=np.dot(a,b)
    cft=time.time()
    fmem=drv.mem_get_info()
    tg.gpudata.free()
    ga.gpudata.free()
    gb.gpudata.free()
    
    ctimes.append(cft-cst)
    gtimes.append(gft-gst)
    
    print ("size: %d, gpu: %f, cpu: %f, ratio : %f smem: %0.2f GB fmem: %0.2f GB" % (m_size,gtimes[-1],ctimes[-1],ctimes[-1]/gtimes[-1],(smem[1]-smem[0])/1e9,(fmem[1]-fmem[0])/1e9))

size: 10, gpu: 0.000899, cpu: 0.005532, ratio : 6.153010 smem: 0.15 GB fmem: 0.15 GB
size: 20, gpu: 0.001024, cpu: 0.000025, ratio : 0.024447 smem: 0.15 GB fmem: 0.15 GB
size: 40, gpu: 0.000867, cpu: 0.000043, ratio : 0.049780 smem: 0.15 GB fmem: 0.15 GB
size: 80, gpu: 0.000520, cpu: 0.005551, ratio : 10.675378 smem: 0.15 GB fmem: 0.15 GB
size: 150, gpu: 0.001317, cpu: 0.001439, ratio : 1.092505 smem: 0.15 GB fmem: 0.15 GB
size: 300, gpu: 0.002020, cpu: 0.003700, ratio : 1.831583 smem: 0.15 GB fmem: 0.16 GB
size: 600, gpu: 0.002834, cpu: 0.012944, ratio : 4.567258 smem: 0.15 GB fmem: 0.16 GB
size: 1200, gpu: 0.004176, cpu: 0.095504, ratio : 22.868977 smem: 0.15 GB fmem: 0.19 GB
size: 2500, gpu: 0.011887, cpu: 0.443664, ratio : 37.323238 smem: 0.15 GB fmem: 0.30 GB
size: 5000, gpu: 0.036994, cpu: 1.910062, ratio : 51.631693 smem: 0.15 GB fmem: 0.76 GB
size: 10000, gpu: 0.142716, cpu: 14.674701, ratio : 102.824375 smem: 0.15 GB fmem: 2.56 GB
size: 15000, gpu: 0.319771, cpu: 47.593949, ra

In [None]:
m_size=100
a= np.asarray(np.random.rand(m_size,m_size) + 1j * np.random.rand(m_size,m_size), np.complex64)
b= np.asarray(np.random.rand(m_size,m_size) + 1j * np.random.rand(m_size,m_size), np.complex64)
ga=to_gpu(a)
gb=to_gpu(b)
gc=cudot(ga,gb)

In [None]:
np.dot(gc.get(),gc.get())

In [None]:
2*8*30000**2/1e9

In [None]:
len(gc)

In [None]:
np.log2(100000)

In [None]:
def gpu_free(*gpu_vs):
    for gpu_v in gpu_vs:
        print (gpu_v)


In [None]:
gpu_free('a','b','c')

In [None]:
gc.shape

In [2]:
15000**3/0.3/1e12

11.25