Matrix multiplication

In [None]:
# clean the memory
%reset
!pip install pycuda

In [None]:
import numpy as np
from pycuda import gpuarray, autoinit
import pycuda.driver as cuda
from pycuda.tools import DeviceData
from pycuda.tools import OccupancyRecord as occupancy

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt

In [None]:
# Launch geometry shared by every kernel call in this notebook:
# 32 x 32 threads per block and 16 x 16 blocks per grid.
cuBlock, cuGrid = (32, 32, 1), (16, 16, 1)

In [None]:
# CUDA source of a naive single-precision product C = A * B for square,
# row-major matrices. Each thread computes exactly one element of C.
cudaKernel2 = '''
__global__ void matrixMul(float *A, float *B, float *C)
{
    // Global thread coordinates. With the row-major indexing used below,
    // x selects the column and y selects the row of the output element.
    int tid_x = blockDim.x * blockIdx.x + threadIdx.x; // Column
    int tid_y = blockDim.y * blockIdx.y + threadIdx.y; // Row
    // The matrix side length is taken from the launch configuration and no
    // bounds check is done, so the launch must supply exactly one thread
    // per element of the (square) matrices.
    int matrixDim = gridDim.x * blockDim.x;
    int tid   = matrixDim * tid_y + tid_x; // flat index of element (row, column)

    float  aux=0.0f;

    // Dot product of row tid_y of A with column tid_x of B.
    for ( int i=0 ; i<matrixDim ; i++ ){
        aux += A[matrixDim * tid_y + i]*B[matrixDim * i + tid_x] ;
    }

    C[tid] = aux;

}
'''

In [None]:
from pycuda.compiler import SourceModule
# Build a module from the single-precision kernel source in cudaKernel2.
myCode = SourceModule(cudaKernel2)
# Handle to the "matrixMul" kernel; called later with block= and grid= arguments.
mulMatrix = myCode.get_function("matrixMul")

In [None]:
# Single-precision host data: two random 512 x 512 inputs and a zeroed output.
# 512 matches the launch geometry (32 threads x 16 blocks per axis).
presCPU, presGPU = np.float32, 'float'
# Fixed seed so the inputs, and therefore the measured error, are reproducible.
np.random.seed(0)
a_cpu = np.random.random((512,512)).astype(presCPU)
b_cpu = np.random.random((512,512)).astype(presCPU)
c_cpu = np.zeros((512,512), dtype=presCPU)

In [None]:
# Copy the two inputs and the zeroed output buffer from host to device.
a_gpu, b_gpu, c_gpu = map(gpuarray.to_gpu, (a_cpu, b_cpu, c_cpu))

In [None]:
# The kernel takes the matrix size from the launch configuration and does no
# bounds checking, so the arrays must be square and match the launch geometry.
assert cuBlock[0] * cuGrid[0] == cuBlock[1] * cuGrid[1], \
    "launch geometry must be square"
assert a_gpu.shape == b_gpu.shape == c_gpu.shape == (cuBlock[0] * cuGrid[0],) * 2, \
    "matrix shape must equal threads-per-axis of the launch"
# The *_gpu names are rebound to float64 arrays further down; guard against
# re-running this cell with the wrong precision.
assert a_gpu.dtype == b_gpu.dtype == c_gpu.dtype == np.float32, \
    "matrixMul expects float32 device arrays"
# GPU product, written into c_gpu.
mulMatrix(a_gpu,b_gpu,c_gpu,block=cuBlock,grid=cuGrid)
# CPU reference product used to check the GPU result.
dotAB = np.dot(a_cpu, b_cpu)

In [None]:
# Element-wise absolute error between the GPU result (copied back to the host)
# and the CPU reference, followed by the total error as the cell output.
diff = np.absolute(c_gpu.get() - dotAB)
diff.sum()

In [None]:
# Map of the per-element absolute error. Explicit figure/axes objects with a
# title and labels so the plot can be read on its own.
fig, ax = plt.subplots()
im = ax.imshow(diff,interpolation='none')
ax.set_title('Absolute difference |GPU - CPU|')
ax.set_xlabel('column')
ax.set_ylabel('row')
cbar = fig.colorbar(im, ax=ax)
cbar.set_label('absolute difference');

In [None]:
# Show the CPU reference product for visual comparison with the GPU result.
dotAB

In [None]:
# Show the device array that received the kernel output.
c_gpu

In [None]:
# Double-precision host data: two random 512 x 512 inputs and a zeroed output.
# This rebinds the names used for the single-precision run above.
presCPU, presGPU = np.float64, 'double'
# Fixed seed so the inputs, and therefore the measured error, are reproducible.
np.random.seed(0)
a_cpu = np.random.random((512,512)).astype(presCPU)
b_cpu = np.random.random((512,512)).astype(presCPU)
c_cpu = np.zeros((512,512), dtype=presCPU)

In [None]:
# Copy the double-precision inputs and the zeroed output buffer to the device.
a_gpu, b_gpu, c_gpu = map(gpuarray.to_gpu, (a_cpu, b_cpu, c_cpu))

In [None]:
# Check the precision of the host input after the cast to presCPU.
a_cpu.dtype

In [None]:
# CUDA source of a naive double-precision product C = A * B for square,
# row-major matrices. Each thread computes exactly one element of C.
cudaKernel3 = '''
__global__ void matrixMul64(double *A, double *B, double *C)
{
    // Global thread coordinates. With the row-major indexing used below,
    // x selects the column and y selects the row of the output element.
    int tid_x = blockDim.x * blockIdx.x + threadIdx.x; // Column
    int tid_y = blockDim.y * blockIdx.y + threadIdx.y; // Row
    // The matrix side length is taken from the launch configuration and no
    // bounds check is done, so the launch must supply exactly one thread
    // per element of the (square) matrices.
    int matrixDim = gridDim.x * blockDim.x;
    int tid   = matrixDim * tid_y + tid_x; // flat index of element (row, column)

    double aux = 0.0;
    // Dot product of row tid_y of A with column tid_x of B.
    for ( int i=0 ; i<matrixDim ; i++ ){
        aux += A[matrixDim * tid_y + i]*B[matrixDim * i + tid_x] ;
    }

    C[tid] = aux;

}
'''

In [None]:
# Build a module from the double-precision kernel source in cudaKernel3.
myCode64 = SourceModule(cudaKernel3)
# Handle to the "matrixMul64" kernel; called later with block= and grid= arguments.
mulMatrix64 = myCode64.get_function("matrixMul64")

In [None]:
# The kernel takes the matrix size from the launch configuration and does no
# bounds checking, so the arrays must be square and match the launch geometry.
assert cuBlock[0] * cuGrid[0] == cuBlock[1] * cuGrid[1], \
    "launch geometry must be square"
assert a_gpu.shape == b_gpu.shape == c_gpu.shape == (cuBlock[0] * cuGrid[0],) * 2, \
    "matrix shape must equal threads-per-axis of the launch"
# The *_gpu names were also used for the float32 run; make sure the
# double-precision arrays are the ones currently bound.
assert a_gpu.dtype == b_gpu.dtype == c_gpu.dtype == np.float64, \
    "matrixMul64 expects float64 device arrays"
# GPU product, written into c_gpu.
mulMatrix64(a_gpu,b_gpu,c_gpu,block=cuBlock,grid=cuGrid)
# CPU reference product used to check the GPU result.
dotAB = np.dot(a_cpu, b_cpu)

In [None]:
# Precision of the device output array.
c_gpu.dtype

In [None]:
# Precision of the CPU reference product.
dotAB.dtype

In [None]:
# Element-wise absolute error between the GPU result (copied back to the host)
# and the CPU reference.
diff = np.absolute(c_gpu.get() - dotAB)

In [None]:
# Map of the per-element absolute error. Explicit figure/axes objects with a
# title and labels so the plot can be read on its own.
fig, ax = plt.subplots()
im = ax.imshow(diff,interpolation='none')
ax.set_title('Absolute difference |GPU - CPU|')
ax.set_xlabel('column')
ax.set_ylabel('row')
cbar = fig.colorbar(im, ax=ax)
cbar.set_label('absolute difference');