In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-_fbsev6r
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-_fbsev6r
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 0a71d56e5dce3ff1f0dd2c47c29367629262f527
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [17]:
%%cu
#include <iostream>
#include <chrono>
#include <cstdlib>
using namespace std::chrono;
using std::cout;
using std::endl;

#define TAMAÑO_BLOQUE 64
#define ANCHO_BLOQUE 64
#define TILE_WIDTH 64

__global__ void MatrixMulKernel(float* M, float* N, float* P,
int Width) {
  // Calculate the row index of the P element and M
  int Row = blockIdx.y*blockDim.y+threadIdx.y;
  // Calculate the column index of P and N
  int Col = blockIdx.x*blockDim.x+threadIdx.x;
  if ((Row < Width) && (Col < Width)) {
  float Pvalue = 0;
  // each thread computes one element of the block sub-matrix
  for (int k = 0; k < Width; ++k) {
    Pvalue += M[Row*Width+k]*N[k*Width+Col];
  }
  P[Row*Width+Col] = Pvalue;
  }
}

__global__ void MatrixMulKerneltiled(float* d_M, float* d_N, float* d_P,
int Width) {
  __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
  __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];
  int bx = blockIdx.x; int by = blockIdx.y;
  int tx = threadIdx.x; int ty = threadIdx.y;
  // Identify the row and column of the d_P element to work on
  int Row = by * TILE_WIDTH + ty;
  int Col = bx * TILE_WIDTH + tx;
  float Pvalue = 0;
  // Loop over the d_M and d_N tiles required to compute d_P element
  for (int ph = 0; ph < Width/TILE_WIDTH; ++ph) {
    // Collaborative loading of d_M and d_N tiles into shared memory
    Mds[ty][tx] = d_M[Row*Width + ph*TILE_WIDTH + tx];
    Nds[ty][tx] = d_N[(ph*TILE_WIDTH + ty)*Width + Col];
    __syncthreads();
    for (int k = 0; k < TILE_WIDTH; ++k) {
      Pvalue += Mds[ty][k] * Nds[k][tx];
    }
    __syncthreads();
  }
  d_P[Row*Width + Col] = Pvalue;
}

void Cudainit1(float* h_A,float* h_B, float* h_C,int width){
    int tamaño = width * width * sizeof(float);
    float* d_A;
    float* d_B;
    float* d_C;
    cudaMalloc((void**)&d_A, tamaño);
    cudaMalloc((void**)&d_B, tamaño);
    cudaMalloc((void**)&d_C, tamaño);

    // transferir a la GPU
    cudaMemcpy(d_A, h_A, tamaño, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, tamaño, cudaMemcpyHostToDevice);

    // lanzar el kernel
    dim3 hilosporbloque(32, 32);
    dim3 bloquesPorGrid(((tamaño + hilosporbloque.x - 1) / hilosporbloque.x, (tamaño + hilosporbloque.y - 1) / hilosporbloque.y));

    MatrixMulKernel<<<bloquesPorGrid, hilosporbloque>>>(d_A, d_B, d_C, width);
    cudaMemcpy(h_C, d_C, tamaño, cudaMemcpyDeviceToHost);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
}

void Cudainit2(float* h_A,float* h_B, float* h_C,int width){
    int tamaño = width * width * sizeof(float);
    float* d_A;
    float* d_B;
    float* d_C;
    cudaMalloc((void**)&d_A, tamaño);
    cudaMalloc((void**)&d_B, tamaño);
    cudaMalloc((void**)&d_C, tamaño);

    // transferir a la GPU
    cudaMemcpy(d_A, h_A, tamaño, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, tamaño, cudaMemcpyHostToDevice);

    // lanzar el kernel
    dim3 hilosporbloque(32, 32);
    dim3 bloquesPorGrid(((tamaño + hilosporbloque.x - 1) / hilosporbloque.x, (tamaño + hilosporbloque.y - 1) / hilosporbloque.y));

    MatrixMulKerneltiled<<<bloquesPorGrid, hilosporbloque>>>(d_A, d_B, d_C, width);
    // transferir a la CPU
    cudaMemcpy(h_C, d_C, tamaño, cudaMemcpyDeviceToHost);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
}

int main() {
    int width=10192;
    int tamaño = width * width * sizeof(float);

    // asignar memoria en el host
    float* h_A = (float*)malloc(tamaño);
    float* h_B = (float*)malloc(tamaño);
    float* h_C = (float*)malloc(tamaño);

    // inicializar matrices en el host
    int i, j;
    for (i = 0; i <  width; i++) {
        for (j = 0; j < width; j++) {
            h_A[i*width+j] =rand()%100;
            h_B[i*width+j] =rand()%100;
        }
    }


    auto start1 = high_resolution_clock::now();
    Cudainit1(h_A,h_B,h_C,width);
    auto end1 = high_resolution_clock::now();
    auto elapsed_time1 = duration_cast<milliseconds>(end1 - start1);
     std::cout<<"\nMultiplicacion normal demoro: "<<elapsed_time1.count()<<std::endl;


    //for (int i = 0; i < tamaño/sizeof(float); i++) {
      //  std::cout<<"|"<<h_C[i]<<"|";
    //}
    std::cout<<"\n";

    float* h_C2 = (float*)malloc(tamaño);
    auto start2 = high_resolution_clock::now();
    Cudainit2(h_A,h_B,h_C2,width);
    auto end2 = high_resolution_clock::now();
    auto elapsed_time2 = duration_cast<milliseconds>(end2 - start2);
    std::cout<<"Multiplicacion por tiles: "<<elapsed_time2.count()<<std::endl;

    //for (int i = 0; i < tamaño/sizeof(float); i++) {
      //  std::cout<<"|"<<h_C2[i]<<"|";
    //``}

    free(h_A); free(h_B); free(h_C); free(h_C2);
}


Multiplicacion normal demoro: 871

Multiplicacion por tiles: 555

