In [40]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [41]:
!pip install git+https://github.com/afnan47/cuda.git

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-3rm4q5f0
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-3rm4q5f0
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run

In [42]:
%load_ext nvcc_plugin

The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [47]:
 %%writefile add.cu
#include <iostream>
#include <ctime>   // Include <ctime> for time()
using namespace std;

// Element-wise integer vector sum: C[i] = A[i] + B[i] for every i in [0, size).
// Indexing is one-dimensional (only the .x components of the block/thread ids are
// used) and each thread writes at most one element, so the launch has to supply at
// least `size` threads in total. A, B and C are dereferenced inside the kernel.
__global__
void add(int* A, int* B, int* C, int size)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the tail of the last block have no element to process.
    if (idx >= size)
    {
        return;
    }

    C[idx] = B[idx] + A[idx];
}

// Reports a failed CUDA runtime call on stderr as "<what> failed: <reason>".
// Returns true when `status` is cudaSuccess so that calls can be chained with &&.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Prints `prompt`, then reads `count` integers from stdin into `dst`.
// Returns false as soon as one extraction fails (non-numeric text or end of input).
static bool readVector(const char* prompt, int* dst, int count)
{
    cout << prompt;
    for (int i = 0; i < count; ++i) {
        if (!(cin >> dst[i])) {
            return false;
        }
    }
    return true;
}

// Prints `label` followed by the `count` entries of `values`, space separated, on one line.
static void printVector(const char* label, const int* values, int count)
{
    cout << label;
    for (int i = 0; i < count; ++i) {
        cout << values[i] << " ";
    }
    cout << endl;
}

// Reads two integer vectors from stdin, adds them on the GPU with the `add` kernel and
// prints the inputs, the result, the GPU elapsed time and the number of launched threads.
// Returns 0 on success and 1 on invalid input or on any CUDA runtime failure.
int main() {
    int N = 0;
    cout << "Enter the size of vectors: ";
    // A failed extraction or a non-positive size would otherwise reach new[] and the
    // kernel launch with a meaningless element count.
    if (!(cin >> N) || N <= 0) {
        cerr << "Vector size must be a positive integer" << endl;
        return 1;
    }

    const size_t vectorBytes = static_cast<size_t>(N) * sizeof(int);

    // Allocate host memory
    int* A = new int[N];
    int* B = new int[N];
    int* C = new int[N];

    // Device buffers and timing events start out null so that the single cleanup
    // section at the bottom is safe no matter where a failure happens.
    int* X = nullptr;
    int* Y = nullptr;
    int* Z = nullptr;
    cudaEvent_t startEvent = nullptr;
    cudaEvent_t stopEvent = nullptr;
    float elapsedMs = 0.0f;

    const int threadsPerBlock = 256;
    // Ceiling division written so that it cannot overflow int for N close to INT_MAX.
    const int blocksPerGrid = N / threadsPerBlock + (N % threadsPerBlock != 0 ? 1 : 0);

/*
    //To generate random input values for vectors with user-defined size, uncomment the following code block
    //and replace the manual-input statement below with `bool ok = true;`.

    // Generate random input values
    srand(time(NULL));
    cout << "Random elements for vector A: ";
    for (int i = 0; i < N; ++i) {
        A[i] = rand() % 100; // Generate random values between 0 and 99
        cout << A[i] << " ";
    }
    cout << endl;

    cout << "Random elements for vector B: ";
    for (int i = 0; i < N; ++i) {
        B[i] = rand() % 100; // Generate random values between 0 and 99
        cout << B[i] << " ";
    }
    cout << endl;

*/

    // Manual input for vector values
    bool ok = readVector("Enter elements of vector A: ", A, N)
           && readVector("Enter elements of vector B: ", B, N);
    if (!ok) {
        cerr << "Invalid or missing vector element" << endl;
    }

    if (ok) {
        printVector("Vector A: ", A, N);
        printVector("Vector B: ", B, N);

        // Allocate device memory; the return codes are the only reliable failure signal.
        ok = cudaSucceeded(cudaMalloc(&X, vectorBytes), "CUDA memory allocation")
          && cudaSucceeded(cudaMalloc(&Y, vectorBytes), "CUDA memory allocation")
          && cudaSucceeded(cudaMalloc(&Z, vectorBytes), "CUDA memory allocation");
    }

    if (ok) {
        // Copy data from host to device
        ok = cudaSucceeded(cudaMemcpy(X, A, vectorBytes, cudaMemcpyHostToDevice), "CUDA memcpy")
          && cudaSucceeded(cudaMemcpy(Y, B, vectorBytes, cudaMemcpyHostToDevice), "CUDA memcpy");
    }

    if (ok) {
        ok = cudaSucceeded(cudaEventCreate(&startEvent), "CUDA event creation")
          && cudaSucceeded(cudaEventCreate(&stopEvent), "CUDA event creation");
    }

    if (ok) {
        ok = cudaSucceeded(cudaEventRecord(startEvent), "CUDA event record");
    }

    if (ok) {
        // Launch kernel, bracketed by events so the reported time is GPU elapsed time
        // rather than host CPU time.
        add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, N);

        ok = cudaSucceeded(cudaGetLastError(), "CUDA kernel launch")
          && cudaSucceeded(cudaEventRecord(stopEvent), "CUDA event record")
          // Waiting on the stop event also waits for the kernel and surfaces its errors.
          && cudaSucceeded(cudaEventSynchronize(stopEvent), "CUDA kernel execution")
          && cudaSucceeded(cudaEventElapsedTime(&elapsedMs, startEvent, stopEvent), "CUDA event timing")
          // Copy result from device to host
          && cudaSucceeded(cudaMemcpy(C, Z, vectorBytes, cudaMemcpyDeviceToHost), "CUDA memcpy");
    }

    if (ok) {
        printVector("Addition: ", C, N);
    }

    // Cleanup runs on every path. Return codes are deliberately ignored here: after an
    // earlier failure there is nothing more useful to report.
    if (startEvent != nullptr) {
        cudaEventDestroy(startEvent);
    }
    if (stopEvent != nullptr) {
        cudaEventDestroy(stopEvent);
    }

    // Free device memory (cudaFree accepts a null pointer)
    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    // Free host memory
    delete[] A;
    delete[] B;
    delete[] C;

    if (!ok) {
        return 1;
    }

    // cudaEventElapsedTime reports milliseconds.
    const double time_taken = static_cast<double>(elapsedMs) / 1000.0;
    cout << "Time taken: " << time_taken << " seconds" << endl;
    cout << "Number of threads used: "
         << static_cast<long long>(blocksPerGrid) * threadsPerBlock << endl;

    return 0;
}


Overwriting add.cu


In [48]:
!nvcc add.cu -o add

In [49]:
!./add

Enter the size of vectors: 3
Enter elements of vector A: 1 2 3
Enter elements of vector B: 4 5 6
Vector A: 1 2 3 
Vector B: 4 5 6 
Addition: 5 7 9 
Time taken: 0.000163 seconds
Number of threads used: 256


In [53]:
 %%writefile matrix.cu
#include <iostream>
#include <limits>
#include <cuda.h>
using namespace std;

#define BLOCK_SIZE 2

// Square matrix product C = A * B for row-major N x N float matrices.
// Launch layout: 2D grid of 2D blocks; the thread at global (x, y) computes the single
// output element C[y][x], so the grid must cover at least N threads in each dimension.
// Threads that fall outside the matrix exit immediately, which makes launches whose
// grid over-covers the matrix (N not a multiple of the block edge) safe.
// A, B and C are dereferenced inside the kernel and must each hold N * N floats.
__global__ void gpuMM(float *A, float *B, float *C, int N)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: without it surplus threads read and write out of bounds.
    if (row >= N || col >= N)
        return;

    // Index arithmetic is done in size_t so row * N cannot overflow int for large N.
    const size_t width = static_cast<size_t>(N);
    const size_t r = static_cast<size_t>(row);
    const size_t c = static_cast<size_t>(col);

    float sum = 0.f;
    for (size_t n = 0; n < width; ++n)
        sum += A[r * width + n] * B[n * width + c];
    C[r * width + c] = sum;
}

// Reports a failed CUDA runtime call on stderr as "<what> failed: <reason>".
// Returns true when `status` is cudaSuccess so that calls can be chained with &&.
static bool cudaSucceeded(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    cerr << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Prints `prompt`, reads N * N floats from stdin into `dst` in row-major order, then
// discards the remainder of the current input line.
// Returns false as soon as one extraction fails (non-numeric text or end of input).
static bool readMatrix(const char* prompt, float* dst, int N)
{
    cout << prompt;
    const size_t count = static_cast<size_t>(N) * N;
    for (size_t k = 0; k < count; ++k) {
        if (!(cin >> dst[k])) {
            return false;
        }
    }
    cin.ignore(numeric_limits<streamsize>::max(), '\n'); // Clear input buffer
    return true;
}

// Reads a size and two square matrices from stdin, multiplies them on the GPU with the
// gpuMM kernel and prints the product.
// Returns 0 on success and 1 on invalid input or on any CUDA runtime failure.
int main(int argc, char *argv[])
{
    (void)argc; // command-line arguments are not used
    (void)argv;

    // Largest dimension whose N * N element count still fits in a 32-bit int
    // (46340 * 46340 = 2147395600), so element indices cannot overflow.
    const int maxDim = 46340;

    int N = 0;
    cout << "Enter the size of the matrix: ";
    if (!(cin >> N) || N <= 0 || N > maxDim / BLOCK_SIZE) {
        cerr << "Matrix size must be an integer between 1 and " << maxDim / BLOCK_SIZE << endl;
        return 1;
    }
    // Scale by the block edge so the matrix dimension is always a whole number of
    // blocks and the N / BLOCK_SIZE grid below covers it exactly.
    N *= BLOCK_SIZE;
    cout << "\nExecuting Matrix Multiplication" << endl;
    cout << "Matrix size: " << N << "x" << N << endl;

    const size_t elems = static_cast<size_t>(N) * N;
    const size_t size = elems * sizeof(float);

    // Allocate memory on the host
    float *hA = new float[elems];
    float *hB = new float[elems];
    float *hC = new float[elems];

    // Device pointers start out null so the single cleanup section at the bottom is
    // safe no matter where a failure happens.
    float *dA = nullptr;
    float *dB = nullptr;
    float *dC = nullptr;
  /*
    //To generate random input values for matrices with user-defined size, uncomment the following code block
    //and replace the manual-input statement below with `bool ok = true;`.

    // Generate random input values for matrix A
    cout << "Random elements for matrix A: ";
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < N; i++)
        {
            hA[j * N + i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
            cout << hA[j * N + i] << " ";
        }
        cout << endl;
    }

    // Generate random input values for matrix B
    cout << "Random elements for matrix B: ";
    for (int j = 0; j < N; j++)
    {
        for (int i = 0; i < N; i++)
        {
            hB[j * N + i] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
            cout << hB[j * N + i] << " ";
        }
        cout << endl;
    }
    */
    // Initialize matrices on the host with user input
    bool ok = readMatrix("Enter elements of matrix A: ", hA, N)
           && readMatrix("Enter elements of matrix B: ", hB, N);
    if (!ok) {
        cerr << "Invalid or missing matrix element" << endl;
    }

    if (ok) {
        // Allocate memory on the device
        ok = cudaSucceeded(cudaMalloc(&dA, size), "CUDA memory allocation")
          && cudaSucceeded(cudaMalloc(&dB, size), "CUDA memory allocation")
          && cudaSucceeded(cudaMalloc(&dC, size), "CUDA memory allocation");
    }

    if (ok) {
        // Copy matrices from the host to device
        ok = cudaSucceeded(cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice), "CUDA memcpy")
          && cudaSucceeded(cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice), "CUDA memcpy");
    }

    if (ok) {
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(N / BLOCK_SIZE, N / BLOCK_SIZE); // exact: N is a multiple of BLOCK_SIZE

        // Execute the matrix multiplication kernel
        gpuMM<<<grid, threadBlock>>>(dA, dB, dC, N);

        // A launch reports nothing directly: configuration errors show up through
        // cudaGetLastError and execution errors at the next synchronising call.
        ok = cudaSucceeded(cudaGetLastError(), "CUDA kernel launch")
          && cudaSucceeded(cudaDeviceSynchronize(), "CUDA kernel execution")
          // Copy the GPU result back to CPU
          && cudaSucceeded(cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost), "CUDA memcpy");
    }

    if (ok) {
        // Display the result
        cout << "\nResultant matrix:\n";
        for (int row = 0; row < N; row++)
        {
            for (int col = 0; col < N; col++)
            {
                cout << hC[static_cast<size_t>(row) * N + col] << " ";
            }
            cout << endl;
        }
    }

    // Free device memory (cudaFree accepts a null pointer). Return codes are
    // deliberately ignored: after an earlier failure there is nothing more to report.
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);

    // Free host memory
    delete[] hA;
    delete[] hB;
    delete[] hC;

    if (!ok) {
        return 1;
    }

    cout << "Finished." << endl;
    return 0;
}


Overwriting matrix.cu


In [54]:
!nvcc matrix.cu -o matrix

In [55]:
!./matrix

Enter the size of the matrix: 1

Executing Matrix Multiplication
Matrix size: 2x2
Random elements for matrix A: 0.840188 0.394383 
0.783099 0.79844 
Random elements for matrix B: 0.911647 0.197551 
0.335223 0.76823 

Resultant matrix:
0.898161 0.468957 
0.981566 0.768088 
Finished.
