In [1]:
!pip install git+https://github.com/afnan47/cuda.git

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-6wmldgey
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-6wmldgey
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4290 sha256=c43a4ca6e2b0fc8628a15c1c7e014c90aaa80c91a1ed30ef4491added9eb1f1e
  Stored in directory: /tmp/pip-ephem-wheel-cache-tmwnq5t7/wheels/bc/4e/e0/2d86bd15f671dbeb32144013f1159dba09757fde36dc51a963
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [4]:
%%writefile matrix.cu
#include <iostream>
#include <cuda.h>
#include <chrono>

using namespace std;
using namespace std::chrono;


// Matrix-product kernel: each thread computes at most one element of C.
//
// A is read as an M x N row-major array, B as N x K, and C is written as
// M x K. The thread's x coordinate selects the output column and its
// y coordinate selects the output row, so the launch grid must span at least
// K threads in x and M threads in y.
__global__ void multiply(int* A, int* B, int* C, int M, int N, int K) {
    const int col = threadIdx.x + blockDim.x * blockIdx.x;
    const int row = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads that fall outside the M x K output have nothing to do.
    if (row >= M || col >= K) {
        return;
    }

    // Dot product of row `row` of A with column `col` of B.
    int acc = 0;
    for (int k = 0; k < N; ++k) {
        acc += A[row * N + k] * B[k * K + col];
    }
    C[row * K + col] = acc;
}

// Fills a rows x cols matrix from standard input, prompting once per element.
// Elements are stored in the order they are entered; prompts are numbered
// from 1.
void initialize(int* matrix, int rows, int cols) {
    const int total = rows * cols;
    int index = 0;
    while (index < total) {
        cout << "Enter element " << index + 1 << ": ";
        cin >> matrix[index];
        ++index;
    }
}

// Writes a rows x cols row-major matrix to standard output: one line per
// row, every element followed by a space, and a blank line after the matrix.
void print(int* matrix, int rows, int cols) {
    const int* cell = matrix;
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            cout << *cell << " ";
            ++cell;
        }
        cout << '\n';
    }
    cout << '\n';
}

// CPU reference matrix product: C (M x K) = A (M x N) * B (N x K).
// All three matrices are indexed as flat row-major arrays.
void sequentialMultiply(int* A, int* B, int* C, int M, int N, int K) {
    for (int r = 0; r < M; ++r) {
        const int lhsBase = r * N;  // offset of row r in A
        const int outBase = r * K;  // offset of row r in C
        for (int c = 0; c < K; ++c) {
            int acc = 0;
            for (int t = 0; t < N; ++t) {
                acc += A[lhsBase + t] * B[t * K + c];
            }
            C[outBase + c] = acc;
        }
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Reads two matrices from standard input, multiplies them on the CPU and on
// the GPU, prints both results and the time each path took.
//
// Returns 0 on success, 1 on invalid dimensions or on any CUDA failure.
int main() {
    int M, N, K;
    cout << "Enter the number of rows and columns of the first matrix: ";
    cin >> M >> N;
    cout << "Enter the number of columns of the second matrix: ";
    cin >> K;

    // Zero or negative sizes would produce invalid allocations and an
    // invalid (zero-sized) launch grid, so reject them up front.
    if (!cin || M <= 0 || N <= 0 || K <= 0) {
        cerr << "Matrix dimensions must be positive integers." << endl;
        return 1;
    }

    // Widen before multiplying so the element counts are not computed in int.
    const size_t countA = static_cast<size_t>(M) * N;
    const size_t countB = static_cast<size_t>(N) * K;
    const size_t countC = static_cast<size_t>(M) * K;
    const size_t bytesA = countA * sizeof(int);
    const size_t bytesB = countB * sizeof(int);
    const size_t bytesC = countC * sizeof(int);

    int* A = new int[countA];
    int* B = new int[countB];
    int* C = new int[countC];     // CPU result
    int* gpuC = new int[countC];  // GPU result, kept separate from the CPU
                                  // result so a failed GPU run cannot be
                                  // mistaken for a correct one

    initialize(A, M, N);
    initialize(B, N, K);

    cout << "Matrix A: \n";
    print(A, M, N);

    cout << "Matrix B: \n";
    print(B, N, K);

    int* X = nullptr;
    int* Y = nullptr;
    int* Z = nullptr;
    bool ok = cudaOk(cudaMalloc(&X, bytesA), "cudaMalloc for A")
           && cudaOk(cudaMalloc(&Y, bytesB), "cudaMalloc for B")
           && cudaOk(cudaMalloc(&Z, bytesC), "cudaMalloc for C")
           && cudaOk(cudaMemcpy(X, A, bytesA, cudaMemcpyHostToDevice), "copy A to device")
           && cudaOk(cudaMemcpy(Y, B, bytesB, cudaMemcpyHostToDevice), "copy B to device");

    const int THREADS = 16;
    dim3 threads(THREADS, THREADS);
    // The kernel maps grid x to output columns (K) and grid y to output rows
    // (M), so each grid dimension is sized from its own extent.
    dim3 blocks((K + THREADS - 1) / THREADS, (M + THREADS - 1) / THREADS);

    // Sequential multiplication
    auto start = high_resolution_clock::now();
    sequentialMultiply(A, B, C, M, N, K);
    auto stop = high_resolution_clock::now();
    auto seq_duration = duration_cast<microseconds>(stop - start);

    cout << "Sequential Multiplication of matrix A and B: \n";
    print(C, M, K);

    // Parallel multiplication (timed span: kernel launch + blocking copy back)
    microseconds par_duration(0);
    if (ok) {
        start = high_resolution_clock::now();
        multiply<<<blocks, threads>>>(X, Y, Z, M, N, K);
        // A launch does not return a status; query it explicitly. The
        // blocking cudaMemcpy then waits for the kernel and reports any
        // error raised while it ran.
        ok = cudaOk(cudaGetLastError(), "multiply kernel launch")
          && cudaOk(cudaMemcpy(gpuC, Z, bytesC, cudaMemcpyDeviceToHost), "copy result to host");
        stop = high_resolution_clock::now();
        par_duration = duration_cast<microseconds>(stop - start);
    }

    if (ok) {
        cout << "Parallel Multiplication of matrix A and B: \n";
        print(gpuC, M, K);
    }

    cout << "Sequential Multiplication Time: " << seq_duration.count() << " microseconds" << endl;
    if (ok) {
        cout << "Parallel Multiplication Time: " << par_duration.count() << " microseconds" << endl;
    }

    delete[] A;
    delete[] B;
    delete[] C;
    delete[] gpuC;

    // Pointers that were never allocated are still nullptr, which cudaFree
    // accepts as a no-op.
    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return ok ? 0 : 1;
}


Writing matrix.cu


In [5]:
!nvcc matrix.cu -o matrix
!./matrix


      size_t matrixBytes = matrixSize * sizeof(int);
             ^


Enter the number of rows and columns of the first matrix: 2 2
Enter the number of columns of the second matrix: 2
Enter element 1: 12
Enter element 2: 23
Enter element 3: 12
Enter element 4: 343
Enter element 1: 34
Enter element 2: 34
Enter element 3: 45
Enter element 4: 1
Matrix A: 
12 23 
12 343 

Matrix B: 
34 34 
45 1 

Sequential Multiplication of matrix A and B: 
1443 431 
15843 751 

Parallel Multiplication of matrix A and B: 
1443 431 
15843 751 

Sequential Multiplication Time: 0 microseconds
Parallel Multiplication Time: 6 microseconds


In [6]:
%%writefile vector.cu
#include <iostream>
#include <cuda.h>
#include <chrono>

using namespace std;
using namespace std::chrono;

// Element-wise vector addition kernel: the thread with global index i writes
// C[i] = A[i] + B[i]. Threads whose index is not below `size` do nothing, so
// the launch may contain more threads than elements.
__global__ void add(int* A, int* B, int* C, int size) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    if (idx >= size) {
        return;
    }
    C[idx] = A[idx] + B[idx];
}

// Fills `vector` with `size` integers read from standard input, prompting
// once per element (prompts are numbered from 1).
void initialize(int* vector, int size) {
    int index = 0;
    while (index < size) {
        cout << "Enter element " << index + 1 << " of the vector: ";
        cin >> vector[index];
        ++index;
    }
}

// Writes the first `size` elements of `vector` on one line, each followed by
// a space, then ends the line and flushes the stream.
void print(int* vector, int size) {
    int pos = 0;
    while (pos < size) {
        cout << vector[pos] << " ";
        ++pos;
    }
    cout << endl;
}

// CPU reference for the vector sum: C[i] = A[i] + B[i] for the first `size`
// elements, processed in increasing index order.
void sequentialAddition(int* A, int* B, int* C, int size) {
    int i = 0;
    while (i < size) {
        const int total = A[i] + B[i];
        C[i] = total;
        ++i;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Reads two equal-length vectors from standard input, adds them on the CPU
// and on the GPU, prints both results and the time each path took.
//
// Returns 0 on success, 1 on an invalid size or on any CUDA failure.
int main() {
    int N;
    cout << "Enter the size of the vectors: ";
    cin >> N;

    // A zero or negative size would produce invalid allocations and a
    // zero-block launch, so reject it up front.
    if (!cin || N <= 0) {
        cerr << "Vector size must be a positive integer." << endl;
        return 1;
    }

    const int vectorSize = N;
    // Widen before multiplying so the byte count is not computed in int.
    const size_t vectorBytes = static_cast<size_t>(vectorSize) * sizeof(int);

    int* A = new int[vectorSize];
    int* B = new int[vectorSize];
    int* C = new int[vectorSize];     // CPU result
    int* gpuC = new int[vectorSize];  // GPU result, kept separate from the
                                      // CPU result so a failed GPU run cannot
                                      // be mistaken for a correct one

    initialize(A, vectorSize);
    initialize(B, vectorSize);

    cout << "Vector A: ";
    print(A, N);
    cout << "Vector B: ";
    print(B, N);

    int* X = nullptr;
    int* Y = nullptr;
    int* Z = nullptr;
    bool ok = cudaOk(cudaMalloc(&X, vectorBytes), "cudaMalloc for A")
           && cudaOk(cudaMalloc(&Y, vectorBytes), "cudaMalloc for B")
           && cudaOk(cudaMalloc(&Z, vectorBytes), "cudaMalloc for C")
           && cudaOk(cudaMemcpy(X, A, vectorBytes, cudaMemcpyHostToDevice), "copy A to device")
           && cudaOk(cudaMemcpy(Y, B, vectorBytes, cudaMemcpyHostToDevice), "copy B to device");

    const int threadsPerBlock = 256;
    // Ceiling division: enough blocks to give every element a thread.
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Sequential addition
    auto start = high_resolution_clock::now();
    sequentialAddition(A, B, C, N);
    auto stop = high_resolution_clock::now();
    auto seq_duration = duration_cast<microseconds>(stop - start);

    cout << "Sequential Addition: ";
    print(C, N);

    // Parallel addition (timed span: kernel launch + blocking copy back)
    microseconds par_duration(0);
    if (ok) {
        start = high_resolution_clock::now();
        add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, N);
        // A launch does not return a status; query it explicitly. The
        // blocking cudaMemcpy then waits for the kernel and reports any
        // error raised while it ran.
        ok = cudaOk(cudaGetLastError(), "add kernel launch")
          && cudaOk(cudaMemcpy(gpuC, Z, vectorBytes, cudaMemcpyDeviceToHost), "copy result to host");
        stop = high_resolution_clock::now();
        par_duration = duration_cast<microseconds>(stop - start);
    }

    if (ok) {
        cout << "Parallel Addition: ";
        print(gpuC, N);
    }

    cout << "Sequential Addition Time: " << seq_duration.count() << " microseconds" << endl;
    if (ok) {
        cout << "Parallel Addition Time: " << par_duration.count() << " microseconds" << endl;
    }

    delete[] A;
    delete[] B;
    delete[] C;
    delete[] gpuC;

    // Pointers that were never allocated are still nullptr, which cudaFree
    // accepts as a no-op.
    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return ok ? 0 : 1;
}


Writing vector.cu


In [7]:
!nvcc vector.cu -o vector
!./vector

Enter the size of the vectors: 3
Enter element 1 of the vector: 12
Enter element 2 of the vector: 1
Enter element 3 of the vector: 1
Enter element 1 of the vector: 1
Enter element 2 of the vector: 12
Enter element 3 of the vector: 1
Vector A: 12 1 1 
Vector B: 1 12 1 
Sequential Addition: 13 13 2 
Parallel Addition: 13 13 2 
Sequential Addition Time: 0 microseconds
Parallel Addition Time: 15 microseconds
