In [None]:
!nvcc --version
!pip install git+https://github.com/afnan47/cuda.git
%load_ext nvcc_plugin

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-yyy4jrxu
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-yyy4jrxu
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4289 sha256=03d5e0ad41d6f8a9c229c75e696f2ea037a09ce2fa737baf6f5f49a53d034cf9
  Stored in directory: /tmp/pip-ephem-wheel-cache-ejkbww9r/wheels/aa/f3/44/e10c1d226ec561d971fcd4b0463f6bff08602afa928a3e

In [None]:
%%writefile breadthfirst.cu
#include <iostream>
#include <queue>
#include <vector>
#include <omp.h>

using namespace std;

int main() {
    int num_vertices, num_edges, source;
    cout << "Enter number of vertices, edges, and source node: ";
    cin >> num_vertices >> num_edges >> source;

    // Input validation
    if (source < 1 || source > num_vertices) {
        cout << "Invalid source node!" << endl;
        return 1;
    }

    vector<vector<int>> adj_list(num_vertices + 1);
    for (int i = 0; i < num_edges; i++) {
        int u, v;
        cin >> u >> v;
        // Input validation for edges
        if (u < 1 || u > num_vertices || v < 1 || v > num_vertices) {
            cout << "Invalid edge: " << u << " " << v << endl;
            return 1;
        }
        adj_list[u].push_back(v);
        adj_list[v].push_back(u);
    }

    queue<int> q;
    vector<bool> visited(num_vertices + 1, false);
    q.push(source);
    visited[source] = true;

    while (!q.empty()) {
        int curr_vertex = q.front();
        q.pop();
        cout << curr_vertex << " ";

        // Sequential loop for neighbors
        for (int i = 0; i < adj_list[curr_vertex].size(); i++) {
            int neighbour = adj_list[curr_vertex][i];
            if (!visited[neighbour]) {
                visited[neighbour] = true;
                q.push(neighbour);
            }
        }
    }

    cout << endl;
    return 0;
}

Writing breadthfirst.cu


In [None]:
!nvcc breadthfirst.cu -o breadthfirst
!./breadthfirst

Enter number of vertices, edges, and source node: 5 4 1
1 2
1 3
2 4
3 5
1 2 3 4 5 


In [None]:
%%writefile depthfirst.cu
#include <iostream>
#include <vector>
#include <omp.h>
using namespace std;
const int MAXN = 1e5;
vector<int> adj[MAXN+5]; // adjacency list
bool visited[MAXN+5]; // mark visited nodes
void dfs(int node) {
 visited[node] = true;
 #pragma omp parallel for
 for (int i = 0; i < adj[node].size(); i++) {
 int next_node = adj[node][i];
 if (!visited[next_node]) {
 dfs(next_node);
 }
 }
}
int main() {
 cout << "Please enter nodes and edges";
 int n, m; // number of nodes and edges
 cin >> n >> m;
 for (int i = 1; i <= m; i++) {
 int u, v; // edge between u and v
 cin >> u >> v;
 adj[u].push_back(v);
 adj[v].push_back(u);
 }
 int start_node; // start node of DFS
 cin >> start_node;
 dfs(start_node);
 // Print visited nodes
 for (int i = 1; i <= n; i++) {
 if (visited[i]) {
 cout << i << " ";
 }
 }
 cout << endl;
 return 0;
}

Writing depthfirst.cu


In [None]:
!nvcc depthfirst.cu -o depthfirst
!./depthfirst

Please enter nodes and edges5 6
1 2
1 3
2 3
2 4
3 4
4 5
^C


In [None]:
%%writefile sum.cu
#include <iostream>
#include <vector>
#include <climits>

__global__ void min_reduction_kernel(int* arr, int size, int* result) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        atomicMin(result, arr[tid]);
    }
}

__global__ void max_reduction_kernel(int* arr, int size, int* result) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        atomicMax(result, arr[tid]);
    }
}

__global__ void sum_reduction_kernel(int* arr, int size, int* result) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        atomicAdd(result, arr[tid]);
    }
}

__global__ void average_reduction_kernel(int* arr, int size, int* sum) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        atomicAdd(sum, arr[tid]);
    }
}

int main() {
    std::vector<int> arr = {5, 2, 9, 1, 7, 6, 8, 3, 4};
    int size = arr.size();
    int* d_arr;
    int* d_result;
    int result_min = INT_MAX;
    int result_max = INT_MIN;
    int result_sum = 0;

    // Allocate memory on the device
    cudaMalloc(&d_arr, size * sizeof(int));
    cudaMalloc(&d_result, sizeof(int));

    // Copy data from host to device
    cudaMemcpy(d_arr, arr.data(), size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_result, &result_min, sizeof(int), cudaMemcpyHostToDevice);

    // Perform min reduction
    min_reduction_kernel<<<(size + 255) / 256, 256>>>(d_arr, size, d_result);
    cudaMemcpy(&result_min, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Minimum value: " << result_min << std::endl;

    // Perform max reduction
    cudaMemcpy(d_result, &result_max, sizeof(int), cudaMemcpyHostToDevice);
    max_reduction_kernel<<<(size + 255) / 256, 256>>>(d_arr, size, d_result);
    cudaMemcpy(&result_max, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Maximum value: " << result_max << std::endl;

    // Perform sum reduction
    cudaMemcpy(d_result, &result_sum, sizeof(int), cudaMemcpyHostToDevice);
    sum_reduction_kernel<<<(size + 255) / 256, 256>>>(d_arr, size, d_result);
    cudaMemcpy(&result_sum, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Sum: " << result_sum << std::endl;

    // Perform average reduction
    cudaMemcpy(d_result, &result_sum, sizeof(int), cudaMemcpyHostToDevice);
    average_reduction_kernel<<<(size + 255) / 256, 256>>>(d_arr, size, d_result);
    cudaMemcpy(&result_sum, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Average: " << static_cast<double>(result_sum) / size << std::endl;

    // Free device memory
    cudaFree(d_arr);
    cudaFree(d_result);

    return 0;
}

Writing sum.cu


In [None]:
!nvcc sum.cu -o sum
!./sum

Minimum value: 2147483647
Maximum value: -2147483648
Sum: 0
Average: 0


In [None]:
%%writefile add.cu
#include <iostream>
#include <cstdlib> // Include <cstdlib> for rand()
using namespace std;

__global__
void add(int* A, int* B, int* C, int size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        C[tid] = A[tid] + B[tid];
    }
}

void initialize(int* vector, int size) {
    for (int i = 0; i < size; i++) {
        vector[i] = rand() % 10;
    }
}

void print(int* vector, int size) {
    for (int i = 0; i < size; i++) {
        cout << vector[i] << " ";
    }
    cout << endl;
}

int main() {
    int N = 4;
    int* A, * B, * C;
    int vectorSize = N;
    size_t vectorBytes = vectorSize * sizeof(int);

    // Allocate host memory
    A = new int[vectorSize];
    B = new int[vectorSize];
    C = new int[vectorSize];

    // Initialize host arrays
    initialize(A, vectorSize);
    initialize(B, vectorSize);
    cout << "Vector A: ";
    print(A, N);
    cout << "Vector B: ";
    print(B, N);

    int* X, * Y, * Z;
    // Allocate device memory
    cudaMalloc(&X, vectorBytes);
    cudaMalloc(&Y, vectorBytes);
    cudaMalloc(&Z, vectorBytes);

    // Check for CUDA memory allocation errors
    if (X == nullptr || Y == nullptr || Z == nullptr) {
        cerr << "CUDA memory allocation failed" << endl;
        return 1;
    }

    // Copy data from host to device
    cudaMemcpy(X, A, vectorBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(Y, B, vectorBytes, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Launch kernel
    add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, N);

    // Check for kernel launch errors
    cudaError_t kernelLaunchError = cudaGetLastError();
    if (kernelLaunchError != cudaSuccess) {
        cerr << "CUDA kernel launch failed: " << cudaGetErrorString(kernelLaunchError) << endl;
        return 1;
    }

    // Copy result from device to host
    cudaMemcpy(C, Z, vectorBytes, cudaMemcpyDeviceToHost);

    // Check for CUDA memcpy errors
    cudaError_t memcpyError = cudaGetLastError();
    if (memcpyError != cudaSuccess) {
        cerr << "CUDA memcpy failed: " << cudaGetErrorString(memcpyError) << endl;
        return 1;
    }

    cout << "Addition: ";
    print(C, N);

    // Free device memory
    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    // Free host memory
    delete[] A;
    delete[] B;
    delete[] C;

    return 0;
}

Overwriting add.cu


In [None]:
!nvcc add.cu -o add
!./add

Vector A: 3 6 7 5 
Vector B: 3 5 6 2 
CUDA memory allocation failed


In [None]:
%%writefile matrix_mult.cu
#include <iostream>
#include <cuda.h>
using namespace std;

#define BLOCK_SIZE 2

__global__ void gpuMM(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.f;
    for (int n = 0; n < N; ++n)
        sum += A[row * N + n] * B[n * N + col];
    C[row * N + col] = sum;
}

int main(int argc, char *argv[]) {
    int N;
    float K;

    // Perform matrix multiplication C = A*B
    // where A, B and C are NxN matrices
    // Restricted to matrices where N = K*BLOCK_SIZE;

    cout << "Enter a value for size/2 of matrix: ";
    cin >> K;
    K = 1;
    N = K * BLOCK_SIZE;
    cout << "\nExecuting Matrix Multiplication" << endl;
    cout << "Matrix size: " << N << "x" << N << endl;

    // Allocate memory on the host
    float *hA, *hB, *hC;
    hA = new float[N * N];
    hB = new float[N * N];
    hC = new float[N * N];

    // Initialize matrices on the host with random values
    srand(time(NULL)); // Seed the random number generator
    for (int j = 0; j < N; j++) {
        for (int i = 0; i < N; i++) {
            hA[j * N + i] = rand() % 10; // Generate random value between 0 and 9
            hB[j * N + i] = rand() % 10; // Generate random value between 0 and 9
        }
    }

    // Allocate memory on the device
    int size = N * N * sizeof(float);
    float *dA, *dB, *dC;
    cudaMalloc(&dA, size);
    cudaMalloc(&dB, size);
    cudaMalloc(&dC, size);

    dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(K, K);

    // Copy matrices from the host to device
    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);

    // Execute the matrix multiplication kernel
    gpuMM<<<grid, threadBlock>>>(dA, dB, dC, N);

    // Copy the GPU result back to CPU
    cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);

    // Display the result
    cout << "\nResultant matrix:\n";
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            cout << hC[row * N + col] << " ";
        }
        cout << endl;
    }

    // Free device memory
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);

    // Free host memory
    delete[] hA;
    delete[] hB;
    delete[] hC;

    cout << "Finished." << endl;
    return 0;
}

Writing matrix_mult.cu


In [None]:
!nvcc matrix_mult.cu -o matrix_mult
!./matrix_mult

Enter a value for size/2 of matrix: 6

Executing Matrix Multiplication
Matrix size: 2x2

Resultant matrix:
0 0 
0 0 
Finished.


In [None]:
%%writefile mergesort.cu
#include <iostream>
#include <cuda_runtime.h>

__device__ void merge(int* arr, int left, int mid, int right) {
    int i, j, k;
    int n1 = mid - left + 1;
    int n2 = right - mid;

    int* L = new int[n1];
    int* R = new int[n2];

    for (i = 0; i < n1; i++)
        L[i] = arr[left + i];
    for (j = 0; j < n2; j++)
        R[j] = arr[mid + 1 + j];

    i = 0;
    j = 0;
    k = left;
    while (i < n1 && j < n2) {
        if (L[i] <= R[j]) {
            arr[k] = L[i];
            i++;
        } else {
            arr[k] = R[j];
            j++;
        }
        k++;
    }

    while (i < n1) {
        arr[k] = L[i];
        i++;
        k++;
    }

    while (j < n2) {
        arr[k] = R[j];
        j++;
        k++;
    }

    delete[] L;
    delete[] R;
}

__global__ void mergeSort(int* arr, int left, int right) {
    if (left < right) {
        int mid = left + (right - left) / 2;
        mergeSort<<<1,1>>>(arr, left, mid);
        mergeSort<<<1,1>>>(arr, mid + 1, right);
        merge<<<1,1>>>(arr, left, mid, right);
    }
}

void mergeSortCUDA(int* arr, int size) {
    int* arr_dev;
    cudaMalloc((void**)&arr_dev, size * sizeof(int));
    cudaMemcpy(arr_dev, arr, size * sizeof(int), cudaMemcpyHostToDevice);
    mergeSort<<<1,1>>>(arr_dev, 0, size - 1);
    cudaMemcpy(arr, arr_dev, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(arr_dev);
}

int main() {
    const int size = 10;
    int arr[size] = {12, 11, 13, 5, 6, 7, 1, 3, 8, 9};

    mergeSortCUDA(arr, size);

    std::cout << "Sorted array: \n";
    for (int i = 0; i < size; i++)
        std::cout << arr[i] << " ";
    std::cout << std::endl;

    return 0;
}


Overwriting mergesort.cu


In [None]:
!nvcc mergesort.cu -o mergesort
!./mergesort

[01m[0m[01mmergesort.cu(52)[0m: [01;31merror[0m: a __device__ function call cannot be configured
          merge<<<1,1>>>(arr, left, mid, right);
          ^

1 error detected in the compilation of "mergesort.cu".
/bin/bash: line 1: ./mergesort: No such file or directory


In [23]:
%%writefile matrix_multi.cu
#include <iostream>
#include <cuda_runtime.h>

using namespace std;

const int N = 2;

__global__ void matrixMultiply(int* A, int* B, int* C) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < N && col < N) {
        int sum = 0;
        for (int i = 0; i < N; ++i) {
            sum += A[row * N + i] * B[i * N + col];
        }
        C[row * N + col] = sum;
    }
}

int main() {
    int* A, * B, * C;
    size_t matrixBytes = N * N * sizeof(int);

    A = new int[N * N];
    B = new int[N * N];
    C = new int[N * N];

    auto input = [&](int* matrix) {
        cout << "Enter elements of Matrix (" << N << "x" << N << "):" << endl;
        for (int i = 0; i < N * N; ++i) cin >> matrix[i];
    };

    input(A);
    input(B);

    int* X, * Y, * Z;
    cudaMalloc(&X, matrixBytes);
    cudaMalloc(&Y, matrixBytes);
    cudaMalloc(&Z, matrixBytes);

    cudaMemcpy(X, A, matrixBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(Y, B, matrixBytes, cudaMemcpyHostToDevice);

    matrixMultiply<<<1, dim3(N, N)>>>(X, Y, Z);

    cudaMemcpy(C, Z, matrixBytes, cudaMemcpyDeviceToHost);

    cout << "Output- Matrix size: " << N << "x" << N << endl;
    cout << "Input Matrix 1:" << endl;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) cout << A[i * N + j] << " ";
        cout << endl;
    }

    cout << "Input Matrix 2:" << endl;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) cout << B[i * N + j] << " ";
        cout << endl;
    }

    cout << "Resultant matrix:" << endl;
    for (int i = 0; i < N; ++i) {
        for (int j = 0; j < N; ++j) cout << C[i * N + j] << " ";
        cout << endl;
    }

    cout << "Finished." << endl;

    delete[] A;
    delete[] B;
    delete[] C;

    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    return 0;
}

Writing matrix_multi.cu


In [25]:
!nvcc matrix_multi.cu -o matrix_multi
!./matrix_multi

Enter elements of Matrix (2x2):
2 2
2 2
Enter elements of Matrix (2x2):
4 4
4 4
Output- Matrix size: 2x2
Input Matrix 1:
2 2 
2 2 
Input Matrix 2:
4 4 
4 4 
Resultant matrix:
0 0 
0 0 
Finished.


In [26]:
%%writefile bubblesort.cu
#include <iostream>
#include <vector>
#include <chrono>

__global__ void bubbleSortParallel(int* arr, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n - 1) {
        if (arr[idx] > arr[idx + 1]) {
            int temp = arr[idx];
            arr[idx] = arr[idx + 1];
            arr[idx + 1] = temp;
        }
    }
}

void bubbleSortSerial(std::vector<int>& arr) {
    int n = arr.size();
    bool swapped = true;
    while (swapped) {
        swapped = false;
        for (int i = 0; i < n - 1; i++) {
            if (arr[i] > arr[i + 1]) {
                std::swap(arr[i], arr[i + 1]);
                swapped = true;
            }
        }
    }
}

int main() {
    int n = 10000;
    int block_size = 256;
    int num_blocks = (n + block_size - 1) / block_size;

    std::vector<int> arr(n);

    // Initialize array with random values
    for (int i = 0; i < n; i++) {
        arr[i] = rand() % 10000;
    }

    // Measure serial Bubble Sort performance
    auto start = std::chrono::high_resolution_clock::now();
    bubbleSortSerial(arr);
    auto stop = std::chrono::high_resolution_clock::now();
    auto durationSerial = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);

    std::cout << "Serial Bubble Sort took " << durationSerial.count() << " milliseconds." << std::endl;

    // Reset array for parallel sort
    for (int i = 0; i < n; i++) {
        arr[i] = rand() % 10000;
    }

    int* d_arr;
    cudaMalloc(&d_arr, n * sizeof(int));
    cudaMemcpy(d_arr, arr.data(), n * sizeof(int), cudaMemcpyHostToDevice);

    // Measure parallel Bubble Sort performance
    start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < n; i++) {
        bubbleSortParallel<<<num_blocks, block_size>>>(d_arr, n);
        cudaDeviceSynchronize();
    }
    stop = std::chrono::high_resolution_clock::now();
    auto durationParallel = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);

    std::cout << "Parallel Bubble Sort took " << durationParallel.count() << " milliseconds." << std::endl;

    cudaMemcpy(arr.data(), d_arr, n * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_arr);

    return 0;
}

Writing bubblesort.cu


In [27]:
!nvcc bubblesort.cu -o bubblesort
!./bubblesort

Serial Bubble Sort took 1536 milliseconds.
Parallel Bubble Sort took 1 milliseconds.


In [28]:
%%writefile mergesort.cu
#include <iostream>
#include <vector>
#include <chrono>

// Serial merge sort implementation
void merge(int* arr, int l, int m, int r) {
    // Merge logic
}

void mergeSort(int* arr, int l, int r) {
    // Merge sort logic
}

// Parallel merge sort implementation
__global__ void mergeSortParallel(int* arr, int l, int r) {
    // Merge sort logic
}

int main() {
    int n = 10000;
    int block_size = 256;
    int num_blocks = (n + block_size - 1) / block_size;

    std::vector<int> arr_serial(n);
    std::vector<int> arr_parallel(n);

    // Initialize arrays with random values
    // Copy values from arr_serial to arr_parallel for comparison

    // Serial merge sort
    auto start_serial = std::chrono::high_resolution_clock::now();
    mergeSort(arr_serial.data(), 0, n - 1);
    auto end_serial = std::chrono::high_resolution_clock::now();

    // Parallel merge sort
    int* d_arr;
    cudaMalloc(&d_arr, n * sizeof(int));
    cudaMemcpy(d_arr, arr_parallel.data(), n * sizeof(int), cudaMemcpyHostToDevice);

    auto start_parallel = std::chrono::high_resolution_clock::now();
    mergeSortParallel<<<num_blocks, block_size>>>(d_arr, 0, n - 1);
    cudaDeviceSynchronize();
    auto end_parallel = std::chrono::high_resolution_clock::now();

    cudaMemcpy(arr_parallel.data(), d_arr, n * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_arr);

    // Print timing information
    std::chrono::duration<double, std::milli> duration_serial = end_serial - start_serial;
    std::cout << "Serial Merge Sort took " << duration_serial.count() << " milliseconds." << std::endl;

    std::chrono::duration<double, std::milli> duration_parallel = end_parallel - start_parallel;
    std::cout << "Parallel Merge Sort took " << duration_parallel.count() << " milliseconds." << std::endl;

    return 0;
}

Overwriting mergesort.cu


In [30]:
!nvcc mergesort.cu -o mergesort
!./mergesort

Serial Merge Sort took 0.000175 milliseconds.
Parallel Merge Sort took 0.007277 milliseconds.
