<a href="https://colab.research.google.com/github/Shradha1304/myrepo/blob/main/Practical_All.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Check CUDA version
!nvcc --version

# Install CUDA package
!pip install git+https://github.com/afnan47/cuda.git

# Load nvcc plugin
%load_ext nvcc_plugin

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-moeqs6ck
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-moeqs6ck
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4289 sha256=64403fbca50b2ecc8e23b11c6292c7583c5b87ededcb81ba3c97506ab6131921
  Stored in directory: /tmp/pip-ephem-wheel-cache-dkhabgmi/wheels/aa/f3/44/e10c1d226ec561d971fcd4b0463f6bff08602afa928a3e

In [None]:
%%writefile add.cu
#include <iostream>
#include <cstdlib> // Include <cstdlib> for rand()
using namespace std;

__global__
void add(int* A, int* B, int* C, int size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        C[tid] = A[tid] + B[tid];
    }
}

void initialize(int* vector, int size) {
    for (int i = 0; i < size; i++) {
        vector[i] = rand() % 10;
    }
}

void print(int* vector, int size) {
    for (int i = 0; i < size; i++) {
        cout << vector[i] << " ";
    }
    cout << endl;
}

int main() {
    int N = 4;
    int* A, * B, * C;
    int vectorSize = N;
    size_t vectorBytes = vectorSize * sizeof(int);

    // Allocate host memory
    A = new int[vectorSize];
    B = new int[vectorSize];
    C = new int[vectorSize];

    // Initialize host arrays
    initialize(A, vectorSize);
    initialize(B, vectorSize);
    cout << "Vector A: ";
    print(A, N);
    cout << "Vector B: ";
    print(B, N);

    int* X, * Y, * Z;
    // Allocate device memory
    cudaMalloc(&X, vectorBytes);
    cudaMalloc(&Y, vectorBytes);
    cudaMalloc(&Z, vectorBytes);

    // Check for CUDA memory allocation errors
    if (X == nullptr || Y == nullptr || Z == nullptr) {
        cerr << "CUDA memory allocation failed" << endl;
        return 1;
    }

    // Copy data from host to device
    cudaMemcpy(X, A, vectorBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(Y, B, vectorBytes, cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    // Launch kernel
    add<<<blocksPerGrid, threadsPerBlock>>>(X, Y, Z, N);

    // Check for kernel launch errors
    cudaError_t kernelLaunchError = cudaGetLastError();
    if (kernelLaunchError != cudaSuccess) {
        cerr << "CUDA kernel launch failed: " << cudaGetErrorString(kernelLaunchError) << endl;
        return 1;
    }

    // Copy result from device to host
    cudaMemcpy(C, Z, vectorBytes, cudaMemcpyDeviceToHost);

    // Check for CUDA memcpy errors
    cudaError_t memcpyError = cudaGetLastError();
    if (memcpyError != cudaSuccess) {
        cerr << "CUDA memcpy failed: " << cudaGetErrorString(memcpyError) << endl;
        return 1;
    }

    cout << "Addition: ";
    print(C, N);

    // Free device memory
    cudaFree(X);
    cudaFree(Y);
    cudaFree(Z);

    // Free host memory
    delete[] A;
    delete[] B;
    delete[] C;

    return 0;
}


Writing add.cu


In [None]:
!nvcc add.cu -o add
!./add

Vector A: 3 6 7 5 
Vector B: 3 5 6 2 
Addition: 6 11 13 7 


In [None]:
%%writefile matrix_mult.cu
#include <iostream>
#include <cuda.h>
using namespace std;

#define BLOCK_SIZE 2

__global__ void gpuMM(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.f;
    for (int n = 0; n < N; ++n)
        sum += A[row * N + n] * B[n * N + col];
    C[row * N + col] = sum;
}

int main(int argc, char *argv[]) {
    int N;
    float K;

    // Perform matrix multiplication C = A*B
    // where A, B and C are NxN matrices
    // Restricted to matrices where N = K*BLOCK_SIZE;

    cout << "Enter a value for size/2 of matrix: ";
    cin >> K;
    K = 1;
    N = K * BLOCK_SIZE;
    cout << "\nExecuting Matrix Multiplication" << endl;
    cout << "Matrix size: " << N << "x" << N << endl;

    // Allocate memory on the host
    float *hA, *hB, *hC;
    hA = new float[N * N];
    hB = new float[N * N];
    hC = new float[N * N];

    // Initialize matrices on the host with random values
    srand(time(NULL)); // Seed the random number generator
    for (int j = 0; j < N; j++) {
        for (int i = 0; i < N; i++) {
            hA[j * N + i] = rand() % 10; // Generate random value between 0 and 9
            hB[j * N + i] = rand() % 10; // Generate random value between 0 and 9
        }
    }

    // Allocate memory on the device
    int size = N * N * sizeof(float);
    float *dA, *dB, *dC;
    cudaMalloc(&dA, size);
    cudaMalloc(&dB, size);
    cudaMalloc(&dC, size);

    dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(K, K);

    // Copy matrices from the host to device
    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);

    // Execute the matrix multiplication kernel
    gpuMM<<<grid, threadBlock>>>(dA, dB, dC, N);

    // Copy the GPU result back to CPU
    cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);

    // Display the result
    cout << "\nResultant matrix:\n";
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            cout << hC[row * N + col] << " ";
        }
        cout << endl;
    }

    // Free device memory
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);

    // Free host memory
    delete[] hA;
    delete[] hB;
    delete[] hC;

    cout << "Finished." << endl;
    return 0;
}


Writing matrix_mult.cu


In [None]:
!nvcc matrix_mult.cu -o matrix_mult
!./matrix_mult

Enter a value for size/2 of matrix: 3

Executing Matrix Multiplication
Matrix size: 2x2

Resultant matrix:
39 9 
47 12 
Finished.


In [None]:
%%writefile sum.cu
#include <iostream>
#include <vector>
#include <climits>

__global__ void min_reduction_kernel(int* arr, int size, int* result) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        atomicMin(result, arr[tid]);
    }
}

__global__ void max_reduction_kernel(int* arr, int size, int* result) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        atomicMax(result, arr[tid]);
    }
}

__global__ void sum_reduction_kernel(int* arr, int size, int* result) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        atomicAdd(result, arr[tid]);
    }
}

__global__ void average_reduction_kernel(int* arr, int size, int* sum) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        atomicAdd(sum, arr[tid]);
    }
}

int main() {
    std::vector<int> arr = {5, 2, 9, 1, 7, 6, 8, 3, 4};
    int size = arr.size();
    int* d_arr;
    int* d_result;
    int result_min = INT_MAX;
    int result_max = INT_MIN;
    int result_sum = 0;

    // Allocate memory on the device
    cudaMalloc(&d_arr, size * sizeof(int));
    cudaMalloc(&d_result, sizeof(int));

    // Copy data from host to device
    cudaMemcpy(d_arr, arr.data(), size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_result, &result_min, sizeof(int), cudaMemcpyHostToDevice);

    // Perform min reduction
    min_reduction_kernel<<<(size + 255) / 256, 256>>>(d_arr, size, d_result);
    cudaMemcpy(&result_min, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Minimum value: " << result_min << std::endl;

    // Perform max reduction
    cudaMemcpy(d_result, &result_max, sizeof(int), cudaMemcpyHostToDevice);
    max_reduction_kernel<<<(size + 255) / 256, 256>>>(d_arr, size, d_result);
    cudaMemcpy(&result_max, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Maximum value: " << result_max << std::endl;

    // Perform sum reduction
    cudaMemcpy(d_result, &result_sum, sizeof(int), cudaMemcpyHostToDevice);
    sum_reduction_kernel<<<(size + 255) / 256, 256>>>(d_arr, size, d_result);
    cudaMemcpy(&result_sum, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Sum: " << result_sum << std::endl;

    // Perform average reduction
    cudaMemcpy(d_result, &result_sum, sizeof(int), cudaMemcpyHostToDevice);
    average_reduction_kernel<<<(size + 255) / 256, 256>>>(d_arr, size, d_result);
    cudaMemcpy(&result_sum, d_result, sizeof(int), cudaMemcpyDeviceToHost);
    std::cout << "Average: " << static_cast<double>(result_sum) / size << std::endl;

    // Free device memory
    cudaFree(d_arr);
    cudaFree(d_result);

    return 0;
}

Writing sum.cu


In [None]:
!nvcc sum.cu -o sum
!./sum

Minimum value: 1
Maximum value: 9
Sum: 45
Average: 10


In [None]:
%%writefile bu.cu
#include <iostream>
#include <vector>
#include <chrono>
using namespace std;

__device__ void device_swap(int& a, int& b) {
    int temp = a;
    a = b;
    b = temp;
}

__global__ void kernel_bubble_sort_odd_even(int* arr, int size) {
    bool isSorted = false;
    while (!isSorted) {
        isSorted = true;
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid % 2 == 0 && tid < size - 1) {
            if (arr[tid] > arr[tid + 1]) {
                device_swap(arr[tid], arr[tid + 1]);
                isSorted = false;
            }
        }
        __syncthreads(); // Synchronize threads within block

        if (tid % 2 != 0 && tid < size - 1) {
            if (arr[tid] > arr[tid + 1]) {
                device_swap(arr[tid], arr[tid + 1]);
                isSorted = false;
            }
        }
        __syncthreads(); // Synchronize threads within block
    }
}

void bubble_sort_odd_even(vector<int>& arr) {
    int size = arr.size();
    int* d_arr;
    cudaMalloc(&d_arr, size * sizeof(int));
    cudaMemcpy(d_arr, arr.data(), size * sizeof(int), cudaMemcpyHostToDevice);

    // Calculate grid and block dimensions
    int blockSize = 256;
    int gridSize = (size + blockSize - 1) / blockSize;

    // Perform bubble sort on GPU
    kernel_bubble_sort_odd_even<<<gridSize, blockSize>>>(d_arr, size);

    // Copy sorted array back to host
    cudaMemcpy(arr.data(), d_arr, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_arr);
}

int main() {
    vector<int> arr = {5, 2, 9, 1, 7, 6, 8, 3, 4};
    double start, end;

    // Measure performance of parallel bubble sort using odd-even transposition
    start = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    bubble_sort_odd_even(arr);
    end = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();

    cout << "Parallel bubble sort using odd-even transposition time: " << end - start << " milliseconds" << endl;
    return 0;
}


Writing bu.cu


In [None]:
!nvcc bu.cu -o bu
!./bu

Parallel bubble sort using odd-even transposition time: 149 milliseconds


In [None]:
%%writefile ms.cu
#include <iostream>
#include <vector>
#include <chrono>

using namespace std;

__device__ void merge(int* arr, int l, int m, int r) {
    int i, j, k;
    int n1 = m - l + 1;
    int n2 = r - m;
    int* L = new int[n1];
    int* R = new int[n2];
    for (i = 0; i < n1; i++) {
        L[i] = arr[l + i];
    }
    for (j = 0; j < n2; j++) {
        R[j] = arr[m + 1 + j];
    }
    i = 0;
    j = 0;
    k = l;
    while (i < n1 && j < n2) {
        if (L[i] <= R[j]) {
            arr[k++] = L[i++];
        } else {
            arr[k++] = R[j++];
        }
    }
    while (i < n1) {
        arr[k++] = L[i++];
    }
    while (j < n2) {
        arr[k++] = R[j++];
    }
    delete[] L;
    delete[] R;
}

__global__ void kernel_merge_sort(int* arr, int l, int r) {
    if (l < r) {
        int m = l + (r - l) / 2;
        kernel_merge_sort<<<1, 1>>>(arr, l, m);
        kernel_merge_sort<<<1, 1>>>(arr, m + 1, r);
        merge(arr, l, m, r);
    }
}

void parallel_merge_sort(vector<int>& arr) {
    int size = arr.size();
    int* d_arr;
    cudaMalloc(&d_arr, size * sizeof(int));
    cudaMemcpy(d_arr, arr.data(), size * sizeof(int), cudaMemcpyHostToDevice);
    kernel_merge_sort<<<1, 1>>>(d_arr, 0, size - 1);
    cudaMemcpy(arr.data(), d_arr, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_arr);
}

int main() {
    vector<int> arr = {5, 2, 9, 1, 7, 6, 8, 3, 4};
    double start, end;

    // Measure performance of parallel merge sort
    start = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();
    parallel_merge_sort(arr);
    end = chrono::duration_cast<chrono::milliseconds>(chrono::system_clock::now().time_since_epoch()).count();

    cout << "Parallel merge sort time: " << end - start << " milliseconds" << endl;

    return 0;
}


Writing ms.cu


In [None]:
!nvcc ms.cu -o ms
!./ms

[01m[0m[01mms.cu(42)[0m: [01;31merror[0m: kernel launch from __device__ or __global__ functions requires separate compilation mode
          kernel_merge_sort<<<1, 1>>>(arr, l, m);
          ^

[01m[0m[01mms.cu(43)[0m: [01;31merror[0m: kernel launch from __device__ or __global__ functions requires separate compilation mode
          kernel_merge_sort<<<1, 1>>>(arr, m + 1, r);
          ^

2 errors detected in the compilation of "ms.cu".
/bin/bash: line 1: ./ms: No such file or directory


In [2]:
%%writefile bfs.cu
#include <iostream>
#include <vector>
#include <queue>
#include <cuda_runtime.h>
#include <stdio.h>

#define MAX_NODES 1000
#define MAX_EDGES_PER_NODE 100

__global__ void BFS_kernel(int *adjacency_list, int *visited, int *level, int *queue, int *queue_size, int *new_queue_size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    if (tid < *queue_size) {
        int node = queue[tid];
        int start_edge = node * MAX_EDGES_PER_NODE;
        int end_edge = start_edge + MAX_EDGES_PER_NODE;

        for (int i = start_edge; i < end_edge; ++i) {
            int neighbor = adjacency_list[i];
            if (!visited[neighbor]) {
                visited[neighbor] = 1;
                level[neighbor] = level[node] + 1;
                int index = atomicAdd(new_queue_size, 1);
                queue[index] = neighbor;
            }
        }
    }
}

void BFS_parallel(int *adjacency_list, int *visited, int *level, int *queue, int source_node, int num_nodes) {
    int *d_adjacency_list, *d_visited, *d_level, *d_queue, *d_queue_size, *d_new_queue_size;
    int queue_size = 1;
    int new_queue_size = 0;

    cudaMalloc(&d_adjacency_list, num_nodes * MAX_EDGES_PER_NODE * sizeof(int));
    cudaMalloc(&d_visited, num_nodes * sizeof(int));
    cudaMalloc(&d_level, num_nodes * sizeof(int));
    cudaMalloc(&d_queue, num_nodes * sizeof(int));
    cudaMalloc(&d_queue_size, sizeof(int));
    cudaMalloc(&d_new_queue_size, sizeof(int));

    cudaMemcpy(d_adjacency_list, adjacency_list, num_nodes * MAX_EDGES_PER_NODE * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_visited, visited, num_nodes * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_level, level, num_nodes * sizeof(int), cudaMemcpyHostToDevice);

    cudaMemcpy(d_queue, &source_node, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_queue_size, &queue_size, sizeof(int), cudaMemcpyHostToDevice);

    while (queue_size > 0) {
        BFS_kernel<<<(queue_size + 255) / 256, 256>>>(d_adjacency_list, d_visited, d_level, d_queue, d_queue_size, d_new_queue_size);
        cudaDeviceSynchronize();

        cudaMemcpy(&new_queue_size, d_new_queue_size, sizeof(int), cudaMemcpyDeviceToHost);
        cudaMemcpy(&queue_size, d_queue_size, sizeof(int), cudaMemcpyDeviceToHost);

        cudaMemcpy(d_queue_size, &new_queue_size, sizeof(int), cudaMemcpyHostToDevice);
        cudaMemcpy(d_new_queue_size, &new_queue_size, sizeof(int), cudaMemcpyHostToDevice);
    }

    cudaMemcpy(visited, d_visited, num_nodes * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(level, d_level, num_nodes * sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(d_adjacency_list);
    cudaFree(d_visited);
    cudaFree(d_level);
    cudaFree(d_queue);
    cudaFree(d_queue_size);
    cudaFree(d_new_queue_size);
}

int main() {
    int adjacency_list[MAX_NODES * MAX_EDGES_PER_NODE];
    int visited[MAX_NODES] = {0};
    int level[MAX_NODES] = {0};
    int queue[MAX_NODES];
    int source_node = 0;
    int num_nodes, num_edges;

    // Populate adjacency list, num_nodes, and num_edges

    // Perform BFS
    BFS_parallel(adjacency_list, visited, level, queue, source_node, num_nodes);

    // Print results or do further processing

    return 0;
}


Writing bfs.cu


In [None]:
!nvcc bfs.cu -o bfs
!./bfs,

      BFS_parallel(adjacency_list, visited, level, queue, source_node, num_nodes);
                                                                       ^


      BFS_parallel(adjacency_list, visited, level, queue, source_node, num_nodes);
                                                                       ^


      int num_nodes, num_edges;
                     ^



In [None]:
%%writefile bfs.cu
#include <iostream>
#include <vector>
#include <queue>
#include <cuda_runtime.h>

using namespace std;

__global__ void bfs_kernel(int* adj_list, bool* visited, int* queue, int* queue_size, int num_vertices) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < *queue_size) {
        int curr_vertex = queue[tid];
        for (int i = adj_list[curr_vertex]; i < adj_list[curr_vertex + 1]; ++i) {
            int neighbour = adj_list[num_vertices + i];
            if (!visited[neighbour]) {
                visited[neighbour] = true;
                int index = atomicAdd(queue_size, 1);
                queue[index] = neighbour;
            }
        }
    }
}

int main() {
    int num_vertices, num_edges, source;
    cin >> num_vertices >> num_edges >> source;

    // Allocate memory for adjacency list and visited array on host
    vector<int> h_adj_list(num_vertices + 2); // Prefix sum for indices
    vector<int> h_edges(num_edges);
    vector<bool> h_visited(num_vertices + 1, false);

    // Input edges and build adjacency list
    for (int i = 0; i < num_edges; ++i) {
        int u, v;
        cin >> u >> v;
        h_edges[i] = v;
        ++h_adj_list[u + 1];
    }

    // Calculate prefix sum for indices
    for (int i = 2; i <= num_vertices + 1; ++i) {
        h_adj_list[i] += h_adj_list[i - 1];
    }

    // Copy adjacency list to GPU
    int* d_adj_list;
    cudaMalloc(&d_adj_list, (num_vertices + 2) * sizeof(int));
    cudaMemcpy(d_adj_list, h_adj_list.data(), (num_vertices + 2) * sizeof(int), cudaMemcpyHostToDevice);

    // Copy edges to GPU
    int* d_edges;
    cudaMalloc(&d_edges, num_edges * sizeof(int));
    cudaMemcpy(d_edges, h_edges.data(), num_edges * sizeof(int), cudaMemcpyHostToDevice);

    // Allocate memory for visited array and queue on GPU
    bool* d_visited;
    cudaMalloc(&d_visited, (num_vertices + 1) * sizeof(bool));
    cudaMemcpy(d_visited, h_visited.data(), (num_vertices + 1) * sizeof(bool), cudaMemcpyHostToDevice);

    int* d_queue;
    cudaMalloc(&d_queue, (num_vertices + 1) * sizeof(int));

    // Initialize queue with source vertex
    cudaMemcpy(d_queue, &source, sizeof(int), cudaMemcpyHostToDevice);
    int queue_size = 1;
    int* d_queue_size;
    cudaMalloc(&d_queue_size, sizeof(int));
    cudaMemcpy(d_queue_size, &queue_size, sizeof(int), cudaMemcpyHostToDevice);

    // Perform BFS on GPU
    while (queue_size > 0) {
        bfs_kernel<<<(queue_size + 255) / 256, 256>>>(d_adj_list, d_visited, d_queue, d_queue_size, num_vertices);
        cudaDeviceSynchronize();
        cudaMemcpy(&queue_size, d_queue_size, sizeof(int), cudaMemcpyDeviceToHost);
    }

    // Copy visited array from GPU to host and print visited vertices
    cudaMemcpy(h_visited.data(), d_visited, (num_vertices + 1) * sizeof(bool), cudaMemcpyDeviceToHost);
    cout << "Visited vertices: ";
    for (int i = 1; i <= num_vertices; ++i) {
        if (h_visited[i]) {
            cout << i << " ";
        }
    }
    cout << endl;

    // Free device memory
    cudaFree(d_adj_list);
    cudaFree(d_edges);
    cudaFree(d_visited);
    cudaFree(d_queue);
    cudaFree(d_queue_size);

    return 0;
}


In [None]:
!nvcc bfs.cu -o bfs
!./bfs

[01m[0m[01mbfs.cu(58)[0m: [01;31merror[0m: argument of type "void" is incompatible with parameter of type "const void *"
      cudaMemcpy(d_visited, h_visited.data(), (num_vertices + 1) * sizeof(bool), cudaMemcpyHostToDevice);
                            ^

[01m[0m[01mbfs.cu(78)[0m: [01;31merror[0m: argument of type "void" is incompatible with parameter of type "void *"
      cudaMemcpy(h_visited.data(), d_visited, (num_vertices + 1) * sizeof(bool), cudaMemcpyDeviceToHost);
                 ^

2 errors detected in the compilation of "bfs.cu".
/bin/bash: line 1: ./bfs: No such file or directory


In [None]:
%%writefile .cu
#include <iostream>
#include <omp.h>

void bubbleSort(int arr[], int n, int num_procs) {
    bool swapped = false;
    for (int i = 0; i < n - 1; ++i) {
        swapped = false;
        #pragma omp parallel for shared(arr, swapped) num_threads(num_procs)
        for (int j = 0; j < n - i - 1; ++j) {
            if (arr[j] > arr[j + 1]) {
                std::swap(arr[j], arr[j + 1]);
                swapped = true;
                #pragma omp critical
                {
                    std::cout << "Processor " << omp_get_thread_num() << " swapped elements at indices " << j << " and " << j + 1 << std::endl;
                }
            }
        }
        // Ensure synchronization between processors
        #pragma omp barrier
        if (!swapped) {
            break;
        }
    }
}

void sequentialBubbleSort(int arr[], int n) {
    bool swapped = false;
    for (int i = 0; i < n - 1; ++i) {
        swapped = false;
        for (int j = 0; j < n - i - 1; ++j) {
            if (arr[j] > arr[j + 1]) {
                std::swap(arr[j], arr[j + 1]);
                swapped = true;
            }
        }
        if (!swapped) {
            break;
        }
    }

}    // Sequential bubble sort


int main() {
    double start_time, end_time;
    int num_procs;
    std::cout << "Enter the number of processors: ";
    std::cin >> num_procs;

    int arr[] = {64, 34, 25, 12, 22, 11, 90};
    int n = sizeof(arr) / sizeof(arr[0]);
    start_time = omp_get_wtime();
    bubbleSort(arr, n, num_procs);
    end_time = omp_get_wtime();
    std::cout << "Parallel Bubble Sort took : " << end_time - start_time << " seconds.\n";

    int sequential_arr[n]={64, 34, 25, 12, 22, 11, 90};
    start_time = omp_get_wtime();
    sequentialBubbleSort(sequential_arr, n);
    end_time = omp_get_wtime();
    std::cout << "sBubble Sort took : " << end_time - start_time << " seconds.\n";

    std::cout << "Sorted array: \n";
    for (int i = 0; i < n; ++i) {
        std::cout << arr[i] << " ";
    }
    std::cout << std::endl;

    return 0;
}