In [3]:
# Load the extension that allows us to compile CUDA code in python notebooks
# Documentation is here: https://nvcc4jupyter.readthedocs.io/en/latest/
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter



Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to c:\users\a1742\appdata\local\temp\pip-req-build-u2s49egt
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter


  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git 'C:\Users\A1742\AppData\Local\Temp\pip-req-build-u2s49egt'

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\A1742\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [4]:
# from google.colab import drive
# drive.mount('/content/drive/', force_remount=True)



In [12]:
%%cuda_group_save -g "parseImage" -n "main.cu"

#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <cstring>
#include <cuda_runtime.h>
#include <algorithm>


#define THREADS 32


#define IMAGESIZE 784  // Floats per flattened image (presumably 28 x 28 MNIST pixels - confirm); must precede TrainingSample

// One labelled image record. loadData() fills `label` and `image` with two
// separate reads, so the on-disk record is an int followed by IMAGESIZE floats.
struct TrainingSample {
    int label;                  // Class label; the author's note says digits 0-9 (not enforced by this struct)
    float image[IMAGESIZE];    // Flattened image data, IMAGESIZE consecutive floats
};

// Computes the Euclidean (L2) distance between one query image and every
// training image.
//
// Launch layout: 1D grid of 1D blocks. Each thread whose global index is below
// num_samples handles exactly one training image, so the launch must supply at
// least num_samples threads; surplus threads exit immediately. No shared
// memory is used.
//
// Parameters (all pointers must be dereferenceable from device code):
//   d_images    - training images, indexed as d_images[sample * IMAGESIZE + pixel]
//   d_testImage - the query image, IMAGESIZE floats
//   d_distances - output; d_distances[sample] receives that sample's distance
//   num_samples - number of training images, and of entries written to d_distances
//
// NOTE(review): neighbouring threads read addresses IMAGESIZE floats apart, so
// loads from d_images are strided across a warp rather than contiguous. A
// pixel-major layout would change that, but it would also change the layout
// every caller has to provide, so it is left as-is here.
__global__ void computeEuclideanDistances(const float* d_images, const float* d_testImage, float* d_distances, int num_samples) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_samples) {
        return;
    }

    // Widen before multiplying: with IMAGESIZE = 784, idx * IMAGESIZE no longer
    // fits in a 32-bit int once idx passes roughly 2.7 million, which would
    // turn into an out-of-bounds read.
    const float* image = d_images + static_cast<size_t>(idx) * IMAGESIZE;

    // Accumulate in pixel order so the result matches a straightforward
    // sequential sum of squared differences.
    float sum = 0.0f;
    for (int i = 0; i < IMAGESIZE; ++i) {
        const float diff = image[i] - d_testImage[i];
        sum += diff * diff;
    }
    d_distances[idx] = sqrtf(sum);
}

// Loads a binary sample file into `samples`.
//
// File layout as consumed here (raw bytes in the host's own int/float
// representation): one int holding the sample count, followed by that many
// records of { int label; float image[IMAGESIZE]; }.
//
// Parameters:
//   filename - path of the file to read
//   samples  - replaced with the loaded records on success; left untouched on
//              failure so callers never see a half-filled data set
// Returns:
//   true on success; on any failure a diagnostic is written to stderr and
//   false is returned.
bool loadData(const std::string& filename, std::vector<TrainingSample>& samples) {
    std::ifstream file(filename, std::ios::binary);
    if (!file) {
        std::cerr << "Unable to open file " << filename << std::endl;
        return false;
    }

    // Read number of samples
    int num_samples = 0;
    file.read(reinterpret_cast<char*>(&num_samples), sizeof(int));
    if (!file) {
        std::cerr << "Error reading number of samples." << std::endl;
        return false;
    }

    // A negative count (corrupt or foreign file) would convert to an enormous
    // unsigned size when sizing the vector and throw instead of failing cleanly.
    if (num_samples < 0) {
        std::cerr << "Invalid number of samples (" << num_samples << ") in " << filename << std::endl;
        return false;
    }

    // Read into a local vector and publish it only once every record is in.
    std::vector<TrainingSample> loaded(static_cast<size_t>(num_samples));

    for (int i = 0; i < num_samples; ++i) {
        TrainingSample& sample = loaded[i];

        // Read label
        file.read(reinterpret_cast<char*>(&sample.label), sizeof(int));
        if (!file) {
            std::cerr << "Error reading label for sample " << i << std::endl;
            return false;
        }

        // Read image data
        file.read(reinterpret_cast<char*>(sample.image), sizeof(float) * IMAGESIZE);
        if (!file) {
            std::cerr << "Error reading image data for sample " << i << std::endl;
            return false;
        }
    }

    samples.swap(loaded);
    return true;  // the ifstream closes itself when it goes out of scope
}




namespace {

// Owns a single cudaMalloc allocation and frees it when it goes out of scope,
// so every early return in main() releases device memory without a
// hand-maintained cleanup list.
template <typename T>
class DeviceBuffer {
public:
    DeviceBuffer() = default;
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;
    ~DeviceBuffer() { cudaFree(ptr_); }  // cudaFree on a null pointer performs no operation

    // Allocates room for `count` elements of T and returns the cudaMalloc
    // status. Intended to be called once per buffer.
    cudaError_t allocate(size_t count) {
        void* raw = nullptr;
        const cudaError_t status = cudaMalloc(&raw, count * sizeof(T));
        if (status == cudaSuccess) {
            ptr_ = static_cast<T*>(raw);
        }
        return status;
    }

    T* get() const { return ptr_; }

private:
    T* ptr_ = nullptr;
};

// Returns true when `status` is cudaSuccess; otherwise writes `what` followed
// by the CUDA error text to stderr and returns false.
bool cudaSucceeded(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

}  // namespace

// Classifies every test image with k-nearest-neighbours against the training
// set: distances are computed on the GPU, neighbour selection and voting run
// on the host. Prints one line per test sample plus the overall accuracy.
// Returns 0 on success and -1 on any load, validation or CUDA failure.
int main() {
    std::vector<TrainingSample> train_samples;
    std::vector<TrainingSample> test_samples;

    // Load training data
    if (!loadData("./train_mnist.bin", train_samples)) {
        return -1;  // Exit if data loading fails
    }
    std::cout << "Successfully loaded " << train_samples.size() << " training samples." << std::endl;

    // Load testing data
    if (!loadData("./test_mnist.bin", test_samples)) {
        return -1;  // Exit if data loading fails
    }
    std::cout << "Successfully loaded " << test_samples.size() << " testing samples." << std::endl;

    const int num_trainsamples = static_cast<int>(train_samples.size());
    const int num_testsamples = static_cast<int>(test_samples.size());

    const int numClasses = 10;        // Labels are digits 0-9
    const int requestedNeighbors = 10;
    const int threadsPerBlock = 256;

    if (num_trainsamples == 0) {
        std::cerr << "No training samples loaded; nothing to classify against." << std::endl;
        return -1;
    }

    // Labels index the vote histogram below, so reject anything outside
    // [0, numClasses) up front instead of writing out of bounds later.
    for (int i = 0; i < num_trainsamples; ++i) {
        const int label = train_samples[i].label;
        if (label < 0 || label >= numClasses) {
            std::cerr << "Training sample " << i << " has out-of-range label " << label << std::endl;
            return -1;
        }
    }

    // Never ask for more neighbours than there are training samples.
    const int k = std::min(requestedNeighbors, num_trainsamples);

    // Device buffers are allocated once and reused for every test sample.
    // Labels stay on the host: the kernel only needs image data and the vote
    // is taken on the CPU.
    DeviceBuffer<float> d_train_images;
    DeviceBuffer<float> d_test_image;
    DeviceBuffer<float> d_distances;

    // size_t arithmetic so large data sets cannot overflow a 32-bit int.
    const size_t train_floats = static_cast<size_t>(num_trainsamples) * IMAGESIZE;

    if (!cudaSucceeded(d_train_images.allocate(train_floats),
                       "Failed to allocate device memory for training images")) {
        return -1;
    }
    if (!cudaSucceeded(d_test_image.allocate(IMAGESIZE),
                       "Failed to allocate device memory for test image")) {
        return -1;
    }
    if (!cudaSucceeded(d_distances.allocate(static_cast<size_t>(num_trainsamples)),
                       "Failed to allocate device memory for distances")) {
        return -1;
    }

    {
        // Flatten the training images into one contiguous host array so they
        // can be uploaded with a single copy; the staging array is released
        // at the end of this scope.
        std::vector<float> h_train_images(train_floats);
        for (int i = 0; i < num_trainsamples; ++i) {
            std::memcpy(&h_train_images[static_cast<size_t>(i) * IMAGESIZE],
                        train_samples[i].image, sizeof(float) * IMAGESIZE);
        }
        if (!cudaSucceeded(cudaMemcpy(d_train_images.get(), h_train_images.data(),
                                      train_floats * sizeof(float), cudaMemcpyHostToDevice),
                           "Failed to copy training images to device")) {
            return -1;
        }
    }

    // Host-side scratch space, also reused across test samples.
    std::vector<float> h_distances(num_trainsamples);
    std::vector<std::pair<float, int>> distanceLabelPairs(num_trainsamples);
    std::vector<int> labelCount(numClasses);

    const int blocksPerGrid = (num_trainsamples + threadsPerBlock - 1) / threadsPerBlock;
    int correct_predictions = 0;

    for (int t = 0; t < num_testsamples; ++t) {
        const int test_label = test_samples[t].label;

        // Copy test image to device
        if (!cudaSucceeded(cudaMemcpy(d_test_image.get(), test_samples[t].image,
                                      IMAGESIZE * sizeof(float), cudaMemcpyHostToDevice),
                           "Failed to copy test image to device")) {
            return -1;
        }

        // Launch the kernel to compute Euclidean distances
        computeEuclideanDistances<<<blocksPerGrid, threadsPerBlock>>>(
            d_train_images.get(), d_test_image.get(), d_distances.get(), num_trainsamples);
        if (!cudaSucceeded(cudaGetLastError(), "Kernel launch failed")) {
            return -1;
        }

        // Blocking copy: it waits for the kernel to finish, so its status also
        // reports any error raised while the kernel was running.
        if (!cudaSucceeded(cudaMemcpy(h_distances.data(), d_distances.get(),
                                      static_cast<size_t>(num_trainsamples) * sizeof(float),
                                      cudaMemcpyDeviceToHost),
                           "Failed to copy distances to host")) {
            return -1;
        }

        for (int i = 0; i < num_trainsamples; ++i) {
            distanceLabelPairs[i] = std::make_pair(h_distances[i], train_samples[i].label);
        }

        // Only the k smallest distances are needed, in ascending order;
        // partial_sort yields the same leading k entries as a full sort.
        std::partial_sort(distanceLabelPairs.begin(), distanceLabelPairs.begin() + k,
                          distanceLabelPairs.end());

        // Count the frequency of labels among the k nearest neighbors
        std::fill(labelCount.begin(), labelCount.end(), 0);
        for (int i = 0; i < k; ++i) {
            labelCount[distanceLabelPairs[i].second]++;
        }

        // max_element returns the first maximum, so ties go to the lowest label.
        const int predictedLabel = static_cast<int>(
            std::distance(labelCount.begin(), std::max_element(labelCount.begin(), labelCount.end())));

        if (predictedLabel == test_label) {
            correct_predictions++;
        }

        std::cout << "Test Sample " << t << ": Actual Label = " << test_label << ", Predicted Label = " << predictedLabel << std::endl;
    }

    // Guard the division so an empty test set reports 0% instead of NaN.
    const float accuracy = num_testsamples > 0
        ? (float)correct_predictions / num_testsamples * 100.0f
        : 0.0f;
    std::cout << "Accuracy: " << accuracy << "%" << std::endl;

    return 0;  // DeviceBuffer destructors release the device memory here
}


In [13]:
%cuda_group_run --group "parseImage" --compiler-args "-O3 -g -std=c++20 -arch=sm_75"

Successfully loaded 1000 training samples.
Successfully loaded 10 testing samples.
Test Sample 0: Actual Label = 0, Predicted Label = 0
Test Sample 1: Actual Label = 7, Predicted Label = 7
Test Sample 2: Actual Label = 1, Predicted Label = 1
Test Sample 3: Actual Label = 1, Predicted Label = 1
Test Sample 4: Actual Label = 4, Predicted Label = 4
Test Sample 5: Actual Label = 9, Predicted Label = 9
Test Sample 6: Actual Label = 4, Predicted Label = 4
Test Sample 7: Actual Label = 3, Predicted Label = 3
Test Sample 8: Actual Label = 4, Predicted Label = 4
Test Sample 9: Actual Label = 8, Predicted Label = 8
Accuracy: 100%

