In [17]:
# Load the extension that allows us to compile CUDA code in python notebooks
# Documentation is here: https://nvcc4jupyter.readthedocs.io/en/latest/
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc4jupyter



The nvcc4jupyter extension is already loaded. To reload it, use:Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to c:\users\a1742\appdata\local\temp\pip-req-build-3mbbzg0h
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 28f872a2f99a1b201bcd0db14fdbc5a496b9bfd7
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'

  %reload_ext nvcc4jupyter


  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git 'C:\Users\A1742\AppData\Local\Temp\pip-req-build-3mbbzg0h'

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\A1742\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [44]:
%%cuda_group_save -g "knn" -n "main.cu"

// Required header files / 所需的头文件
#include <iostream>     // For input/output operations / 用于输入输出操作
#include <fstream>      // For file operations / 用于文件操作
#include <vector>       // For vector container / 用于向量容器
#include <string>       // For string operations / 用于字符串操作
#include <cstring>      // For C-style string operations / 用于C风格字符串操作
#include <algorithm>    // For algorithms like max_element / 用于算法如max_element
#include <cuda_runtime.h> // For CUDA operations / 用于CUDA操作

// Constants definition / 常量定义
#define THREADS 256        // Number of threads per block / 每个块的线程数
#define IMAGESIZE 784      // Image size (28x28 = 784 pixels) / 图像大小 (28x28 = 784像素)

// Function to handle big-endian to little-endian conversion
// 处理大端序转小端序的函数
uint32_t swap32(uint32_t val) {
    val = ((val << 8) & 0xFF00FF00) | ((val >> 8) & 0xFF00FF);
    return (val << 16) | (val >> 16);
}

// Structure to store training/testing samples
// 存储训练/测试样本的结构体
struct TrainingSample {
    int label;                  // The digit (0-9) / 数字标签 (0-9)
    float image[IMAGESIZE];     // Normalized pixel values / 归一化的像素值
};

// CUDA kernel for bitonic sort step
// 双调排序步骤的CUDA核函数
__global__ void bitonicSortStep(float* d_distances, int* d_labels, int step, int stage, int num_samples) {
    // Calculate global thread ID / 计算全局线程ID
    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= num_samples) return;  // Boundary check / 边界检查

    // Calculate partner index for comparison / 计算比较伙伴的索引
    unsigned int partner = i ^ stage;
    
    // Only perform swap if partner is valid / 仅在伙伴有效时执行交换
    if (partner > i && partner < num_samples) {
        if ((i & step) == 0) {  // Determine sort direction / 确定排序方向
            if (d_distances[i] > d_distances[partner]) {
                // Swap distances / 交换距离
                float temp_dist = d_distances[i];
                d_distances[i] = d_distances[partner];
                d_distances[partner] = temp_dist;
                
                // Swap corresponding labels / 交换对应的标签
                int temp_label = d_labels[i];
                d_labels[i] = d_labels[partner];
                d_labels[partner] = temp_label;
            }
        }
    }
}

// Host function to manage bitonic sort
// 管理双调排序的主机端函数
void bitonicSort(float* d_distances, int* d_labels, int num_samples) {
    // Calculate next power of 2 / 计算大于等于样本数的最小2的幂
    int pow2_size = 1;
    while (pow2_size < num_samples) pow2_size *= 2;
    
    // Set up grid and block dimensions / 设置网格和块的维度
    dim3 block(THREADS);
    dim3 grid((num_samples + block.x - 1) / block.x);

    // Main sorting loops / 主排序循环
    for (int step = 2; step <= pow2_size; step *= 2) {
        for (int stage = step/2; stage > 0; stage /= 2) {
            bitonicSortStep<<<grid, block>>>(d_distances, d_labels, step, stage, num_samples);
            cudaDeviceSynchronize();  // Wait for kernel completion / 等待内核完成
        }
    }
}

// CUDA kernel for computing Euclidean distances
// 计算欧几里得距离的CUDA核函数
__global__ void computeEuclideanDistances(float* d_images, float* d_testImage, 
                                        float* d_distances, int* d_labels,
                                        int* d_train_labels, int num_samples) {
    // Calculate global thread ID / 计算全局线程ID
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < num_samples) {
        float sum = 0.0f;
        // Calculate squared Euclidean distance / 计算欧几里得距离的平方
        for (int i = 0; i < IMAGESIZE; ++i) {
            float diff = d_images[idx * IMAGESIZE + i] - d_testImage[i];
            sum += diff * diff;
        }
        d_distances[idx] = sqrtf(sum);  // Store the distance / 存储距离
        d_labels[idx] = d_train_labels[idx];  // Store the label / 存储标签
    }
}

// Function to load MNIST dataset in IDX format
// 加载IDX格式MNIST数据集的函数
bool loadMNISTImages(const std::string& image_path, const std::string& label_path, 
                    std::vector<TrainingSample>& samples) {
    // Open image file / 打开图像文件
    std::ifstream image_file(image_path, std::ios::binary);
    if (!image_file) {
        std::cerr << "Cannot open image file: " << image_path << std::endl;
        return false;
    }

    // Open label file / 打开标签文件
    std::ifstream label_file(label_path, std::ios::binary);
    if (!label_file) {
        std::cerr << "Cannot open label file: " << label_path << std::endl;
        return false;
    }

    // Read image file header / 读取图像文件头
    uint32_t magic, num_items, num_rows, num_cols;
    image_file.read(reinterpret_cast<char*>(&magic), sizeof(magic));
    image_file.read(reinterpret_cast<char*>(&num_items), sizeof(num_items));
    image_file.read(reinterpret_cast<char*>(&num_rows), sizeof(num_rows));
    image_file.read(reinterpret_cast<char*>(&num_cols), sizeof(num_cols));

    // Convert from big-endian to host endian / 从大端序转换为主机字节序
    magic = swap32(magic);
    num_items = swap32(num_items);
    num_rows = swap32(num_rows);
    num_cols = swap32(num_cols);

    // Verify image file format / 验证图像文件格式
    if (magic != 0x803) {
        std::cerr << "Invalid image file format" << std::endl;
        return false;
    }

    // Read label file header / 读取标签文件头
    uint32_t label_magic, num_labels;
    label_file.read(reinterpret_cast<char*>(&label_magic), sizeof(label_magic));
    label_file.read(reinterpret_cast<char*>(&num_labels), sizeof(num_labels));

    // Convert label file header / 转换标签文件头
    label_magic = swap32(label_magic);
    num_labels = swap32(num_labels);

    // Verify label file format / 验证标签文件格式
    if (label_magic != 0x801) {
        std::cerr << "Invalid label file format" << std::endl;
        return false;
    }

    // Check consistency between images and labels / 检查图像和标签数量是否一致
    if (num_items != num_labels) {
        std::cerr << "Number of images doesn't match number of labels" << std::endl;
        return false;
    }

    // Prepare storage / 准备存储空间
    samples.resize(num_items);
    std::vector<unsigned char> pixels(num_rows * num_cols);

    // Read and process each sample / 读取并处理每个样本
    for (uint32_t i = 0; i < num_items; ++i) {
        // Read label / 读取标签
        unsigned char label;
        label_file.read(reinterpret_cast<char*>(&label), 1);
        samples[i].label = static_cast<int>(label);

        // Read image / 读取图像
        image_file.read(reinterpret_cast<char*>(pixels.data()), pixels.size());
        
        // Normalize pixel values to [0,1] / 将像素值归一化到[0,1]范围
        for (size_t j = 0; j < pixels.size(); ++j) {
            samples[i].image[j] = static_cast<float>(pixels[j]) / 255.0f;
        }

        // Show progress / 显示进度
        if (i % 1000 == 0) {
            std::cout << "\rLoading data: " << (i * 100.0f / num_items) << "%" << std::flush;
        }
    }
    std::cout << "\rLoading data: 100%" << std::endl;

    return true;
}

int main() {
    // Declare containers for samples / 声明样本容器
    std::vector<TrainingSample> train_samples;
    std::vector<TrainingSample> test_samples;

    // Load training data / 加载训练数据
    if (!loadMNISTImages("./train_mnist/MNIST/raw/train-images-idx3-ubyte",
                        "./train_mnist/MNIST/raw/train-labels-idx1-ubyte",
                        train_samples)) {
        return -1;
    }
    std::cout << "Successfully loaded " << train_samples.size() << " training samples." << std::endl;

    // Load testing data / 加载测试数据
    if (!loadMNISTImages("./test_mnist/MNIST/raw/t10k-images-idx3-ubyte",
                        "./test_mnist/MNIST/raw/t10k-labels-idx1-ubyte",
                        test_samples)) {
        return -1;
    }
    std::cout << "Successfully loaded " << test_samples.size() << " testing samples." << std::endl;

    int num_trainsamples = train_samples.size();
    int num_testsamples = test_samples.size();

    // Allocate host memory for training data / 为训练数据分配主机内存
    float* h_train_images = new float[num_trainsamples * IMAGESIZE];
    int* h_train_labels = new int[num_trainsamples];

    for (int i = 0; i < num_trainsamples; ++i) {
        h_train_labels[i] = train_samples[i].label;
        std::memcpy(&h_train_images[i * IMAGESIZE], train_samples[i].image, sizeof(float) * IMAGESIZE);
    }

    // Allocate GPU memory / 分配GPU内存
    float* d_train_images;
    int* d_train_labels;
    cudaMalloc(&d_train_images, num_trainsamples * IMAGESIZE * sizeof(float));
    cudaMalloc(&d_train_labels, num_trainsamples * sizeof(int));

    // Copy training data to GPU / 将训练数据复制到GPU
    cudaMemcpy(d_train_images, h_train_images, num_trainsamples * IMAGESIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_train_labels, h_train_labels, num_trainsamples * sizeof(int), cudaMemcpyHostToDevice);

    // KNN parameters / KNN参数
    int k = 10;  // Number of neighbors / 近邻数量
    int correct_predictions = 0;

    // Process each test sample / 处理每个测试样本
    for (int t = 0; t < num_testsamples; ++t) {
        // Show progress / 显示进度
        if (t % 100 == 0) {
            std::cout << "\rProcessing test samples: " << (t * 100.0f / num_testsamples) << "%" << std::flush;
        }

        int test_label = test_samples[t].label;
        
        // Allocate and copy test image to GPU / 分配并复制测试图像到GPU
        float* d_test_image;
        cudaMalloc(&d_test_image, IMAGESIZE * sizeof(float));
        cudaMemcpy(d_test_image, test_samples[t].image, IMAGESIZE * sizeof(float), cudaMemcpyHostToDevice);

        // Allocate memory for distances and labels / 为距离和标签分配内存
        float* d_distances;
        int* d_sort_labels;
        cudaMalloc(&d_distances, num_trainsamples * sizeof(float));
        cudaMalloc(&d_sort_labels, num_trainsamples * sizeof(int));

        // Configure kernel parameters / 配置内核参数
        int threadsPerBlock = 256;
        int blocksPerGrid = (num_trainsamples + threadsPerBlock - 1) / threadsPerBlock;
        
        // Compute distances / 计算距离
        computeEuclideanDistances<<<blocksPerGrid, threadsPerBlock>>>(
            d_train_images, d_test_image, d_distances, d_sort_labels, d_train_labels, num_trainsamples
        );

        // Sort distances / 排序距离
        bitonicSort(d_distances, d_sort_labels, num_trainsamples);

        // Allocate and copy k-nearest results / 分配并复制k个最近邻结果
        float* h_distances = new float[k];
        int* h_labels = new int[k];
        cudaMemcpy(h_distances, d_distances, k * sizeof(float), cudaMemcpyDeviceToHost);
        cudaMemcpy(h_labels, d_sort_labels, k * sizeof(int), cudaMemcpyDeviceToHost);

        // Count label frequencies / 统计标签频率
        std::vector<int> labelCount(10, 0);
        for (int i = 0; i < k; ++i) {
            labelCount[h_labels[i]]++;
        }

        // Find most common label / 找出最常见的标签
        int predictedLabel = std::distance(labelCount.begin(), 
                                         std::max_element(labelCount.begin(), labelCount.end()));

        // Update accuracy counter / 更新正确计数
        if (predictedLabel == test_label) {
            correct_predictions++;
        }

        // Print periodic updates / 定期打印更新
        if (t % 1000 == 0) {
            std::cout << "\nTest Sample " << t << ": Actual = " << test_label 
                     << ", Predicted = " << predictedLabel << std::endl;
        }

        // Free temporary memory / 释放临时内存
        delete[] h_distances;
        delete[] h_labels;
        cudaFree(d_distances);
        cudaFree(d_sort_labels);
        cudaFree(d_test_image);
    }
    
    // Print final progress / 打印最终进度
    std::cout << "\rProcessing test samples: 100%" << std::endl;

    // Calculate and display final results / 计算并显示最终结果
    float accuracy = (float)correct_predictions / num_testsamples * 100.0f;
    std::cout << "\nFinal Results:" << std::endl;
    std::cout << "Total test samples: " << num_testsamples << std::endl;
    std::cout << "Correct predictions: " << correct_predictions << std::endl;
    std::cout << "Accuracy: " << accuracy << "%" << std::endl;

    // Free all allocated memory / 释放所有分配的内存
    delete[] h_train_images;
    delete[] h_train_labels;
    cudaFree(d_train_images);
    cudaFree(d_train_labels);

    return 0;
}

In [45]:
%cuda_group_run --group "knn" --compiler-args "-O3 -g -std=c++20 -arch=sm_75"

Loading data: 100%333%
Successfully loaded 60000 training samples.
Loading data: 100%
Successfully loaded 10000 testing samples.
Processing test samples: 0%
Test Sample 0: Actual = 7, Predicted = 7
Processing test samples: 10%
Test Sample 1000: Actual = 9, Predicted = 9
Processing test samples: 20%
Test Sample 2000: Actual = 6, Predicted = 6
Processing test samples: 30%
Test Sample 3000: Actual = 6, Predicted = 6
Processing test samples: 40%
Test Sample 4000: Actual = 9, Predicted = 9
Processing test samples: 50%
Test Sample 5000: Actual = 3, Predicted = 3
Processing test samples: 60%
Test Sample 6000: Actual = 9, Predicted = 9
Processing test samples: 70%
Test Sample 7000: Actual = 1, Predicted = 1
Processing test samples: 80%
Test Sample 8000: Actual = 4, Predicted = 4
Processing test samples: 90%
Test Sample 9000: Actual = 7, Predicted = 7
Processing test samples: 100%

Final Results:
Total test samples: 10000
Correct predictions: 9031
Accuracy: 90.31%

