In [1]:
TEST_DATA = ("Hello, World!"*10).encode('utf-8')
TEST_HASH = "ff812bf49826a5014695180486a8574ad40cb1de6ccc34effadbb895fb829451"

In [2]:
import hashlib

def sha256d(input_data: bytes) -> str:
    first_hash = hashlib.sha256(input_data).digest()
    
    second_hash = hashlib.sha256(first_hash).digest()
    
    return second_hash.hex()

if __name__ == "__main__":    
    hash_result = sha256d(TEST_DATA)
    
    print(f"SHA-256d correct: {TEST_HASH == hash_result}")
    print(hash_result)

SHA-256d correct: True
ff812bf49826a5014695180486a8574ad40cb1de6ccc34effadbb895fb829451


# 2 Cuda

In [3]:
# Define the CUDA kernel code as a string
cuda_code = """
#include <stdint.h>

__device__ __constant__ uint32_t k[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

__device__ uint32_t rotr(uint32_t x, uint32_t n) {
    return (x >> n) | (x << (32 - n));
}

__device__ void sha256_transform(uint32_t state[8], const uint8_t data[64]) {
    uint32_t a, b, c, d, e, f, g, h, i, j, t1, t2, m[64];

    #pragma unroll
    for (i = 0, j = 0; i < 16; ++i, j += 4)
        m[i] = (data[j] << 24) | (data[j + 1] << 16) | (data[j + 2] << 8) | (data[j + 3]);
    #pragma unroll
    for ( ; i < 64; ++i)
        m[i] = rotr(m[i - 2], 17) ^ rotr(m[i - 2], 19) ^ (m[i - 2] >> 10) + m[i - 7] + rotr(m[i - 15], 7) ^ rotr(m[i - 15], 18) ^ (m[i - 15] >> 3) + m[i - 16];

    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    f = state[5];
    g = state[6];
    h = state[7];

    #pragma unroll
    for (i = 0; i < 64; ++i) {
        t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25)) + ((e & f) ^ (~e & g)) + k[i] + m[i];
        t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22)) + ((a & b) ^ (a & c) ^ (b & c));
        h = g;
        g = f;
        f = e;
        e = d + t1;
        d = c;
        c = b;
        b = a;
        a = t1 + t2;
    }

    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    state[5] += f;
    state[6] += g;
    state[7] += h;
}

extern "C" {
    __global__ void sha256d_kernel(uint8_t *input, uint8_t *output, int length) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < length) {
            uint32_t state[8] = {
                0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
            };
            uint8_t hash1[64] = {0};
            uint8_t hash2[64] = {0};

            sha256_transform(state, input + idx * 64);
            for (int i = 0; i < 8; ++i) {
                hash1[i * 4] = (state[i] >> 24) & 0xff;
                hash1[i * 4 + 1] = (state[i] >> 16) & 0xff;
                hash1[i * 4 + 2] = (state[i] >> 8) & 0xff;
                hash1[i * 4 + 3] = state[i] & 0xff;
            }

            state[0] = 0x6a09e667;
            state[1] = 0xbb67ae85;
            state[2] = 0x3c6ef372;
            state[3] = 0xa54ff53a;
            state[4] = 0x510e527f;
            state[5] = 0x9b05688c;
            state[6] = 0x1f83d9ab;
            state[7] = 0x5be0cd19;

            sha256_transform(state, hash1);
            for (int i = 0; i < 8; ++i) {
                hash2[i * 4] = (state[i] >> 24) & 0xff;
                hash2[i * 4 + 1] = (state[i] >> 16) & 0xff;
                hash2[i * 4 + 2] = (state[i] >> 8) & 0xff;
                hash2[i * 4 + 3] = state[i] & 0xff;
            }

            for (int i = 0; i < 32; i++) {
                output[idx * 32 + i] = hash2[i];
            }
        }
    }
}
"""

# Write the CUDA code to a file
with open('sha256_kernel.cu', 'w') as file:
    file.write(cuda_code)

print("CUDA kernel code has been written to sha256_kernel.cu")

CUDA kernel code has been written to sha256_kernel.cu


In [4]:
# import pycuda.driver as cuda
# import pycuda.autoinit
# from pycuda.compiler import SourceModule
# import numpy as np
# import struct

# # Load and compile the CUDA kernel
# with open('sha256_kernel.cu', 'r') as f:
#     kernel_code = f.read()

# mod = SourceModule(kernel_code)
# sha256d_kernel = mod.get_function("sha256d_kernel")

# # Example input data
# input_data = TEST_DATA
# # input_data = b"Hello, World!"

# original_bit_length = len(input_data) * 8

# # Padding the input data to ensure its length is a multiple of 64 bytes
# pad_len = 64 - ((len(input_data) + 9) % 64)
# padded_input_data = input_data + b'\x80' + b'\x00' * pad_len + struct.pack('>Q', original_bit_length)

# # Prepare input and output buffers
# input_buffer = np.frombuffer(padded_input_data, dtype=np.uint8)
# output_buffer = np.zeros((len(input_buffer) // 64, 32), dtype=np.uint8)

# # Allocate device memory
# input_gpu = cuda.mem_alloc(input_buffer.nbytes)
# output_gpu = cuda.mem_alloc(output_buffer.nbytes)


# # input_buffer = np.frombuffer(padded_input_data, dtype=np.uint8)
# # output_buffer = np.zeros((len(input_buffer) // 64, 32), dtype=np.uint8)
# # input_buffer = np.frombuffer(input_data, dtype=np.uint8)
# # output_buffer = np.zeros((len(input_buffer) // 64, 32), dtype=np.uint8)
# # input_gpu = cuda.mem_alloc(input_buffer.nbytes)
# # output_gpu = cuda.mem_alloc(output_buffer.nbytes)

# # Copy data to device
# cuda.memcpy_htod(input_gpu, input_buffer)

# # Define block and grid sizes
# block_size = 256
# grid_size = (len(input_buffer) // 64 + block_size - 1) // block_size

# # Run the CUDA kernel
# sha256d_kernel(input_gpu, output_gpu, np.int32(len(input_buffer) // 64), block=(block_size, 1, 1), grid=(grid_size, 1))

# # Copy the result back to host
# cuda.memcpy_dtoh(output_buffer, output_gpu)

# # Print the SHA-256d hash result
# print("SHA-256d hash:", output_buffer.tobytes().hex())
# print(f"SHA-256d correct: {TEST_HASH} {output_buffer.tobytes().hex()}")

In [5]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import struct
import hashlib

# Load and compile the CUDA kernel
with open('sha256_kernel.cu', 'r') as f:
    kernel_code = f.read()

mod = SourceModule(kernel_code)
sha256d_kernel = mod.get_function("sha256d_kernel")

# Example input data
input_data = b"Hello, World!"

# Calculate the original length in bits
original_bit_length = len(input_data) * 8

# Padding the input data to ensure its length is a multiple of 64 bytes
pad_len = 64 - ((len(input_data) + 9) % 64)
padded_input_data = input_data + b'\x80' + b'\x00' * pad_len + struct.pack('>Q', original_bit_length)

# Prepare input and output buffers
input_buffer = np.frombuffer(padded_input_data, dtype=np.uint8)
output_buffer = np.zeros((len(input_buffer) // 64, 32), dtype=np.uint8)

# Allocate device memory
input_gpu = cuda.mem_alloc(input_buffer.nbytes)
output_gpu = cuda.mem_alloc(output_buffer.nbytes)

# Copy data to device
cuda.memcpy_htod(input_gpu, input_buffer)

# Define block and grid sizes
block_size = 256
grid_size = (len(input_buffer) // 64 + block_size - 1) // block_size

# Run the CUDA kernel
sha256d_kernel(input_gpu, output_gpu, np.int32(len(input_buffer) // 64), block=(block_size, 1, 1), grid=(grid_size, 1))

# Copy the result back to host
cuda.memcpy_dtoh(output_buffer, output_gpu)

# Print the SHA-256d hash result
cuda_result = output_buffer.tobytes().hex()
print("SHA-256d hash (CUDA):", cuda_result)

# Calculate SHA-256d using hashlib for verification
first_hash = hashlib.sha256(input_data).digest()
second_hash = hashlib.sha256(first_hash).hexdigest()

print(f"SHA-256d correct (hashlib): {second_hash}")
print(f"SHA-256d match: {cuda_result == second_hash}")

SHA-256d hash (CUDA): f5bab94b3a69b7c7464521353ce33035f8f12e62bdf1ffdcbb7db0ba1adbaa81
SHA-256d correct (hashlib): 042a7d64a581ef2ee983f21058801cc35663b705e6c55f62fa8e0f18ecc70989
SHA-256d match: False


# sha256 -> sha256d

In [21]:
# cuda_code = """

# """

# # Write the CUDA code to a file
# with open('sha256_kernel_v2.cu', 'w') as file:
#     file.write(cuda_code)

# print("CUDA kernel code has been written to sha256_kernel.cu")

In [24]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import struct
import hashlib

# Load and compile the CUDA kernel
with open('sha256_kernel_v2.cu', 'r') as f:
    kernel_code = f.read()

mod = SourceModule(kernel_code)
sha256d_kernel = mod.get_function("sha256d_kernel")

input_data = b"abc"

original_bit_length = len(input_data) * 8
pad_len = 64 - ((len(input_data) + 9) % 64)
padded_input_data = input_data + b'\x80' + b'\x00' * pad_len + struct.pack('>Q', original_bit_length)

input_values = [struct.unpack('>I', padded_input_data[i:i+4])[0] for i in range(0, len(padded_input_data), 4)]

assert len(input_values) == 16

output_buffer = np.zeros(8, dtype=np.uint32)  # SHA-256 produces 8 32-bit words


output_gpu = cuda.mem_alloc(output_buffer.nbytes)
input_values_np = np.array(input_values, dtype=np.uint32)

block_size = (1, 1, 1)
grid_size = (1, 1, 1)

sha256d_kernel(
    output_gpu,
    *input_values_np,
    block=block_size,
    grid=grid_size
)

cuda.memcpy_dtoh(output_buffer, output_gpu)

cuda_result = ''.join([f'{x:08x}' for x in output_buffer])
first_hash = hashlib.sha256(input_data).hexdigest()
print("SHA-256 hash (CUDA):", cuda_result)
print(f"SHA-256 correct (hashlib): {first_hash}")
print(f"SHA-256 match: {cuda_result == first_hash}")

first_hash_bytes = b''.join(struct.pack('>I', word) for word in output_buffer)
second_input_data = first_hash_bytes + b'\x80' + b'\x00' * (55 - len(first_hash_bytes)) + struct.pack('>Q', 256)

second_input_values = [struct.unpack('>I', second_input_data[i:i+4])[0] for i in range(0, len(second_input_data), 4)]

assert len(second_input_values) == 16

second_input_values_np = np.array(second_input_values, dtype=np.uint32)

sha256d_kernel(
    output_gpu,
    *second_input_values_np,
    block=block_size,
    grid=grid_size
)

cuda.memcpy_dtoh(output_buffer, output_gpu)

cuda_result = ''.join([f'{x:08x}' for x in output_buffer])
print("SHA-256d hash (CUDA):", cuda_result)

first_hash = hashlib.sha256(input_data).digest()
second_hash = hashlib.sha256(first_hash).hexdigest()

print(f"SHA-256d correct (hashlib): {second_hash}")
print(f"SHA-256d match: {cuda_result == second_hash}")

sha256 1 values: c0 1633837952 -> hout0 3128432319
sha256 1 values: c1 0 -> hout1 2399260650
sha256 1 values: c2 0 -> hout2 1094795486
sha256 1 values: c3 0 -> hout3SHA-256 hash (CUDA): ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad
SHA-256 correct (hashlib): ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad
SHA-256 match: True
 1571693091
sha256 1 values: c4 0 -> hout4 2953011619
sha256 1 values: c5 0 -> hout5 2518121116
sha256 1 values: c6 0 -> hout6 3021012833
sha256 1 values: c7 0 -> hout7 4060091821
sha256 1 values: c8 0
sha256 1 values: c9 0
sha256 1 values: c10 0
sha256 1 values: c11 0
sha256 1 values: c12 0
sha256 1 values: c13 0
sha256 1 values: c14 0
sha256 1 values: c15 24
sha256 1 values: c0 3128432319 -> hout0 1334526658
sha256 1 values: c1 2399260650 -> hout1 768832155
sha256 1 values: c2 1094795486 -> hout2 1369155318
sha256 1 values: c3 1571693091 -> hout3 2368579532
sha256 1 values: c4 2953011619 -> hout4 1529700461
sha256 1 values: c5 

In [27]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import struct
import hashlib

with open('sha256_kernel_v2.cu', 'r') as f:
    kernel_code = f.read()

mod = SourceModule(kernel_code)
sha256d_kernel = mod.get_function("sha256d_kernel")

input_data = b"abc"
original_bit_length = len(input_data) * 8
# pad_len = 64 - ((len(input_data) + 9) % 64)
pad_len = 64 - ((len(input_data) + 9) % 64)
padded_input_data = input_data + b'\x80' + b'\x00' * pad_len + struct.pack('>Q', original_bit_length)

input_values = [struct.unpack('>I', padded_input_data[i:i+4])[0] for i in range(0, len(padded_input_data), 4)]

assert len(input_values) == 16

output_buffer = np.zeros(8, dtype=np.uint32)  # SHA-256 produces 8 32-bit words

output_gpu = cuda.mem_alloc(output_buffer.nbytes)
input_values_np = np.array(input_values, dtype=np.uint32)
block_size = (1, 1, 1)
grid_size = (1, 1, 1)

sha256d_kernel(
    output_gpu,
    *input_values_np,
    block=block_size,
    grid=grid_size
)

cuda.memcpy_dtoh(output_buffer, output_gpu)

cuda_result = ''.join([f'{x:08x}' for x in output_buffer])
print("SHA-256d hash (CUDA):", cuda_result)

first_hash = hashlib.sha256(input_data).digest()
second_hash = hashlib.sha256(first_hash).hexdigest()

print(f"SHA-256d correct (hashlib): {second_hash}")
print(f"SHA-256d match: {cuda_result == second_hash}")

error: unpack requires a buffer of 4 bytes