From db273f267142b062442c034ea367fd81901ba926 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 22 Apr 2023 19:14:46 +0200 Subject: [PATCH 01/13] introduce RandomX v2 with double-hashing --- src/randomx.cpp | 11 +++++++++++ src/randomx.h | 11 +++++++++++ src/tests/benchmark.cpp | 32 +++++++++++++++++++++++++------- src/tests/tests.cpp | 24 ++++++++++++++++++++++++ vcxproj/randomx.vcxproj | 2 +- vcxproj/randomx.vcxproj.filters | 2 +- 6 files changed, 73 insertions(+), 9 deletions(-) diff --git a/src/randomx.cpp b/src/randomx.cpp index 7daaa46d..537dde2e 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -400,4 +400,15 @@ extern "C" { machine->run(machine->tempHash); machine->getFinalResult(output, RANDOMX_HASH_SIZE); } + + void randomx_calculate_hash_v2(const void* input, size_t inputSize, const void* v1_in, void* v2_out) { + assert(inputSize == 0 || input != nullptr); + assert(v1_in != nullptr); + assert(v2_out != nullptr); + blake2b_state state; + blake2b_init(&state, RANDOMX_HASH_SIZE); + blake2b_update(&state, input, inputSize); + blake2b_update(&state, v1_in, RANDOMX_HASH_SIZE); + blake2b_final(&state, v2_out, RANDOMX_HASH_SIZE); + } } diff --git a/src/randomx.h b/src/randomx.h index 64d18068..d7e2d998 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -260,6 +260,17 @@ RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, const void RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output); RANDOMX_EXPORT void randomx_calculate_hash_last(randomx_vm* machine, void* output); +/** + * Calculate V2 hash from the V1 hash and its input. + * + * @param input is a pointer to memory that was hashed by V1. Must not be NULL. + * @param inputSize is the number of bytes in the input. + * @param v1_in is the V1 hash (RANDOMX_HASH_SIZE bytes). + * @param output is a pointer to memory where the V2 hash will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void randomx_calculate_hash_v2(const void* input, size_t inputSize, const void* v1_in, void* v2_out); + #if defined(__cplusplus) } #endif diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index 36b0259b..df371682 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -96,6 +96,7 @@ void printUsage(const char* executable) { std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl; std::cout << " --auto select the best options for the current CPU" << std::endl; std::cout << " --noBatch calculate hashes one by one (default: batch)" << std::endl; + std::cout << " --v2 calculate v2 hashes (default: v1)" << std::endl; } struct MemoryException : public std::exception { @@ -113,7 +114,7 @@ struct DatasetAllocException : public MemoryException { using MineFunc = void(randomx_vm * vm, std::atomic & atomicNonce, AtomicHash & result, uint32_t noncesCount, int thread, int cpuid); -template +template void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, int cpuid = -1) { if (cpuid >= 0) { int rc = set_thread_affinity(cpuid); @@ -138,6 +139,9 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } store32(noncePtr, nonce); (batch ? 
randomx_calculate_hash_next : randomx_calculate_hash)(vm, blockTemplate, sizeof(blockTemplate), &hash); + if (v2) { + randomx_calculate_hash_v2(blockTemplate, sizeof(blockTemplate), &hash, &hash); + } result.xorWith(hash); if (!batch) { nonce = atomicNonce.fetch_add(1); @@ -146,7 +150,7 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } int main(int argc, char** argv) { - bool softAes, miningMode, verificationMode, help, largePages, jit, secure; + bool softAes, miningMode, verificationMode, help, largePages, jit, secure, v2; bool ssse3, avx2, autoFlags, noBatch; int noncesCount, threadCount, initThreadCount; uint64_t threadAffinity; @@ -172,10 +176,11 @@ int main(int argc, char** argv) { readOption("--avx2", argc, argv, avx2); readOption("--auto", argc, argv, autoFlags); readOption("--noBatch", argc, argv, noBatch); + readOption("--v2", argc, argv, v2); store32(&seed, seedValue); - std::cout << "RandomX benchmark v1.1.11" << std::endl; + std::cout << "RandomX benchmark v1.1.12" << std::endl; if (help) { printUsage(argv[0]); @@ -280,11 +285,24 @@ int main(int argc, char** argv) { MineFunc* func; if (noBatch) { - func = &mine; + if (v2) { + std::cout << " - v2 hashes" << std::endl; + func = &mine; + } + else { + func = &mine; + } } else { - func = &mine; - std::cout << " - batch mode" << std::endl; + if (v2) { + //TODO: support batch mode with v2 + std::cout << " - v2 hashes" << std::endl; + func = &mine; + } + else { + std::cout << " - batch mode" << std::endl; + func = &mine; + } } std::cout << "Initializing"; @@ -376,7 +394,7 @@ int main(int argc, char** argv) { randomx_release_cache(cache); std::cout << "Calculated result: "; result.print(std::cout); - if (noncesCount == 1000 && seedValue == 0) + if (noncesCount == 1000 && seedValue == 0 && !v2) std::cout << "Reference result: 10b649a3f15c7c7f88277812f2e74b337a0f20ce909af09199cccb960771cfa1" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / noncesCount << " ms per hash" << std::endl; diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp index 412585b1..8df2d901 100644 --- a/src/tests/tests.cpp +++ b/src/tests/tests.cpp @@ -34,6 +34,14 @@ void calcStringHash(const char(&key)[K], const char(&input)[H], void* output) { randomx_calculate_hash(vm, input, H - 1, output); } +template +void calcStringHashV2(const char(&key)[K], const char(&input)[H], void* output) { + initCache(key); + assert(vm != nullptr); + randomx_calculate_hash(vm, input, H - 1, output); + randomx_calculate_hash_v2(input, H - 1, output, output); +} + template void calcHexHash(const char(&key)[K], const char(&hex)[H], void* output) { initCache(key); @@ -1082,6 +1090,22 @@ int main() { assert(rx_get_rounding_mode() == RoundToNearest); }); + if (RANDOMX_HAVE_COMPILER) { + randomx_destroy_vm(vm); + vm = nullptr; +#ifdef RANDOMX_FORCE_SECURE + vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT | RANDOMX_FLAG_SECURE, cache, nullptr); +#else + vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr); +#endif + } + + runTest("RandomX v2 hash test", stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { + char hash[RANDOMX_HASH_SIZE]; + calcStringHashV2("test key 000", "This is a test", &hash); + assert(equalsHex(hash, "d53ccf348b75291b7be76f0a7ac8208bbced734b912f6fca60539ab6f86be919")); + }); + randomx_destroy_vm(vm); vm = nullptr; diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj index e0625c88..fcc66c99 100644 --- a/vcxproj/randomx.vcxproj +++ b/vcxproj/randomx.vcxproj @@ -156,7 +156,7 @@ SET ERRORLEVEL = 0 - + 
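As a quick illustration of the double-hashing flow introduced by this patch (the function is renamed to randomx_calculate_commitment later in this series), here is a minimal caller-side sketch: it computes the regular RandomX hash first, then derives the second hash as Blake2b-256 over (input || hash). The key and input strings below are placeholders, not values used anywhere in the patch, and error handling is omitted for brevity.

#include "randomx.h"

int main() {
	const char key[] = "RandomX example key";      //illustrative key
	const char input[] = "RandomX example input";  //illustrative input

	randomx_cache* cache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT);
	randomx_init_cache(cache, key, sizeof(key) - 1);
	randomx_vm* vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr);

	char hash[RANDOMX_HASH_SIZE];
	char commitment[RANDOMX_HASH_SIZE];
	//first pass: the unchanged v1 hash
	randomx_calculate_hash(vm, input, sizeof(input) - 1, hash);
	//second pass: Blake2b-256 over (input || v1 hash)
	randomx_calculate_hash_v2(input, sizeof(input) - 1, hash, commitment);

	randomx_destroy_vm(vm);
	randomx_release_cache(cache);
	return 0;
}
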
diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters index eb4462a5..eef048a3 100644 --- a/vcxproj/randomx.vcxproj.filters +++ b/vcxproj/randomx.vcxproj.filters @@ -72,7 +72,7 @@ Source Files - + Source Files From 3f69ad7b79c94e2dcef4515c598e6007ae15ba0b Mon Sep 17 00:00:00 2001 From: SChernykh Date: Wed, 23 Aug 2023 09:47:27 +0200 Subject: [PATCH 02/13] Added CI tests - Compile RandomX on a wide variety of OS and architectures - Fixed broken x86 (32-bit) builds - Don't use broken `fesetenv` in msys2 builds: https://sourceforge.net/p/mingw-w64/bugs/541/ --- .github/workflows/c-cpp.yml | 223 ++++++++++++++++++++++++++++++++++++ CMakeLists.txt | 2 +- src/randomx.cpp | 17 +++ 3 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/c-cpp.yml diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml new file mode 100644 index 00000000..22151c33 --- /dev/null +++ b/.github/workflows/c-cpp.yml @@ -0,0 +1,223 @@ +name: C/C++ CI + +on: [push, pull_request] + +jobs: + build-alpine: + + timeout-minutes: 15 + runs-on: ubuntu-22.04 + + strategy: + matrix: + config: + - {arch: x86_64, branch: latest-stable} + - {arch: x86, branch: latest-stable} + - {arch: aarch64, branch: latest-stable} + - {arch: armhf, branch: latest-stable} + - {arch: armv7, branch: latest-stable} + - {arch: ppc64le, branch: latest-stable} + - {arch: riscv64, branch: edge} + - {arch: s390x, branch: latest-stable} + + steps: + - name: Setup Alpine Linux + uses: jirutka/setup-alpine@v1 + with: + arch: ${{ matrix.config.arch }} + branch: ${{ matrix.config.branch }} + + - name: Install dependencies + shell: alpine.sh --root {0} + run: | + apk add git cmake gcc g++ make + + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Build RandomX + shell: alpine.sh {0} + run: | + mkdir build + cd build + cmake .. + make -j$(nproc) + + - name: Run tests + shell: alpine.sh {0} + run: | + build/randomx-tests + + build-ubuntu: + + timeout-minutes: 5 + runs-on: ${{ matrix.config.os }} + + strategy: + matrix: + config: + - {os: ubuntu-20.04, c: gcc-11, cpp: g++-11} + - {os: ubuntu-22.04, c: gcc-12, cpp: g++-12} + + steps: + - name: Install dependencies + run: | + sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test + sudo apt update + sudo apt install -y git build-essential cmake ${{ matrix.config.c }} ${{ matrix.config.cpp }} + + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Build RandomX + run: | + mkdir build + cd build + cmake .. + make -j$(nproc) + + - name: Run tests + run: | + build/randomx-tests + + build-windows-msys2: + + timeout-minutes: 15 + runs-on: windows-latest + + strategy: + matrix: + config: + - {c: "gcc", cxx: "g++"} + - {c: "clang", cxx: "clang++"} + + defaults: + run: + shell: msys2 {0} + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Setup MSYS2 + uses: eine/setup-msys2@v2 + with: + update: true + install: mingw-w64-x86_64-toolchain mingw-w64-x86_64-clang mingw-w64-x86_64-lld mingw-w64-x86_64-cmake make + + - name: Build RandomX + run: | + mkdir build + cd build + cmake .. 
-G "Unix Makefiles" -DCMAKE_C_COMPILER=${{ matrix.config.c }} -DCMAKE_CXX_COMPILER=${{ matrix.config.cxx }} + make -j$(nproc) + + - name: Run tests + run: | + build/randomx-tests.exe + + build-windows-msbuild: + + timeout-minutes: 5 + runs-on: windows-${{ matrix.config.os }} + + strategy: + matrix: + config: + - {arch: x64, os: 2019, vs: Visual Studio 16 2019, msbuild: "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\MSBuild\\Current\\Bin\\amd64\\"} + - {arch: x64, os: 2022, vs: Visual Studio 17 2022, msbuild: "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\Msbuild\\Current\\Bin\\amd64\\"} + - {arch: Win32, os: 2019, vs: Visual Studio 16 2019, msbuild: "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\MSBuild\\Current\\Bin\\"} + - {arch: Win32, os: 2022, vs: Visual Studio 17 2022, msbuild: "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\Msbuild\\Current\\Bin\\"} + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Setup cmake + uses: lukka/get-cmake@latest + + - name: Build RandomX + run: | + mkdir build + cd build + cmake .. -G "${{ matrix.config.vs }}" -A ${{ matrix.config.arch }} + & "${{ matrix.config.msbuild }}msbuild" -v:m /m /p:Configuration=Release randomx-tests.vcxproj + + - name: Run tests + run: | + build/Release/randomx-tests.exe + + build-macos: + + timeout-minutes: 5 + runs-on: ${{ matrix.os }} + + strategy: + matrix: + os: [macos-11, macos-12, macos-13] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Install dependencies + run: HOMEBREW_NO_AUTO_UPDATE=1 brew install cmake + + - name: Build RandomX + run: | + mkdir build + cd build + cmake .. + make -j3 + + - name: Run tests + run: | + build/randomx-tests + + build-freebsd: + + timeout-minutes: 15 + runs-on: ${{ matrix.os.host }} + + strategy: + matrix: + os: + - name: freebsd + architecture: x86-64 + version: '13.2' + host: ubuntu-22.04 + + - name: freebsd + architecture: arm64 + version: '13.2' + host: ubuntu-22.04 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Build RandomX + uses: cross-platform-actions/action@v0.19.0 + with: + operating_system: ${{ matrix.os.name }} + architecture: ${{ matrix.os.architecture }} + version: ${{ matrix.os.version }} + shell: bash + run: | + sudo pkg install -y cmake + mkdir build && cd build + cmake .. + make -j2 + ./randomx-tests diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b6ba9e6..5ffbe011 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,7 +96,7 @@ function(add_flag flag) endfunction() # x86-64 -if(ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "amd64") +if ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "amd64")) list(APPEND randomx_sources src/jit_compiler_x86.cpp) diff --git a/src/randomx.cpp b/src/randomx.cpp index 7daaa46d..5d77ef36 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -36,7 +36,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cpu.hpp" #include #include + +#if defined(__SSE__) || defined(__SSE2__) || (defined(_M_IX86_FP) && (_M_IX86_FP > 0)) +#define USE_CSR_INTRINSICS +#include +#else #include +#endif extern "C" { @@ -356,8 +362,14 @@ extern "C" { assert(machine != nullptr); assert(inputSize == 0 || input != nullptr); assert(output != nullptr); + +#ifdef USE_CSR_INTRINSICS + const unsigned int fpstate = _mm_getcsr(); +#else fenv_t fpstate; fegetenv(&fpstate); +#endif + alignas(16) uint64_t tempHash[8]; int blakeResult = blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); assert(blakeResult == 0); @@ -370,7 +382,12 @@ extern "C" { } machine->run(&tempHash); machine->getFinalResult(output, RANDOMX_HASH_SIZE); + +#ifdef USE_CSR_INTRINSICS + _mm_setcsr(fpstate); +#else fesetenv(&fpstate); +#endif } void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize) { From e372827fae8058d7e881e0bb8c81d066604a5db9 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 8 Sep 2023 22:36:45 +0200 Subject: [PATCH 03/13] fix vcxproj files --- vcxproj/randomx-dll.vcxproj | 4 ++-- vcxproj/randomx-dll.vcxproj.filters | 4 ++-- vcxproj/randomx.vcxproj | 4 ++-- vcxproj/randomx.vcxproj.filters | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vcxproj/randomx-dll.vcxproj b/vcxproj/randomx-dll.vcxproj index 8b8ea8c0..4eaae9be 100644 --- a/vcxproj/randomx-dll.vcxproj +++ b/vcxproj/randomx-dll.vcxproj @@ -43,7 +43,7 @@ - + @@ -74,7 +74,7 @@ - + diff --git a/vcxproj/randomx-dll.vcxproj.filters b/vcxproj/randomx-dll.vcxproj.filters index 68e1b855..5b51f9f7 100644 --- a/vcxproj/randomx-dll.vcxproj.filters +++ b/vcxproj/randomx-dll.vcxproj.filters @@ -87,7 +87,7 @@ Header Files - + Header Files @@ -151,7 +151,7 @@ Source Files - + Source Files diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj index e0625c88..cefdc8fb 100644 --- a/vcxproj/randomx.vcxproj +++ b/vcxproj/randomx.vcxproj @@ -156,7 +156,7 @@ SET ERRORLEVEL = 0 - + @@ -198,7 +198,7 @@ SET ERRORLEVEL = 0 - + diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters index eb4462a5..7f055b5b 100644 --- a/vcxproj/randomx.vcxproj.filters +++ b/vcxproj/randomx.vcxproj.filters @@ -72,7 +72,7 @@ Source Files - + Source Files @@ -164,7 +164,7 @@ Header Files - + Header Files From 07a413b9f00b64d18cef310582427bf738abd94d Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 8 Sep 2023 22:57:09 +0200 Subject: [PATCH 04/13] rename 'hash v2' to 'commitment' --- src/randomx.cpp | 10 +++++----- src/randomx.h | 10 +++++----- src/tests/benchmark.cpp | 24 ++++++++++++------------ src/tests/tests.cpp | 8 ++++---- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/randomx.cpp b/src/randomx.cpp index 537dde2e..c963e211 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -401,14 +401,14 @@ extern "C" { machine->getFinalResult(output, RANDOMX_HASH_SIZE); } - void randomx_calculate_hash_v2(const void* input, size_t inputSize, const void* v1_in, void* v2_out) { + void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out) { assert(inputSize == 0 || input != nullptr); - assert(v1_in != nullptr); - assert(v2_out != nullptr); + assert(hash_in != nullptr); + assert(com_out != nullptr); blake2b_state state; blake2b_init(&state, RANDOMX_HASH_SIZE); blake2b_update(&state, input, inputSize); - blake2b_update(&state, v1_in, RANDOMX_HASH_SIZE); - blake2b_final(&state, v2_out, RANDOMX_HASH_SIZE); + blake2b_update(&state, hash_in, RANDOMX_HASH_SIZE); + 
blake2b_final(&state, com_out, RANDOMX_HASH_SIZE); } } diff --git a/src/randomx.h b/src/randomx.h index d7e2d998..313bcd2e 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -261,15 +261,15 @@ RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, const void* RANDOMX_EXPORT void randomx_calculate_hash_last(randomx_vm* machine, void* output); /** - * Calculate V2 hash from the V1 hash and its input. + * Calculate a RandomX commitment from a RandomX hash and its input. * - * @param input is a pointer to memory that was hashed by V1. Must not be NULL. + * @param input is a pointer to memory that was hashed. Must not be NULL. * @param inputSize is the number of bytes in the input. - * @param v1_in is the V1 hash (RANDOMX_HASH_SIZE bytes). - * @param output is a pointer to memory where the V2 hash will be stored. Must not + * @param hash_in is the output from randomx_calculate_hash* (RANDOMX_HASH_SIZE bytes). + * @param com_out is a pointer to memory where the commitment will be stored. Must not * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. */ -RANDOMX_EXPORT void randomx_calculate_hash_v2(const void* input, size_t inputSize, const void* v1_in, void* v2_out); +RANDOMX_EXPORT void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out); #if defined(__cplusplus) } diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index df371682..d25d0c2c 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -96,7 +96,7 @@ void printUsage(const char* executable) { std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl; std::cout << " --auto select the best options for the current CPU" << std::endl; std::cout << " --noBatch calculate hashes one by one (default: batch)" << std::endl; - std::cout << " --v2 calculate v2 hashes (default: v1)" << std::endl; + std::cout << " --commit calculate commitments instead of hashes (default: hashes)" << std::endl; } struct MemoryException : public std::exception { @@ -114,7 +114,7 @@ struct DatasetAllocException : public MemoryException { using MineFunc = void(randomx_vm * vm, std::atomic & atomicNonce, AtomicHash & result, uint32_t noncesCount, int thread, int cpuid); -template +template void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, int cpuid = -1) { if (cpuid >= 0) { int rc = set_thread_affinity(cpuid); @@ -139,8 +139,8 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } store32(noncePtr, nonce); (batch ? 
randomx_calculate_hash_next : randomx_calculate_hash)(vm, blockTemplate, sizeof(blockTemplate), &hash); - if (v2) { - randomx_calculate_hash_v2(blockTemplate, sizeof(blockTemplate), &hash, &hash); + if (commit) { + randomx_calculate_commitment(blockTemplate, sizeof(blockTemplate), &hash, &hash); } result.xorWith(hash); if (!batch) { @@ -150,7 +150,7 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } int main(int argc, char** argv) { - bool softAes, miningMode, verificationMode, help, largePages, jit, secure, v2; + bool softAes, miningMode, verificationMode, help, largePages, jit, secure, commit; bool ssse3, avx2, autoFlags, noBatch; int noncesCount, threadCount, initThreadCount; uint64_t threadAffinity; @@ -176,7 +176,7 @@ int main(int argc, char** argv) { readOption("--avx2", argc, argv, avx2); readOption("--auto", argc, argv, autoFlags); readOption("--noBatch", argc, argv, noBatch); - readOption("--v2", argc, argv, v2); + readOption("--commit", argc, argv, commit); store32(&seed, seedValue); @@ -285,8 +285,8 @@ int main(int argc, char** argv) { MineFunc* func; if (noBatch) { - if (v2) { - std::cout << " - v2 hashes" << std::endl; + if (commit) { + std::cout << " - hash commitments" << std::endl; func = &mine; } else { @@ -294,9 +294,9 @@ int main(int argc, char** argv) { } } else { - if (v2) { - //TODO: support batch mode with v2 - std::cout << " - v2 hashes" << std::endl; + if (commit) { + //TODO: support batch mode with commitments + std::cout << " - hash commitments" << std::endl; func = &mine; } else { @@ -394,7 +394,7 @@ int main(int argc, char** argv) { randomx_release_cache(cache); std::cout << "Calculated result: "; result.print(std::cout); - if (noncesCount == 1000 && seedValue == 0 && !v2) + if (noncesCount == 1000 && seedValue == 0 && !commit) std::cout << "Reference result: 10b649a3f15c7c7f88277812f2e74b337a0f20ce909af09199cccb960771cfa1" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / noncesCount << " ms per hash" << std::endl; diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp index 8df2d901..5e1b41a3 100644 --- a/src/tests/tests.cpp +++ b/src/tests/tests.cpp @@ -35,11 +35,11 @@ void calcStringHash(const char(&key)[K], const char(&input)[H], void* output) { } template -void calcStringHashV2(const char(&key)[K], const char(&input)[H], void* output) { +void calcStringCommitment(const char(&key)[K], const char(&input)[H], void* output) { initCache(key); assert(vm != nullptr); randomx_calculate_hash(vm, input, H - 1, output); - randomx_calculate_hash_v2(input, H - 1, output, output); + randomx_calculate_commitment(input, H - 1, output, output); } template @@ -1100,9 +1100,9 @@ int main() { #endif } - runTest("RandomX v2 hash test", stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { + runTest("Commitment test", stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { char hash[RANDOMX_HASH_SIZE]; - calcStringHashV2("test key 000", "This is a test", &hash); + calcStringCommitment("test key 000", "This is a test", &hash); assert(equalsHex(hash, "d53ccf348b75291b7be76f0a7ac8208bbced734b912f6fca60539ab6f86be919")); }); From e322218fb7f2f2d888ecfb9ab2e8293141e67db6 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 9 Oct 2023 18:38:25 +0200 Subject: [PATCH 05/13] Fixed casts from const to non-const pointers --- src/intrin_portable.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/intrin_portable.h b/src/intrin_portable.h index 8c09ae88..50020c3e 100644 --- a/src/intrin_portable.h 
+++ b/src/intrin_portable.h @@ -349,7 +349,7 @@ FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *p) { #if defined(NATIVE_LITTLE_ENDIAN) return *p; #else - uint32_t* ptr = (uint32_t*)p; + const uint32_t* ptr = (const uint32_t*)p; vec_u c; c.u32[0] = load32(ptr + 0); c.u32[1] = load32(ptr + 1); @@ -375,8 +375,8 @@ FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b) { FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { vec_u x; - x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); - x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); + x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 0)); + x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 4)); return (rx_vec_f128)x.d; } @@ -684,7 +684,7 @@ FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const* p) { #if defined(NATIVE_LITTLE_ENDIAN) return *p; #else - uint32_t* ptr = (uint32_t*)p; + const uint32_t* ptr = (const uint32_t*)p; rx_vec_i128 c; c.u32[0] = load32(ptr + 0); c.u32[1] = load32(ptr + 1); @@ -708,8 +708,8 @@ FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b) { FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { rx_vec_f128 x; - x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); - x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); + x.lo = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 0)); + x.hi = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 4)); return x; } From 48fa275d04a4fc5e9666d206b50c337a5cfcfe7a Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 9 Oct 2023 19:14:51 +0200 Subject: [PATCH 06/13] Avoid redundant CI runs --- .github/workflows/c-cpp.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 22151c33..47ade398 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -1,6 +1,9 @@ name: C/C++ CI -on: [push, pull_request] +on: + push: + branches: [ master ] + pull_request: jobs: build-alpine: From 027ecb85769d6ebeff0fcb5ddcbafb027209debd Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 7 Oct 2023 12:51:19 +0200 Subject: [PATCH 07/13] JIT compiler for RISC-V --- CMakeLists.txt | 36 + src/common.hpp | 7 + src/jit_compiler.hpp | 42 +- src/jit_compiler_rv64.cpp | 1175 ++++++++++++++++++++++++++++ src/jit_compiler_rv64.hpp | 69 ++ src/jit_compiler_rv64_static.S | 1235 ++++++++++++++++++++++++++++++ src/jit_compiler_rv64_static.hpp | 53 ++ src/tests/riscv64_zba.s | 9 + src/tests/riscv64_zbb.s | 9 + 9 files changed, 2633 insertions(+), 2 deletions(-) create mode 100644 src/jit_compiler_rv64.cpp create mode 100644 src/jit_compiler_rv64.hpp create mode 100644 src/jit_compiler_rv64_static.S create mode 100644 src/jit_compiler_rv64_static.hpp create mode 100644 src/tests/riscv64_zba.s create mode 100644 src/tests/riscv64_zbb.s diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ffbe011..ebbdff2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,6 +173,42 @@ if(ARM_ID STREQUAL "aarch64" OR ARM_ID STREQUAL "arm64" OR ARM_ID STREQUAL "armv endif() endif() +# RISC-V +if(ARCH_ID STREQUAL "riscv64") + list(APPEND randomx_sources + src/jit_compiler_rv64_static.S + src/jit_compiler_rv64.cpp) + # cheat because cmake and ccache hate each other + set_property(SOURCE src/jit_compiler_rv64_static.S PROPERTY LANGUAGE C) + set_property(SOURCE src/jit_compiler_rv64_static.S PROPERTY 
XCODE_EXPLICIT_FILE_TYPE sourcecode.asm) + + # default build uses the RV64GC baseline + set(RVARCH "rv64gc") + + # for native builds, enable Zba and Zbb if supported by the CPU + if(ARCH STREQUAL "native") + enable_language(ASM) + try_run(RANDOMX_ZBA_RUN_FAIL + RANDOMX_ZBA_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/tests/riscv64_zba.s + COMPILE_DEFINITIONS "-march=rv64gc_zba") + if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL) + set(RVARCH "${RVARCH}_zba") + endif() + try_run(RANDOMX_ZBB_RUN_FAIL + RANDOMX_ZBB_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/tests/riscv64_zbb.s + COMPILE_DEFINITIONS "-march=rv64gc_zbb") + if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL) + set(RVARCH "${RVARCH}_zbb") + endif() + endif() + + add_flag("-march=${RVARCH}") +endif() + set(RANDOMX_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/src" CACHE STRING "RandomX Include path") add_library(randomx ${randomx_sources}) diff --git a/src/common.hpp b/src/common.hpp index a77feb3b..f4b85342 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -116,12 +116,19 @@ namespace randomx { #if defined(_M_X64) || defined(__x86_64__) #define RANDOMX_HAVE_COMPILER 1 + #define RANDOMX_COMPILER_X86 class JitCompilerX86; using JitCompiler = JitCompilerX86; #elif defined(__aarch64__) #define RANDOMX_HAVE_COMPILER 1 + #define RANDOMX_COMPILER_A64 class JitCompilerA64; using JitCompiler = JitCompilerA64; +#elif defined(__riscv) && __riscv_xlen == 64 + #define RANDOMX_HAVE_COMPILER 1 + #define RANDOMX_COMPILER_RV64 + class JitCompilerRV64; + using JitCompiler = JitCompilerRV64; #else #define RANDOMX_HAVE_COMPILER 0 class JitCompilerFallback; diff --git a/src/jit_compiler.hpp b/src/jit_compiler.hpp index 17fdad4e..5b76fa5f 100644 --- a/src/jit_compiler.hpp +++ b/src/jit_compiler.hpp @@ -28,10 +28,48 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -#if defined(_M_X64) || defined(__x86_64__) +#include "common.hpp" + +namespace randomx { + + struct CodeBuffer { + uint8_t* code; + int32_t codePos; + int32_t rcpCount; + + void emit(const uint8_t* src, int32_t len) { + memcpy(&code[codePos], src, len); + codePos += len; + } + + template + void emit(T src) { + memcpy(&code[codePos], &src, sizeof(src)); + codePos += sizeof(src); + } + + void emitAt(int32_t codePos, const uint8_t* src, int32_t len) { + memcpy(&code[codePos], src, len); + } + + template + void emitAt(int32_t codePos, T src) { + memcpy(&code[codePos], &src, sizeof(src)); + } + }; + + struct CompilerState : public CodeBuffer { + int32_t instructionOffsets[RANDOMX_PROGRAM_SIZE]; + int registerUsage[RegistersCount]; + }; +} + +#if defined(RANDOMX_COMPILER_X86) #include "jit_compiler_x86.hpp" -#elif defined(__aarch64__) +#elif defined(RANDOMX_COMPILER_A64) #include "jit_compiler_a64.hpp" +#elif defined(RANDOMX_COMPILER_RV64) +#include "jit_compiler_rv64.hpp" #else #include "jit_compiler_fallback.hpp" #endif diff --git a/src/jit_compiler_rv64.cpp b/src/jit_compiler_rv64.cpp new file mode 100644 index 00000000..301c294c --- /dev/null +++ b/src/jit_compiler_rv64.cpp @@ -0,0 +1,1175 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include "jit_compiler_rv64.hpp" +#include "jit_compiler_rv64_static.hpp" +#include "superscalar.hpp" +#include "program.hpp" +#include "reciprocal.h" +#include "virtual_memory.h" + + +namespace { +#define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i + using InstructionHandler = void(HANDLER_ARGS); + extern InstructionHandler* opcodeMap1[256]; +} + +namespace rv64 { + constexpr uint16_t C_LUI = 0x6001; + constexpr uint32_t LUI = 0x00000037; + constexpr uint16_t C_ADDI = 0x0001; + constexpr uint32_t ADDI = 0x00000013; + constexpr uint32_t ADDIW = 0x0000001b; + constexpr uint16_t C_ADD = 0x9002; + constexpr uint32_t ADD = 0x00000033; + constexpr uint32_t SHXADD = 0x20000033; //Zba + constexpr uint32_t SLL = 0x00001033; + constexpr uint32_t SRL = 0x00005033; + constexpr uint32_t SLLI = 0x00001013; + constexpr uint32_t C_SLLI = 0x0002; + constexpr uint32_t SRLI = 0x00005013; + constexpr uint32_t AND = 0x00007033; + constexpr uint32_t ANDI = 0x00007013; + constexpr uint16_t C_AND = 0x8c61; + constexpr uint16_t C_ANDI = 0x8801; + constexpr uint32_t OR = 0x00006033; + constexpr uint16_t C_OR = 0x8c41; + constexpr uint32_t XOR = 0x00004033; + constexpr uint16_t C_XOR = 0x8c21; + constexpr uint32_t LD = 0x00003003; + constexpr uint16_t C_LD = 0x6000; + constexpr uint16_t C_LW = 0x4000; + constexpr uint32_t SD = 0x00003023; + constexpr uint32_t SUB = 0x40000033; + constexpr uint16_t C_SUB = 0x8c01; + constexpr uint32_t MUL = 0x02000033; + constexpr uint32_t MULHU = 0x02003033; + constexpr uint32_t MULH = 0x02001033; + constexpr uint16_t C_MV = 0x8002; + constexpr uint32_t ROR = 0x60005033; //Zbb + constexpr uint32_t RORI = 0x60005013; //Zbb + constexpr uint32_t ROL = 0x60001033; //Zbb + constexpr uint32_t FMV_X_D = 0xe2000053; + constexpr uint32_t FMV_D_X = 0xf2000053; + constexpr uint32_t FMV_D = 0x22000053; + constexpr uint32_t FADD_D = 0x02007053; + constexpr uint32_t FSUB_D = 0x0a007053; + constexpr uint32_t FMUL_D = 0x12007053; + constexpr uint32_t FDIV_D = 0x1a007053; + constexpr uint32_t FSQRT_D = 0x5a007053; + constexpr uint32_t FCVT_D_W = 0xd2000053; + constexpr uint32_t FSRM = 0x00201073; + constexpr uint16_t C_BEQZ = 0xc001; + constexpr uint32_t BEQ = 0x00000063; + constexpr uint16_t C_BNEZ = 0xe001; + constexpr uint32_t JAL = 0x0000006f; + constexpr uint16_t 
C_RET = 0x8082; +} + +namespace randomx { + + constexpr size_t MaxRandomXInstrCodeSize = 56; //FDIV_M requires 56 bytes of rv64 code + constexpr size_t MaxSuperscalarInstrSize = 12; //IXOR_C requires 12 bytes of rv64 code + constexpr size_t SuperscalarProgramHeader = 136; //overhead per superscalar program + constexpr size_t CodeAlign = 4096; //align code size to a multiple of 4 KiB + constexpr size_t LiteralPoolSize = CodeAlign; + constexpr size_t SuperscalarLiteraPoolSize = RANDOMX_CACHE_ACCESSES * CodeAlign; + constexpr size_t ReserveCodeSize = CodeAlign; //prologue, epilogue + reserve + + constexpr size_t RandomXCodeSize = alignSize(LiteralPoolSize + ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_SIZE, CodeAlign); + constexpr size_t SuperscalarSize = alignSize(SuperscalarLiteraPoolSize + ReserveCodeSize + (SuperscalarProgramHeader + MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_ACCESSES, CodeAlign); + + static_assert(RandomXCodeSize < INT32_MAX / 2, "RandomXCodeSize is too large"); + static_assert(SuperscalarSize < INT32_MAX / 2, "SuperscalarSize is too large"); + + constexpr uint32_t CodeSize = RandomXCodeSize + SuperscalarSize; + constexpr uint32_t ExecutableSize = CodeSize - LiteralPoolSize; + + constexpr int32_t LiteralPoolOffset = LiteralPoolSize / 2; + constexpr int32_t SuperScalarLiteralPoolOffset = RandomXCodeSize; + constexpr int32_t SuperScalarLiteralPoolRefOffset = RandomXCodeSize + (RANDOMX_CACHE_ACCESSES - 1) * LiteralPoolSize + LiteralPoolOffset; + constexpr int32_t SuperScalarHashOffset = SuperScalarLiteralPoolOffset + SuperscalarLiteraPoolSize; + + constexpr int maskLog2(uint32_t x, int prev) { + return x == 1 ? prev : maskLog2(x >> 1, prev + 1); + } + + constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { + return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? 
(-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); + } + + constexpr int MaskL1Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L1, 0); + constexpr int MaskL2Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L2, 0); + constexpr int MaskL3Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L3, 0); + + constexpr int RcpLiteralsOffset = 144; + + constexpr int LiteralPoolReg = 3; //x3 + constexpr int SpadReg = 5; //x5 + constexpr int DataReg = 6; //x6 + constexpr int SuperscalarReg = 7; //x7 + constexpr int SshTmp1Reg = 28; //x28 + constexpr int SshTmp2Reg = 29; //x29 + constexpr int SshPoolReg = 30; //x30 + constexpr int SshRcpReg = 31; //x31 + constexpr int Tmp1Reg = 8; //x8 + constexpr int Tmp2Reg = 9; //x9 + constexpr int Tmp1RegF = 24; //f24 + constexpr int Tmp2RegF = 25; //f25 + constexpr int MaskL1Reg = 10; //x10 + constexpr int MaskL2Reg = 11; //x11 + constexpr int MaskFscalReg = 12; //x12 + constexpr int MaskEclear = 13; //x13 + constexpr int MaskEsetLo = 14; //x14 + constexpr int MaskEsetHi = 15; //x15 + constexpr int MaskL3Reg = 1; //x1 + constexpr int ReturnReg = 1; //x1 + constexpr int SpAddr0Reg = 26; //x26 + constexpr int OffsetXC = -8; //x8-x15 + constexpr int OffsetR = 16; //x16-x23 + constexpr int OffsetF = 0; //f0-f7 + constexpr int OffsetE = 8; //f8-f15 + constexpr int OffsetA = 16; //f16-f23 + constexpr int OffsetRcp = 28; //x28-x31 + constexpr int OffsetRcpF = 22; //f26-f31 + constexpr int OffsetSsh = 8; //x8-x15 + + //destination register (bit 7+) + constexpr int rvrd(int reg) { + return reg << 7; + } + + //first source register (bit 15+) + constexpr int rvrs1(int reg) { + return reg << 15; + } + + //second source register (bit 20+) + constexpr int rvrs2(int reg) { + return reg << 20; + } + + //compressed source register (bit 2+) + constexpr int rvcrs(int reg) { + return reg << 2; + } + + //base instruction: {op} x{rd}, x{rs1}, x{rs2} + constexpr uint32_t rvi(uint32_t op, int rd, int rs1, int rs2 = 0) { + return op | rvrs2(rs2) | rvrs1(rs1) | rvrd(rd); + } + + //compressed instruction: op x{rd}, x{rs} + constexpr uint16_t rvc(uint16_t op, int rd, int rs) { + return op | rvrd(rd) | rvcrs(rs); + } + + //compressed instruction: op x{rd}, imm6 + constexpr uint16_t rvc(uint16_t op, int imm5, int rd, int imm40) { + return op | (imm5 << 12) | rvrd(rd) | (imm40 << 2); + } + + constexpr int regR(int reg) { + return reg + OffsetR; + } + + constexpr int regLoA(int reg) { + return 2 * reg + OffsetA; + } + + constexpr int regHiA(int reg) { + return 2 * reg + OffsetA + 1; + } + + constexpr int regLoF(int reg) { + return 2 * reg + OffsetF; + } + + constexpr int regHiF(int reg) { + return 2 * reg + OffsetF + 1; + } + + constexpr int regLoE(int reg) { + return 2 * reg + OffsetE; + } + + constexpr int regHiE(int reg) { + return 2 * reg + OffsetE + 1; + } + + constexpr int regRcp(int reg) { + return reg + OffsetRcp; + } + + constexpr int regRcpF(int reg) { + return reg + OffsetRcpF; + } + + constexpr int regSS(int reg) { + return reg + OffsetSsh; + } + + static const uint8_t* codeLiterals = (uint8_t*)&randomx_riscv64_literals; + static const uint8_t* codeLiteralsEnd = (uint8_t*)&randomx_riscv64_literals_end; + static const uint8_t* codeDataInit = (uint8_t*)&randomx_riscv64_data_init; + static const uint8_t* codeFixDataCall = (uint8_t*)&randomx_riscv64_fix_data_call; + static const uint8_t* codePrologue = (uint8_t*)&randomx_riscv64_prologue; + static const uint8_t* codeLoopBegin = (uint8_t*)&randomx_riscv64_loop_begin; + static const uint8_t* codeDataRead = (uint8_t*)&randomx_riscv64_data_read; + static const uint8_t* 
codeDataReadLight = (uint8_t*)&randomx_riscv64_data_read_light; + static const uint8_t* codeFixLoopCall = (uint8_t*)&randomx_riscv64_fix_loop_call; + static const uint8_t* codeSpadStore = (uint8_t*)&randomx_riscv64_spad_store; + static const uint8_t* codeSpadStoreHardAes = (uint8_t*)&randomx_riscv64_spad_store_hardaes; + static const uint8_t* codeSpadStoreSoftAes = (uint8_t*)&randomx_riscv64_spad_store_softaes; + static const uint8_t* codeLoopEnd = (uint8_t*)&randomx_riscv64_loop_end; + static const uint8_t* codeFixContinueLoop = (uint8_t*)&randomx_riscv64_fix_continue_loop; + static const uint8_t* codeEpilogue = (uint8_t*)&randomx_riscv64_epilogue; + static const uint8_t* codeSoftAes = (uint8_t*)&randomx_riscv64_softaes; + static const uint8_t* codeProgramEnd = (uint8_t*)&randomx_riscv64_program_end; + static const uint8_t* codeSshInit = (uint8_t*)&randomx_riscv64_ssh_init; + static const uint8_t* codeSshLoad = (uint8_t*)&randomx_riscv64_ssh_load; + static const uint8_t* codeSshPrefetch = (uint8_t*)&randomx_riscv64_ssh_prefetch; + static const uint8_t* codeSshEnd = (uint8_t*)&randomx_riscv64_ssh_end; + + static const int32_t sizeLiterals = codeLiteralsEnd - codeLiterals; + static const int32_t sizeDataInit = codePrologue - codeDataInit; + static const int32_t sizePrologue = codeLoopBegin - codePrologue; + static const int32_t sizeLoopBegin = codeDataRead - codeLoopBegin; + static const int32_t sizeDataRead = codeDataReadLight - codeDataRead; + static const int32_t sizeDataReadLight = codeSpadStore - codeDataReadLight; + static const int32_t sizeSpadStore = codeSpadStoreHardAes - codeSpadStore; + static const int32_t sizeSpadStoreSoftAes = codeLoopEnd - codeSpadStoreSoftAes; + static const int32_t sizeLoopEnd = codeEpilogue - codeLoopEnd; + static const int32_t sizeEpilogue = codeSoftAes - codeEpilogue; + static const int32_t sizeSoftAes = codeProgramEnd - codeSoftAes; + static const int32_t sizeSshInit = codeSshLoad - codeSshInit; + static const int32_t sizeSshLoad = codeSshPrefetch - codeSshLoad; + static const int32_t sizeSshPrefetch = codeSshEnd - codeSshPrefetch; + + static const int32_t offsetFixDataCall = codeFixDataCall - codeDataInit; + static const int32_t offsetFixLoopCall = codeFixLoopCall - codeDataReadLight; + static const int32_t offsetFixContinueLoop = codeFixContinueLoop - codeLoopEnd; + + static const int32_t LoopTopPos = LiteralPoolSize + sizeDataInit + sizePrologue; + static const int32_t RandomXCodePos = LoopTopPos + sizeLoopBegin; + + static void clearCache(CodeBuffer& buf) { +#ifdef __GNUC__ + __builtin___clear_cache((char*)buf.code, (char*)(buf.code + CodeSize)); +#endif + } + + //emits code to calculate: x{dst} = x{src} + {imm32} + //takes 1-3 isns, 2-10 bytes + static void emitImm32(CodeBuffer& buf, int32_t imm, int dst, int src = 0, int tmp = 0) { + + //lower 12 bits + int32_t limm = (imm << 20) >> 20; + //upper 20 bits + int32_t uimm = (imm >> 12) + (limm < 0); + + //If there are no upper bits, the whole thing + //can be done with a single instruction. + if (uimm == 0) { + //addi x{dst}, x{src}, {limm} + buf.emit(rvi(rv64::ADDI, dst, src, limm)); + return; + } + + //dst1 is the register where imm will be materialized + int dst1 = src != dst ? dst : tmp; + assert(dst1 != 0); + //src1 is the register that will be added to the result + int src1 = src != dst ? 
src : dst1; + + //load upper bits + if (uimm >= -32 && uimm <= 31) { + //c.lui x{dst1}, {uimm} + buf.emit(rvc(rv64::C_LUI, (uimm < 0), dst1, (uimm & 31))); + } + else { + //lui x{dst1}, {uimm} + buf.emit(rv64::LUI | (uimm << 12) | rvrd(dst1)); + } + //load lower bits + if (limm != 0) { + //Note: this must be addiw NOT addi, otherwise the upper 32 bits + //of the 64-bit register will be incorrect. + //addiw x{dst1}, x{dst1}, {limm} + buf.emit(rvi(rv64::ADDIW, dst1, dst1, limm)); + } + //add src + if (src1 != 0) { + //c.add x{dst}, x{src1} + buf.emit(rvc(rv64::C_ADD, dst, src1)); + } + } + + //x9 = &Scratchpad[isn.imm] + //takes 3 isns, 10 bytes + static void genAddressRegImm(CodeBuffer& buf, const Instruction& isn) { + //signed offset 8-byte aligned + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()) & ScratchpadL3Mask; + //x9 = x5 + {imm} + emitImm32(buf, imm, Tmp2Reg, SpadReg, Tmp1Reg); + } + + //x9 = &Scratchpad[isn.src + isn.imm] (for reading) + //takes 5 isns, 12 bytes + static void genAddressReg(CodeBuffer& buf, const Instruction& isn) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{src} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.src), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, (Tmp2Reg + OffsetXC), (maskReg + OffsetXC))); + //c.add x9, x{spadReg} + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + + //x8 = Scratchpad[isn] + static void loadFromScratchpad(CodeBuffer& buf, const Instruction& isn) { + if (isn.src != isn.dst) { + //x9 = &Scratchpad[isn.src + isn.imm] + genAddressReg(buf, isn); + } + else { + ///x9 = &Scratchpad[isn.imm] + genAddressRegImm(buf, isn); + } + //c.ld x8, 0(x9) + buf.emit(rvc(rv64::C_LD, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + } + + //x9 = &Scratchpad[isn.dst + isn.imm32] (for writing) + //takes 5 isns, 12-16 bytes + static void genAddressRegDst(CodeBuffer& buf, const Instruction& isn) { + if (isn.getModCond() < StoreL3Condition) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, maskReg + OffsetXC)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + else { + int shift = MaskL3Shift; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //and x9, x9, x1 + buf.emit(rvi(rv64::AND, Tmp2Reg, Tmp2Reg, MaskL3Reg)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + } + + static void emitRcpLiteral1(CodeBuffer& buf, uint64_t literal) { + //first 238 at positive offsets + if (buf.rcpCount < 238) { + buf.emitAt(LiteralPoolOffset + RcpLiteralsOffset + buf.rcpCount * 8, literal); + buf.rcpCount++; + } + //next 256 at negative offsets + else if (buf.rcpCount < 494) { + buf.emitAt(buf.rcpCount * 8 - (2048 - RcpLiteralsOffset), literal); + buf.rcpCount++; + } + else { + //checked at compile time, but double-check here + throw std::runtime_error("Literal pool overflow"); + } + } + + static void emitRcpLiteral2(CodeBuffer& buf, 
uint64_t literal, int32_t numLiterals) { + //store the current literal in the pool + int32_t offset = 2040 - buf.rcpCount * 8; + buf.emitAt(SuperScalarLiteralPoolRefOffset + offset, literal); + buf.rcpCount++; + if (buf.rcpCount >= numLiterals) { + return; + } + //load the next literal + offset -= 8; + int32_t imm = offset & 0xfff; + //ld x31, {offset}(x30) + buf.emit(rvi(rv64::LD, SshRcpReg, SshPoolReg, imm)); + if (imm == 0x800) { + //move pool pointer back 4KB + //c.lui x29, 0xfffff + buf.emit(rvc(rv64::C_LUI, 1, SshTmp2Reg, 31)); + //c.add x30, x29 + buf.emit(rvc(rv64::C_ADD, SshPoolReg, SshTmp2Reg)); + } + } + + static void emitJump(CodeBuffer& buf, int dst, int32_t codePos, int32_t targetPos) { + int32_t imm = targetPos - codePos; + int32_t imm20 = (imm < 0) << 11; + int32_t imm1912 = (imm >> 7) & 8160; + int32_t imm11 = (imm >> 11) & 1; + int32_t imm101 = imm & 2046; + //jal x{dst}, {imm} + buf.emitAt(codePos, rvi(rv64::JAL, dst + imm1912, 0, imm20 + imm101 + imm11)); + } + + static void emitInstruction(CompilerState& state, Instruction isn, int i) { + state.instructionOffsets[i] = state.codePos; + opcodeMap1[isn.opcode](state, isn, i); + } + + static void emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg) { + state.codePos = RandomXCodePos; + state.rcpCount = 0; + state.emitAt(LiteralPoolOffset + sizeLiterals, pcfg.eMask[0]); + state.emitAt(LiteralPoolOffset + sizeLiterals + 8, pcfg.eMask[1]); + for (unsigned i = 0; i < RegistersCount; ++i) { + state.registerUsage[i] = -1; + } + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + emitInstruction(state, instr, i); + } + } + + static void emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg) { + state.emit(codeSpadStore, sizeSpadStore); + int32_t fixPos = state.codePos; + state.emit(codeLoopEnd, sizeLoopEnd); + //xor x26, x{readReg0}, x{readReg1} + state.emitAt(fixPos, rvi(rv64::XOR, SpAddr0Reg, regR(pcfg.readReg0), regR(pcfg.readReg1))); + fixPos += offsetFixContinueLoop; + //j LoopTop + emitJump(state, 0, fixPos, LoopTopPos); + state.emit(codeEpilogue, sizeEpilogue); + } + + static void generateSuperscalarCode(CodeBuffer& buf, Instruction isn, const std::vector& reciprocalCache) { + switch ((SuperscalarInstructionType)isn.opcode) + { + case randomx::SuperscalarInstructionType::ISUB_R: + //c.sub x{dst}, x{src} + buf.emit(rvc(rv64::C_SUB, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IXOR_R: + //c.xor x{dst}, x{src} + buf.emit(rvc(rv64::C_XOR, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IADD_RS: + { + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), regSS(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add x{dst}, x{src}, x{dst} + buf.emit(rv64::SHXADD | rvrs2(regSS(isn.dst)) | rvrs1(regSS(isn.src)) | (shift << 13) | rvrd(regSS(isn.dst))); +#else + //slli x28, x{src}, {shift} + buf.emit(rvi(rv64::SLLI, SshTmp1Reg, regSS(isn.src), shift)); + //c.add x{dst}, x28 + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), SshTmp1Reg)); +#endif + } + } + break; + case randomx::SuperscalarInstructionType::IMUL_R: + //mul x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IROR_C: + { +#ifdef __riscv_zbb + int32_t imm 
= isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + buf.emit(rvi(rv64::RORI, regSS(isn.dst), regSS(isn.dst), imm)); +#else + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x28, x{dst}, {immr} + buf.emit(rvi(rv64::SRLI, SshTmp1Reg, regSS(isn.dst), immr)); + //c.slli x{dst}, {imml} + buf.emit(rvc(rv64::C_SLLI, imml5, regSS(isn.dst), imml40)); + //or x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::OR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); +#endif + } + break; + case randomx::SuperscalarInstructionType::IADD_C7: + case randomx::SuperscalarInstructionType::IADD_C8: + case randomx::SuperscalarInstructionType::IADD_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(buf, imm, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg); + } + break; + case randomx::SuperscalarInstructionType::IXOR_C7: + case randomx::SuperscalarInstructionType::IXOR_C8: + case randomx::SuperscalarInstructionType::IXOR_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x28 = {imm} + emitImm32(buf, imm, SshTmp1Reg); + //xor x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::XOR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); + } + break; + case randomx::SuperscalarInstructionType::IMULH_R: + //mulhu x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULHU, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::ISMULH_R: + //mulh x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULH, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IMUL_RCP: + //mul x{dst}, x{dst}, x31 + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), SshRcpReg)); + //load the next literal into x31 + emitRcpLiteral2(buf, reciprocalCache[isn.getImm32()], reciprocalCache.size()); + break; + default: + UNREACHABLE; + } + } + + size_t JitCompilerRV64::getCodeSize() { + return CodeSize; + } + + JitCompilerRV64::JitCompilerRV64() { + state.code = (uint8_t*)allocMemoryPages(CodeSize); + if (state.code == nullptr) + throw std::runtime_error("allocMemoryPages"); + state.emitAt(LiteralPoolOffset, codeLiterals, sizeLiterals); + state.emitAt(LiteralPoolSize, codeDataInit, sizeDataInit + sizePrologue + sizeLoopBegin); + entryDataInit = state.code + LiteralPoolSize; + entryProgram = state.code + LiteralPoolSize + sizeDataInit; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, LiteralPoolSize + offsetFixDataCall, SuperScalarHashOffset); + } + + JitCompilerRV64::~JitCompilerRV64() { + freePagedMemory(state.code, CodeSize); + } + + void JitCompilerRV64::enableAll() { + setPagesRWX(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::enableWriting() { + setPagesRW(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::enableExecution() { + setPagesRX(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::generateProgram(Program& prog, ProgramConfiguration& pcfg) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataRead, sizeDataRead); + //xor x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + void JitCompilerRV64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataReadLight, sizeDataReadLight); + //xor 
x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + int32_t imm = datasetOffset / CacheLineSize; + int32_t limm = (imm << 20) >> 20; + int32_t uimm = (imm >> 12) + (limm < 0); + //lui x9, {uimm} + state.emitAt(fixPos + 4, rv64::LUI | (uimm << 12) | rvrd(Tmp2Reg)); + //addi x9, x9, {limm} + state.emitAt(fixPos + 8, rvi(rv64::ADDI, Tmp2Reg, Tmp2Reg, limm)); + fixPos += offsetFixLoopCall; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, fixPos, SuperScalarHashOffset); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + void JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES], std::vector& reciprocalCache) { + state.codePos = SuperScalarHashOffset; + state.rcpCount = 0; + state.emit(codeSshInit, sizeSshInit); + for (unsigned j = 0; j < RANDOMX_CACHE_ACCESSES; ++j) { + SuperscalarProgram& prog = programs[j]; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + generateSuperscalarCode(state, instr, reciprocalCache); + } + state.emit(codeSshLoad, sizeSshLoad); + if (j < RANDOMX_CACHE_ACCESSES - 1) { + int32_t fixPos = state.codePos; + state.emit(codeSshPrefetch, sizeSshPrefetch); + //and x7, x{addrReg}, x7 + state.emitAt(fixPos, rvi(rv64::AND, SuperscalarReg, regSS(prog.getAddressRegister()), SuperscalarReg)); + } + } + state.emit(rvc(rv64::C_RET, 0, 0)); + clearCache(state); + } + + static void v1_IADD_RS(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + state.emit(rvc(rv64::C_ADD, regR(isn.dst), regR(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add x{dst}, x{src}, x{dst} + state.emit(rv64::SHXADD | rvrs2(regR(isn.dst)) | rvrs1(regR(isn.src)) | (shift << 13) | rvrd(regR(isn.dst))); +#else + //slli x8, x{src}, {shift} + state.emit(rvi(rv64::SLLI, Tmp1Reg, regR(isn.src), shift)); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); +#endif + } + if (isn.dst == RegisterNeedsDisplacement) { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + static void v1_IADD_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); + } + + static void v1_ISUB_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //sub x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32()); //convert to add + //x{dst} = x{dst} + {-imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + static void v1_ISUB_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //sub x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMUL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //mul x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + static void v1_IMUL_M(HANDLER_ARGS) { + 
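+ //IMUL_M: dst = dst * [mem] (low 64 bits of the product)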
state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulhu x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + static void v1_IMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mulhu x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_ISMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulh x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + static void v1_ISMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mulh x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMUL_RCP(HANDLER_ARGS) { + uint64_t divisor = isn.getImm32(); + if (!isZeroOrPowerOf2(divisor)) { + state.registerUsage[isn.dst] = i; + if (state.rcpCount < 4) { + //mul x{dst}, x{dst}, x{rcp} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regRcp(state.rcpCount))); + } + else if (state.rcpCount < 10) { + //fmv.x.d x8, f{rcp} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regRcpF(state.rcpCount))); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + else { + int32_t offset = RcpLiteralsOffset + state.rcpCount * 8; + //ld x8, {offset}(x3) + state.emit(rvi(rv64::LD, Tmp1Reg, LiteralPoolReg, offset)); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + emitRcpLiteral1(state, randomx_reciprocal_fast(divisor)); + } + } + + static void v1_INEG_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //sub x{dst}, x0, x{dst} + state.emit(rvi(rv64::SUB, regR(isn.dst), 0, regR(isn.dst))); + } + + static void v1_IXOR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //xor x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + static void v1_IXOR_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IROR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //ror x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, Tmp1Reg, 0, regR(isn.src))); + //srl x9, x{dst}, x{src} + state.emit(rvi(rv64::SRL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //sll x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SLL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + 
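+ //rotate right by immr == (x >> immr) | (x << imml) with imml = (64 - immr) & 63
+ //C.SLLI encodes the 6-bit shift amount as imm[5] (imml5) and imm[4:0] (imml40)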
int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + static void v1_IROL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //rol x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = -isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, Tmp1Reg, 0, regR(isn.src))); + //sll x9, x{dst}, x{src} + state.emit(rvi(rv64::SLL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //srl x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SRL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t imml = isn.getImm32() & 63; + int32_t immr = -imml & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + static void v1_ISWAP_R(HANDLER_ARGS) { + if (isn.src != isn.dst) { + state.registerUsage[isn.dst] = i; + state.registerUsage[isn.src] = i; + //c.mv x8, x{dst} + state.emit(rvc(rv64::C_MV, Tmp1Reg, regR(isn.dst))); + //c.mv x{dst}, x{src} + state.emit(rvc(rv64::C_MV, regR(isn.dst), regR(isn.src))); + //c.mv x{src}, x8 + state.emit(rvc(rv64::C_MV, regR(isn.src), Tmp1Reg)); + } + } + + static void v1_FSWAP_R(HANDLER_ARGS) { + //fmv.d f24, f{dst_lo} + state.emit(rvi(rv64::FMV_D, Tmp1RegF, regLoF(isn.dst), regLoF(isn.dst))); + //fmv.d f{dst_lo}, f{dst_hi} + state.emit(rvi(rv64::FMV_D, regLoF(isn.dst), regHiF(isn.dst), regHiF(isn.dst))); + //fmv.d f{dst_hi}, f24 + state.emit(rvi(rv64::FMV_D, regHiF(isn.dst), Tmp1RegF, Tmp1RegF)); + } + + static void v1_FADD_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fadd.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fadd.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + static void v1_FADD_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fadd.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fadd.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + static void v1_FSUB_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fsub.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fsub.d 
f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + static void v1_FSUB_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //c.lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //c.lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fsub.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fsub.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + static void v1_FSCAL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fmv.x.d x8, f{dst_lo} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regLoF(isn.dst))); + //fmv.x.d x9, f{dst_hi} + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, regHiF(isn.dst))); + //c.xor x8, x12 + state.emit(rvc(rv64::C_XOR, Tmp1Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //c.xor x9, x12 + state.emit(rvc(rv64::C_XOR, Tmp2Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //fmv.d.x f{dst_lo}, x8 + state.emit(rvi(rv64::FMV_D_X, regLoF(isn.dst), Tmp1Reg)); + //fmv.d.x f{dst_hi}, x9 + state.emit(rvi(rv64::FMV_D_X, regHiF(isn.dst), Tmp2Reg)); + } + + static void v1_FMUL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fmul.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FMUL_D, regLoE(isn.dst), regLoE(isn.dst), regLoA(isn.src))); + //fmul.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FMUL_D, regHiE(isn.dst), regHiE(isn.dst), regHiA(isn.src))); + } + + static void v1_FDIV_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fmv.x.d x8, f24 + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, Tmp1RegF)); + //fmv.x.d x9, f25 + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, Tmp2RegF)); + //c.and x8, x13 + state.emit(rvc(rv64::C_AND, Tmp1Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.and x9, x13 + state.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.or x8, x14 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, MaskEsetLo + OffsetXC)); + //c.or x9, x15 + state.emit(rvc(rv64::C_OR, Tmp2Reg + OffsetXC, MaskEsetHi + OffsetXC)); + //fmv.d.x f24, x8 + state.emit(rvi(rv64::FMV_D_X, Tmp1RegF, Tmp1Reg)); + //fmv.d.x f25, x9 + state.emit(rvi(rv64::FMV_D_X, Tmp2RegF, Tmp2Reg)); + //fdiv.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FDIV_D, regLoE(isn.dst), regLoE(isn.dst), Tmp1RegF)); + //fdiv.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FDIV_D, regHiE(isn.dst), regHiE(isn.dst), Tmp2RegF)); + } + + static void v1_FSQRT_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fsqrt.d f{dst_lo}, f{dst_lo} + state.emit(rvi(rv64::FSQRT_D, regLoE(isn.dst), regLoE(isn.dst))); + //fsqrt.d f{dst_hi}, f{dst_hi} + state.emit(rvi(rv64::FSQRT_D, regHiE(isn.dst), regHiE(isn.dst))); + } + + static void v1_CBRANCH(HANDLER_ARGS) { + int reg = isn.dst; + int target = state.registerUsage[reg] + 1; + int shift = isn.getModCond() + ConditionOffset; + 
int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm |= (1UL << shift); + if (ConditionOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + //x8 = branchMask + emitImm32(state, (int32_t)ConditionMask << shift, Tmp1Reg); + //x{dst} += {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp2Reg); + //and x8, x8, x{dst} + state.emit(rvi(rv64::AND, Tmp1Reg, Tmp1Reg, regR(isn.dst))); + int32_t targetPos = state.instructionOffsets[target]; + int offset = targetPos - state.codePos; + if (offset >= -256) { //C.BEQZ only has a range of 256B + //c.beqz x8, {offset} + int imm8 = 1; //sign bit is always 1 + int imm21 = offset & 6; //offset[2:1] + int imm5 = (offset >> 5) & 1; //offset[5] + int imm43 = offset & 24; //offset[4:3] + int imm76 = (offset >> 3) & 24; //offset[7:6] + state.emit(rvc(rv64::C_BEQZ, imm8, imm43 + (Tmp1Reg + OffsetXC), imm76 + imm21 + imm5)); + } + else if (offset >= -4096) { //BEQ only has a range of 4KB + //beq x8, x0, offset + int imm12 = 1 << 11; //sign bit is always 1 + int imm105 = offset & 2016; //offset[10:5] + int imm41 = offset & 30; //offset[4:1] + int imm11 = (offset >> 11) & 1; //offset[11] + state.emit(rvi(rv64::BEQ, imm41 + imm11, Tmp1Reg, imm12 + imm105)); + } + else { + //c.bnez x8, +6 + state.emit(rvc(rv64::C_BNEZ, Tmp1Reg + OffsetXC, 6)); + //j targetPos + emitJump(state, 0, state.codePos, targetPos); + state.codePos += 4; + } + //mark all registers as used + for (unsigned j = 0; j < RegistersCount; ++j) { + state.registerUsage[j] = i; + } + } + + static void v1_CFROUND(HANDLER_ARGS) { + int32_t imm = (isn.getImm32() - 2) & 63; //-2 to avoid a later left shift to multiply by 4 + if (imm != 0) { +#ifdef __riscv_zbb + //rori x8, x{src}, {imm} + state.emit(rvi(rv64::RORI, Tmp1Reg, regR(isn.src), imm)); +#else + int32_t imml = -imm & 63; + //srli x8, x{src}, {imm} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.src), imm)); + //slli x9, x{src}, {imml} + state.emit(rvi(rv64::SLLI, Tmp2Reg, regR(isn.src), imml)); + //c.or x8, x9 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, Tmp2Reg + OffsetXC)); +#endif + //c.andi x8, 12 + state.emit(rvc(rv64::C_ANDI, Tmp1Reg + OffsetXC, 12)); + } + else { + //and x8, x{src}, 12 + state.emit(rvi(rv64::ANDI, Tmp1Reg, regR(isn.src), 12)); + } + //c.add x8, x3 + state.emit(rvc(rv64::C_ADD, Tmp1Reg, LiteralPoolReg)); + //c.lw x8, 64(x8) + state.emit(rvc(rv64::C_LW, Tmp1Reg + OffsetXC, 8 + Tmp1Reg + OffsetXC)); + //fsrm x8 + state.emit(rvi(rv64::FSRM, 0, Tmp1Reg, 0)); + } + + static void v1_ISTORE(HANDLER_ARGS) { + genAddressRegDst(state, isn); + //sd x{src}, 0(x9) + state.emit(rvi(rv64::SD, 0, Tmp2Reg, regR(isn.src))); + } + + static void v1_NOP(HANDLER_ARGS) { + } +} + +#include "instruction_weights.hpp" + +namespace { + +#define INST_HANDLE1(x) REPN(&randomx::v1_##x, WT(x)) +#define INST_HANDLE2(x) REPN(&randomx::v2_##x, WT(x)) + + InstructionHandler* opcodeMap1[256] = { + INST_HANDLE1(IADD_RS) + INST_HANDLE1(IADD_M) + INST_HANDLE1(ISUB_R) + INST_HANDLE1(ISUB_M) + INST_HANDLE1(IMUL_R) + INST_HANDLE1(IMUL_M) + INST_HANDLE1(IMULH_R) + INST_HANDLE1(IMULH_M) + INST_HANDLE1(ISMULH_R) + INST_HANDLE1(ISMULH_M) + INST_HANDLE1(IMUL_RCP) + INST_HANDLE1(INEG_R) + INST_HANDLE1(IXOR_R) + INST_HANDLE1(IXOR_M) + INST_HANDLE1(IROR_R) + INST_HANDLE1(IROL_R) + INST_HANDLE1(ISWAP_R) + INST_HANDLE1(FSWAP_R) + INST_HANDLE1(FADD_R) + INST_HANDLE1(FADD_M) + INST_HANDLE1(FSUB_R) + INST_HANDLE1(FSUB_M) + INST_HANDLE1(FSCAL_R) + INST_HANDLE1(FMUL_R) + INST_HANDLE1(FDIV_M) + INST_HANDLE1(FSQRT_R) + INST_HANDLE1(CBRANCH) + 
INST_HANDLE1(CFROUND) + INST_HANDLE1(ISTORE) + INST_HANDLE1(NOP) + }; +} \ No newline at end of file diff --git a/src/jit_compiler_rv64.hpp b/src/jit_compiler_rv64.hpp new file mode 100644 index 00000000..aaae57e3 --- /dev/null +++ b/src/jit_compiler_rv64.hpp @@ -0,0 +1,69 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include +#include "jit_compiler.hpp" + +namespace randomx { + + class Program; + struct ProgramConfiguration; + class SuperscalarProgram; + class Instruction; + + class JitCompilerRV64 { + public: + JitCompilerRV64(); + ~JitCompilerRV64(); + void generateProgram(Program&, ProgramConfiguration&); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); + void generateSuperscalarHash(SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES], std::vector&); + void generateDatasetInitCode() {} + ProgramFunc* getProgramFunc() { + return (ProgramFunc*)entryProgram; + } + DatasetInitFunc* getDatasetInitFunc() { + return (DatasetInitFunc*)entryDataInit; + } + uint8_t* getCode() { + return state.code; + } + size_t getCodeSize(); + void enableWriting(); + void enableExecution(); + void enableAll(); + private: + CompilerState state; + void* entryDataInit; + void* entryProgram; + }; +} diff --git a/src/jit_compiler_rv64_static.S b/src/jit_compiler_rv64_static.S new file mode 100644 index 00000000..5ecb4815 --- /dev/null +++ b/src/jit_compiler_rv64_static.S @@ -0,0 +1,1235 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define DECL(x) x + +.text +.option rvc + +#include "configuration.h" + +.global DECL(randomx_riscv64_literals) +.global DECL(randomx_riscv64_literals_end) +.global DECL(randomx_riscv64_data_init) +.global DECL(randomx_riscv64_fix_data_call) +.global DECL(randomx_riscv64_prologue) +.global DECL(randomx_riscv64_loop_begin) +.global DECL(randomx_riscv64_data_read) +.global DECL(randomx_riscv64_data_read_light) +.global DECL(randomx_riscv64_fix_loop_call) +.global DECL(randomx_riscv64_spad_store) +.global DECL(randomx_riscv64_spad_store_hardaes) +.global DECL(randomx_riscv64_spad_store_softaes) +.global DECL(randomx_riscv64_loop_end) +.global DECL(randomx_riscv64_fix_continue_loop) +.global DECL(randomx_riscv64_epilogue) +.global DECL(randomx_riscv64_softaes) +.global DECL(randomx_riscv64_program_end) +.global DECL(randomx_riscv64_ssh_init) +.global DECL(randomx_riscv64_ssh_load) +.global DECL(randomx_riscv64_ssh_prefetch) +.global DECL(randomx_riscv64_ssh_end) + +/* The literal pool can fit at most 494 IMUL_RCP literals */ +#if RANDOMX_PROGRAM_SIZE > 494 + #error RANDOMX_PROGRAM_SIZE larger than 494 is not supported. +#endif + +#define RANDOMX_CACHE_MASK (RANDOMX_ARGON_MEMORY*16-1) + +/* shared literal pool: 4 KB */ + /* space for 256 IMUL_RCP literals -2048 */ + /* filled by JIT compiler */ +DECL(randomx_riscv64_literals): +literal_pool: + /* SuperscalarHash constants +0 */ + .dword 6364136223846793005 + .dword 9298411001130361340 + .dword 12065312585734608966 + .dword 9306329213124626780 + .dword 5281919268842080866 + .dword 10536153434571861004 + .dword 3398623926847679864 + .dword 9549104520008361294 + /* CFROUND lookup table +64 */ + .word 0x00000000 /* RTN */ + .word 0x00000002 /* RDN */ + .word 0x00000003 /* RUP */ + .word 0x00000001 /* RTZ */ + /* mask literals +80,+84,+88,+92,+96,+104 */ + .word (RANDOMX_SCRATCHPAD_L1-8) + .word (RANDOMX_SCRATCHPAD_L2-8) + .word (RANDOMX_SCRATCHPAD_L3-64) + .word (RANDOMX_DATASET_BASE_SIZE-64) + .dword 0x80f0000000000000 + .dword 0x00ffffffffffffff +DECL(randomx_riscv64_literals_end): + /* E reg. 
set masks, +112,+120 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* soft AES table addresses, +128,+136 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* space for 238 IMUL_RCP literals, +144 */ + .fill 238,8,0 /* filled by JIT compiler */ + +/* ================================= */ +/* Dataset init function entry point */ +/* ================================= */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/return address + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> dataset pointer + x6 -> cache pointer + x7 -> temp/itemNumber + x8-x15 -> SuperscalarHash registers + x16 -> itemNumber + x17 -> endItem + x28-x31 -> temp + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> saved x3 + 16 -> saved x8-x9 + 32 -> caller stack +*/ +DECL(randomx_riscv64_data_init): + addi sp, sp, -32 + /* dataset ptr */ + mv x5, x11 + /* cache->memory */ + ld x6, 0(x10) + /* callee saved registers */ + sd x1, 0(sp) + sd x3, 8(sp) + /* literal pool */ + lla x3, literal_pool + sd x8, 16(sp) + sd x9, 24(sp) + /* startItem */ + mv x16, x12 + /* endItem */ + mv x17, x13 +init_item: + mv x7, x16 +DECL(randomx_riscv64_fix_data_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + sd x8, 0(x5) + sd x9, 8(x5) + sd x10, 16(x5) + sd x11, 24(x5) + sd x12, 32(x5) + sd x13, 40(x5) + sd x14, 48(x5) + sd x15, 56(x5) + addi x5, x5, 64 + addi x16, x16, 1 + bltu x16, x17, init_item + ld x1, 0(sp) + ld x3, 8(sp) + ld x8, 16(sp) + ld x9, 24(sp) + addi sp, sp, 32 + ret + +/* ====================================== */ +/* Program execution function entry point */ +/* ====================================== */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/scratchpad L3 mask + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> scratchpad pointer + x6 -> dataset/cache pointer + x7 -> temp/next dataset access + x8 -> temp + x9 -> temp + x10 -> scratchpad L1 mask (0x0000000000003ff8) + x11 -> scratchpad L2 mask (0x000000000003fff8) + x12 -> FSCAL_R mask (0x80f0000000000000) + x13 -> E reg. clear mask (0x00ffffffffffffff) + x14 -> E reg. set mask (0x3*00000000******) + x15 -> E reg. set mask (0x3*00000000******) + x16-x23 -> VM registers "r0"-"r7" + x24 -> iteration counter "ic" + x25 -> VM registers "mx", "ma" + x26 -> spAddr0 + x27 -> spAddr1 + x28-x31 -> temp/literals for IMUL_RCP (4x) + + (Note: We avoid using x4 because it breaks debugging with gdb.) 
+ + f0-f7 -> VM registers "f0"-"f3" + f8-f15 -> VM registers "e0"-"e3" + f16-f23 -> VM registers "a0"-"a3" + f24-f25 -> temp + f26-f31 -> literals for IMUL_RCP (6x) + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> register file ptr + 16 -> saved x3-x4 + 32 -> saved x8-x9 + 48 -> saved x18-x27 + 128 -> saved f8-f9 + 144 -> saved f18-f27 + 224 -> caller stack +*/ + +DECL(randomx_riscv64_prologue): + addi sp, sp, -224 + /* scratchpad pointer */ + mv x5, x12 + /* register file pointer */ + sd x10, 8(sp) + /* callee saved registers */ + sd x3, 16(sp) + sd x8, 32(sp) + sd x9, 40(sp) + sd x18, 48(sp) + sd x19, 56(sp) + sd x20, 64(sp) + sd x21, 72(sp) + sd x22, 80(sp) + sd x23, 88(sp) + sd x24, 96(sp) + sd x25, 104(sp) + sd x26, 112(sp) + sd x27, 120(sp) + fsd f8, 128(sp) + fsd f9, 136(sp) + fsd f18, 144(sp) + fsd f19, 152(sp) + fsd f20, 160(sp) + fsd f21, 168(sp) + fsd f22, 176(sp) + fsd f23, 184(sp) + fsd f24, 192(sp) + fsd f25, 200(sp) + fsd f26, 208(sp) + fsd f27, 216(sp) + /* iteration counter */ + mv x24, x13 + /* return address */ + sd x1, 0(sp) + /* literal pool */ + lla x3, literal_pool + /* load (ma, mx) */ + ld x25, 0(x11) + /* dataset ptr */ + ld x6, 8(x11) + /* load dataset mask */ + lwu x1, 92(x3) + /* zero registers r0-r3, load a0-a1 */ + li x16, 0 + fld f16, 192(x10) + li x17, 0 + fld f17, 200(x10) + srli x7, x25, 32 /* x7 = ma */ + li x18, 0 + fld f18, 208(x10) + mv x27, x7 /* x27 = ma */ + li x19, 0 + fld f19, 216(x10) + /* set dataset read address */ + and x7, x7, x1 + add x7, x7, x6 + /* zero registers r4-r7, load a2-a3 */ + li x20, 0 + fld f20, 224(x10) + li x21, 0 + fld f21, 232(x10) + li x22, 0 + fld f22, 240(x10) + li x23, 0 + fld f23, 248(x10) + /* load L3 mask */ + lwu x1, 88(x3) + /* load scratchpad masks */ + lwu x10, 80(x3) + lwu x11, 84(x3) + /* set spAddr0, spAddr1 */ + and x26, x25, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* FSCAL, E reg. 
masks */ + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + /* IMUL_RCP literals */ + fld f26, 176(x3) + fld f27, 184(x3) + fld f28, 192(x3) + fld f29, 200(x3) + fld f30, 208(x3) + fld f31, 216(x3) + +.balign 4 +DECL(randomx_riscv64_loop_begin): +loop_begin: + /* mix integer registers */ + ld x8, 0(x26) + ld x9, 8(x26) + ld x30, 16(x26) + ld x31, 24(x26) + xor x16, x16, x8 + ld x8, 32(x26) + xor x17, x17, x9 + ld x9, 40(x26) + xor x18, x18, x30 + ld x30, 48(x26) + xor x19, x19, x31 + ld x31, 56(x26) + xor x20, x20, x8 + lw x8, 0(x27) + xor x21, x21, x9 + lw x9, 4(x27) + xor x22, x22, x30 + lw x30, 8(x27) + xor x23, x23, x31 + lw x31, 12(x27) + /* load F registers */ + fcvt.d.w f0, x8 + lw x8, 16(x27) + fcvt.d.w f1, x9 + lw x9, 20(x27) + fcvt.d.w f2, x30 + lw x30, 24(x27) + fcvt.d.w f3, x31 + lw x31, 28(x27) + fcvt.d.w f4, x8 + lw x8, 32(x27) + fcvt.d.w f5, x9 + lw x9, 36(x27) + fcvt.d.w f6, x30 + lw x30, 40(x27) + fcvt.d.w f7, x31 + lw x31, 44(x27) + /* load E registers */ + fcvt.d.w f8, x8 + lw x8, 48(x27) + fcvt.d.w f9, x9 + lw x9, 52(x27) + fcvt.d.w f10, x30 + lw x30, 56(x27) + fcvt.d.w f11, x31 + lw x31, 60(x27) + fcvt.d.w f12, x8 + fmv.x.d x8, f8 + fcvt.d.w f13, x9 + fmv.x.d x9, f9 + fcvt.d.w f14, x30 + fmv.x.d x30, f10 + fcvt.d.w f15, x31 + fmv.x.d x31, f11 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f8, x8 + fmv.d.x f9, x9 + fmv.d.x f10, x30 + fmv.d.x f11, x31 + fmv.x.d x8, f12 + fmv.x.d x9, f13 + fmv.x.d x30, f14 + fmv.x.d x31, f15 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + fmv.d.x f12, x8 + fmv.d.x f13, x9 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f14, x30 + fmv.d.x f15, x31 + /* reload clobbered IMUL_RCP regs */ + ld x28, 144(x3) + ld x29, 152(x3) + ld x30, 160(x3) + ld x31, 168(x3) + +DECL(randomx_riscv64_data_read): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset mask */ + lwu x1, 92(x3) + /* zero-extend x8 */ +#ifdef __riscv_zba + zext.w x8, x8 +#else + slli x8, x8, 32 + srli x8, x8, 32 +#endif + /* update "mx" */ + xor x25, x25, x8 + /* read dataset and update registers */ + ld x8, 0(x7) + ld x9, 8(x7) + ld x30, 16(x7) + ld x31, 24(x7) + xor x16, x16, x8 + ld x8, 32(x7) + xor x17, x17, x9 + ld x9, 40(x7) + xor x18, x18, x30 + ld x30, 48(x7) + xor x19, x19, x31 + ld x31, 56(x7) + xor x20, x20, x8 + /* calculate the next dataset address */ + and x7, x25, x1 + xor x21, x21, x9 + add x7, x7, x6 + xor x22, x22, x30 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + xor x23, x23, x31 + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x9, x25, 32 + slli x25, x25, 32 + or x25, x25, x9 +#endif + +DECL(randomx_riscv64_data_read_light): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset offset */ + lui x9, 0x02000 /* JIT compiler will adjust the immediate */ + addi x9, x9, -64 + /* load dataset mask */ + lwu x1, 92(x3) + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x31, x25, 32 + slli x25, x25, 32 + or x25, x25, x31 +#endif + slli x8, x8, 32 + /* update "mx" */ + xor x25, x25, x8 + /* the next dataset item */ + and x7, x25, x1 + srli x7, x7, 6 + add x7, x7, x9 +DECL(randomx_riscv64_fix_loop_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + xor x16, x16, x8 + xor x17, x17, x9 + xor x18, x18, x10 + xor x19, x19, x11 
+ xor x20, x20, x12 + xor x21, x21, x13 + xor x22, x22, x14 + xor x23, x23, x15 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_spad_store): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* XOR and store f0,e0 */ + fmv.x.d x8, f0 + fmv.x.d x9, f8 + fmv.x.d x30, f1 + fmv.x.d x31, f9 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 0(x26) + fmv.d.x f0, x8 + sd x30, 8(x26) + fmv.d.x f1, x30 + /* XOR and store f1,e1 */ + fmv.x.d x8, f2 + fmv.x.d x9, f10 + fmv.x.d x30, f3 + fmv.x.d x31, f11 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 16(x26) + fmv.d.x f2, x8 + sd x30, 24(x26) + fmv.d.x f3, x30 + /* XOR and store f2,e2 */ + fmv.x.d x8, f4 + fmv.x.d x9, f12 + fmv.x.d x30, f5 + fmv.x.d x31, f13 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 32(x26) + fmv.d.x f4, x8 + sd x30, 40(x26) + fmv.d.x f5, x30 + /* XOR and store f3,e3 */ + fmv.x.d x8, f6 + fmv.x.d x9, f14 + fmv.x.d x30, f7 + fmv.x.d x31, f15 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 48(x26) + fmv.d.x f6, x8 + sd x30, 56(x26) + fmv.d.x f7, x30 + +DECL(randomx_riscv64_spad_store_hardaes): + nop /* not implemented */ + +DECL(randomx_riscv64_spad_store_softaes): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* process f0 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f0 + fmv.x.d x31, f1 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 0(x26) + fmv.d.x f0, x30 + sd x31, 8(x26) + fmv.d.x f1, x31 + /* process f1 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f2 + fmv.x.d x31, f3 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 16(x26) + fmv.d.x f2, x30 + sd x31, 24(x26) + fmv.d.x f3, x31 + /* process f2 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f4 + fmv.x.d x31, f5 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 32(x26) + fmv.d.x f4, x30 + sd x31, 40(x26) + fmv.d.x f5, x31 + /* process f3 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f6 + fmv.x.d x31, f7 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 48(x26) + fmv.d.x f6, x30 + sd x31, 56(x26) + fmv.d.x f7, x31 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_loop_end): + xor x26, x16, x18 /* JIT compiler will adjust the registers */ + /* load L3 mask */ + lwu x1, 88(x3) + addi x24, x24, -1 + srli x27, x26, 32 + /* set spAddr0, spAddr1 */ + and x26, x26, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* conditional branch doesn't have sufficient range */ + j condition_check 
+DECL(randomx_riscv64_fix_continue_loop): +continue_loop: + .word 0 /* JIT compiler will write a jump to loop_begin */ +condition_check: + bnez x24, continue_loop + +DECL(randomx_riscv64_epilogue): + /* restore callee saved registers */ + ld x10, 8(sp) + ld x1, 0(sp) + ld x3, 16(sp) + ld x8, 32(sp) + ld x9, 40(sp) + ld x24, 96(sp) + ld x25, 104(sp) + ld x26, 112(sp) + ld x27, 120(sp) + fld f18, 144(sp) + fld f19, 152(sp) + fld f20, 160(sp) + fld f21, 168(sp) + fld f22, 176(sp) + fld f23, 184(sp) + fld f24, 192(sp) + fld f25, 200(sp) + fld f26, 208(sp) + fld f27, 216(sp) + /* save VM registers */ + sd x16, 0(x10) + sd x17, 8(x10) + sd x18, 16(x10) + sd x19, 24(x10) + sd x20, 32(x10) + sd x21, 40(x10) + sd x22, 48(x10) + sd x23, 56(x10) + fsd f0, 64(x10) + fsd f1, 72(x10) + fsd f2, 80(x10) + fsd f3, 88(x10) + fsd f4, 96(x10) + fsd f5, 104(x10) + fsd f6, 112(x10) + fsd f7, 120(x10) + fsd f8, 128(x10) + fsd f9, 136(x10) + fsd f10, 144(x10) + fsd f11, 152(x10) + fsd f12, 160(x10) + fsd f13, 168(x10) + fsd f14, 176(x10) + fsd f15, 184(x10) + /* restore callee saved registers */ + ld x18, 48(sp) + ld x19, 56(sp) + ld x20, 64(sp) + ld x21, 72(sp) + ld x22, 80(sp) + ld x23, 88(sp) + fld f8, 128(sp) + fld f9, 136(sp) + /* restore stack pointer */ + addi sp, sp, 224 + /* return */ + ret + +/* + Soft AES subroutines + in: + x3 = literal pool + x8, x10 = round key + x30, x31 = plaintext + out: + x30, x31 = ciphertext + clobbers: + x8-x11 (limbs) + x12-x13 (LUTs) + x14-x15 (temp) +*/ +DECL(randomx_riscv64_softaes): +softaes_enc: + /* enc. lookup table */ + ld x13, 128(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + zext.b x14, x30 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 3 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 5 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 7 */ + zext.b x15, x30 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 9 */ + zext.b x15, x31 + srli 
x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 10 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 11 */ + zext.b x15, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 13 */ + zext.b x15, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 15 */ + zext.b x15, x31 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x30, x30, x15 + + ret + +softaes_dec: + /* dec. lookup table */ + ld x13, 136(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + zext.b x14, x30 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 3 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 5 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 7 */ + zext.b x15, x30 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 9 */ + zext.b x15, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, 
x14 + + /* byte 10 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 11 */ + zext.b x15, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 13 */ + zext.b x15, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 15 */ + zext.b x15, x31 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x31, x31, x15 + + ret + +DECL(randomx_riscv64_program_end): + nop + + +/* literal pool for SuperscalarHash */ + /* space for remaining IMUL_RCP literals */ +ssh_literal_pool: + /* space for 256 IMUL_RCP literals */ + .fill 256,8,0 + +/* + SuperscalarHash subroutine + in: + x3 = literal pool + x6 = cache + x7 = itemNumber + out: + x8-x15 = 64-byte hash + clobbers: + x7, x28-x31 +*/ +DECL(randomx_riscv64_ssh_init): +superscalar_hash: + ld x30, 0(x3) /* superscalarMul0 */ + addi x8, x7, 1 + ld x9, 8(x3) + li x31, RANDOMX_CACHE_MASK + ld x10, 16(x3) + ld x11, 24(x3) + mul x8, x8, x30 + ld x12, 32(x3) + ld x13, 40(x3) + lla x30, ssh_literal_pool + ld x14, 48(x3) + and x7, x7, x31 + ld x15, 56(x3) + slli x7, x7, 6 + xor x9, x9, x8 + add x7, x7, x6 + xor x10, x10, x8 + /* load the first IMUL_RCP literal */ + ld x31, 2040(x30) + xor x11, x11, x8 + xor x12, x12, x8 + xor x13, x13, x8 + xor x14, x14, x8 + xor x15, x15, x8 + +DECL(randomx_riscv64_ssh_load): + ld x28, 0(x7) + ld x29, 8(x7) + xor x8, x8, x28 + ld x28, 16(x7) + xor x9, x9, x29 + ld x29, 24(x7) + xor x10, x10, x28 + ld x28, 32(x7) + xor x11, x11, x29 + ld x29, 40(x7) + xor x12, x12, x28 + ld x28, 48(x7) + xor x13, x13, x29 + ld x29, 56(x7) + xor x14, x14, x28 + li x7, RANDOMX_CACHE_MASK + xor x15, x15, x29 + +DECL(randomx_riscv64_ssh_prefetch): + and x7, x8, x7 /* JIT compiler will adjust the register */ + slli x7, x7, 6 + add x7, x7, x6 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + +DECL(randomx_riscv64_ssh_end): + nop diff --git a/src/jit_compiler_rv64_static.hpp b/src/jit_compiler_rv64_static.hpp new file mode 100644 index 00000000..656623c7 --- /dev/null +++ b/src/jit_compiler_rv64_static.hpp @@ -0,0 +1,53 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern "C" { + void randomx_riscv64_literals(); + void randomx_riscv64_literals_end(); + void randomx_riscv64_data_init(); + void randomx_riscv64_fix_data_call(); + void randomx_riscv64_prologue(); + void randomx_riscv64_loop_begin(); + void randomx_riscv64_data_read(); + void randomx_riscv64_data_read_light(); + void randomx_riscv64_fix_loop_call(); + void randomx_riscv64_spad_store(); + void randomx_riscv64_spad_store_hardaes(); + void randomx_riscv64_spad_store_softaes(); + void randomx_riscv64_loop_end(); + void randomx_riscv64_fix_continue_loop(); + void randomx_riscv64_epilogue(); + void randomx_riscv64_softaes(); + void randomx_riscv64_program_end(); + void randomx_riscv64_ssh_init(); + void randomx_riscv64_ssh_load(); + void randomx_riscv64_ssh_prefetch(); + void randomx_riscv64_ssh_end(); +} diff --git a/src/tests/riscv64_zba.s b/src/tests/riscv64_zba.s new file mode 100644 index 00000000..e1947e7a --- /dev/null +++ b/src/tests/riscv64_zba.s @@ -0,0 +1,9 @@ +/* RISC-V - test if the Zba extension is present */ + +.text +.global main + +main: + sh1add x6, x6, x7 + li x10, 0 + ret diff --git a/src/tests/riscv64_zbb.s b/src/tests/riscv64_zbb.s new file mode 100644 index 00000000..d922043f --- /dev/null +++ b/src/tests/riscv64_zbb.s @@ -0,0 +1,9 @@ +/* RISC-V - test if the Zbb extension is present */ + +.text +.global main + +main: + ror x6, x6, x7 + li x10, 0 + ret From f72101aa2c54cf15dba8f7aebb24fa5ce2961de1 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Thu, 19 Oct 2023 15:16:47 +0200 Subject: [PATCH 08/13] ARM64 JIT: don't use `x18` register --- src/jit_compiler_a64.cpp | 54 +++++++++---------- src/jit_compiler_a64_static.S | 98 +++++++++++++++++------------------ 2 files changed, 75 insertions(+), 77 deletions(-) diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index 91e31d64..0c557662 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -130,8 +130,8 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con // and w16, w10, ScratchpadL3Mask64 emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); - // and w17, w18, ScratchpadL3Mask64 - emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + // and w17, w20, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (20 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); codePos = PrologueSize; literalPos = ImulRcpLiteralsEnd; @@ -149,16 +149,16 @@ void JitCompilerA64::generateProgram(Program& program, 
ProgramConfiguration& con } // Update spMix2 - // eor w18, config.readReg2, config.readReg3 - emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + // eor w20, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); // Jump back to the main loop const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; emit32(ARMV8A::B | (offset / 4), code, codePos); - // and w18, w18, CacheLineAlignMask + // and w20, w20, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64)); - emit32(0x121A0000 | 18 | (18 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); + emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); // and w10, w10, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64)); @@ -181,8 +181,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration // and w16, w10, ScratchpadL3Mask64 emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); - // and w17, w18, ScratchpadL3Mask64 - emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + // and w17, w20, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (20 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); codePos = PrologueSize; literalPos = ImulRcpLiteralsEnd; @@ -200,8 +200,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration } // Update spMix2 - // eor w18, config.readReg2, config.readReg3 - emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + // eor w20, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); // Jump back to the main loop const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos; @@ -434,7 +434,7 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, } else { - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMovImmediate(tmp_reg, imm, code, k); // add dst, src, tmp_reg @@ -483,7 +483,7 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co uint32_t k = codePos; uint32_t imm = instr.getImm32(); - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 19; imm &= instr.getModMem() ? 
(RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); emitAddImmediate(tmp_reg, src, imm, code, k); @@ -537,7 +537,7 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // add dst, dst, tmp_reg @@ -575,7 +575,7 @@ void JitCompilerA64::h_ISUB_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // sub dst, dst, tmp_reg @@ -594,7 +594,7 @@ void JitCompilerA64::h_IMUL_R(Instruction& instr, uint32_t& codePos) if (src == dst) { - src = 18; + src = 20; emitMovImmediate(src, instr.getImm32(), code, k); } @@ -612,7 +612,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // sub dst, dst, tmp_reg @@ -643,7 +643,7 @@ void JitCompilerA64::h_IMULH_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // umulh dst, dst, tmp_reg @@ -674,7 +674,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // smulh dst, dst, tmp_reg @@ -692,7 +692,7 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) uint32_t k = codePos; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; const uint32_t dst = IntRegMap[instr.dst]; constexpr uint64_t N = 1ULL << 63; @@ -711,9 +711,9 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) literalPos -= sizeof(uint64_t); *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); - if (literal_id < 13) + if (literal_id < 12) { - static constexpr uint32_t literal_regs[13] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 20 << 16, 11 << 16, 0 }; + static constexpr uint32_t literal_regs[12] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 11 << 16, 0 }; // mul dst, dst, literal_reg emit32(ARMV8A::MUL | dst | (dst << 5) | literal_regs[literal_id], code, k); @@ -751,7 +751,7 @@ void JitCompilerA64::h_IXOR_R(Instruction& instr, uint32_t& codePos) if (src == dst) { - src = 18; + src = 20; emitMovImmediate(src, instr.getImm32(), code, k); } @@ -769,7 +769,7 @@ void JitCompilerA64::h_IXOR_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // eor dst, dst, tmp_reg @@ -807,7 +807,7 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos) if (src != dst) { - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; // sub tmp_reg, xzr, src emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k); @@ 
-835,7 +835,7 @@ void JitCompilerA64::h_ISWAP_R(Instruction& instr, uint32_t& codePos) uint32_t k = codePos; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k); emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k); emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k); @@ -984,7 +984,7 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; constexpr uint32_t fpcr_tmp_reg = 8; // ror tmp_reg, src, imm @@ -1008,7 +1008,7 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; uint32_t imm = instr.getImm32(); diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S index 4886fcf3..bc146133 100644 --- a/src/jit_compiler_a64_static.S +++ b/src/jit_compiler_a64_static.S @@ -74,9 +74,9 @@ # x15 -> "r7" # x16 -> spAddr0 # x17 -> spAddr1 -# x18 -> temporary +# x18 -> unused (platform register, don't touch it) # x19 -> temporary -# x20 -> literal for IMUL_RCP +# x20 -> temporary # x21 -> literal for IMUL_RCP # x22 -> literal for IMUL_RCP # x23 -> literal for IMUL_RCP @@ -111,7 +111,7 @@ DECL(randomx_program_aarch64): # Save callee-saved registers sub sp, sp, 192 stp x16, x17, [sp] - stp x18, x19, [sp, 16] + str x19, [sp, 16] stp x20, x21, [sp, 32] stp x22, x23, [sp, 48] stp x24, x25, [sp, 64] @@ -166,7 +166,6 @@ DECL(randomx_program_aarch64): # Read literals ldr x0, literal_x0 ldr x11, literal_x11 - ldr x20, literal_x20 ldr x21, literal_x21 ldr x22, literal_x22 ldr x23, literal_x23 @@ -198,11 +197,11 @@ DECL(randomx_program_aarch64): DECL(randomx_program_aarch64_main_loop): # spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; - lsr x18, x10, 32 + lsr x20, x10, 32 # Actual mask will be inserted by JIT compiler and w16, w10, 1 - and w17, w18, 1 + and w17, w20, 1 # x16 = scratchpad + spAddr0 # x17 = scratchpad + spAddr1 @@ -210,31 +209,31 @@ DECL(randomx_program_aarch64_main_loop): add x17, x17, x2 # xor integer registers with scratchpad data (spAddr0) - ldp x18, x19, [x16] - eor x4, x4, x18 + ldp x20, x19, [x16] + eor x4, x4, x20 eor x5, x5, x19 - ldp x18, x19, [x16, 16] - eor x6, x6, x18 + ldp x20, x19, [x16, 16] + eor x6, x6, x20 eor x7, x7, x19 - ldp x18, x19, [x16, 32] - eor x12, x12, x18 + ldp x20, x19, [x16, 32] + eor x12, x12, x20 eor x13, x13, x19 - ldp x18, x19, [x16, 48] - eor x14, x14, x18 + ldp x20, x19, [x16, 48] + eor x14, x14, x20 eor x15, x15, x19 # Load group F registers (spAddr1) - ldpsw x18, x19, [x17] - ins v16.d[0], x18 + ldpsw x20, x19, [x17] + ins v16.d[0], x20 ins v16.d[1], x19 - ldpsw x18, x19, [x17, 8] - ins v17.d[0], x18 + ldpsw x20, x19, [x17, 8] + ins v17.d[0], x20 ins v17.d[1], x19 - ldpsw x18, x19, [x17, 16] - ins v18.d[0], x18 + ldpsw x20, x19, [x17, 16] + ins v18.d[0], x20 ins v18.d[1], x19 - ldpsw x18, x19, [x17, 24] - ins v19.d[0], x18 + ldpsw x20, x19, [x17, 24] + ins v19.d[0], x20 ins v19.d[1], x19 scvtf v16.2d, v16.2d scvtf v17.2d, v17.2d @@ -242,17 +241,17 @@ DECL(randomx_program_aarch64_main_loop): scvtf v19.2d, v19.2d # Load group E registers (spAddr1) - ldpsw x18, x19, [x17, 32] - ins v20.d[0], x18 + ldpsw x20, x19, [x17, 32] + ins v20.d[0], x20 ins v20.d[1], x19 - ldpsw x18, x19, [x17, 40] - ins v21.d[0], x18 + ldpsw x20, x19, 
[x17, 40] + ins v21.d[0], x20 ins v21.d[1], x19 - ldpsw x18, x19, [x17, 48] - ins v22.d[0], x18 + ldpsw x20, x19, [x17, 48] + ins v22.d[0], x20 ins v22.d[1], x19 - ldpsw x18, x19, [x17, 56] - ins v23.d[0], x18 + ldpsw x20, x19, [x17, 56] + ins v23.d[0], x20 ins v23.d[1], x19 scvtf v20.2d, v20.2d scvtf v21.2d, v21.2d @@ -276,7 +275,6 @@ DECL(randomx_program_aarch64_vm_instructions): literal_x0: .fill 1,8,0 literal_x11: .fill 1,8,0 -literal_x20: .fill 1,8,0 literal_x21: .fill 1,8,0 literal_x22: .fill 1,8,0 literal_x23: .fill 1,8,0 @@ -312,17 +310,17 @@ DECL(randomx_program_aarch64_vm_instructions_end): lsr x10, x9, 32 # mx ^= r[readReg2] ^ r[readReg3]; - eor x9, x9, x18 + eor x9, x9, x20 # Calculate dataset pointer for dataset prefetch - mov w18, w9 + mov w20, w9 DECL(randomx_program_aarch64_cacheline_align_mask1): # Actual mask will be inserted by JIT compiler - and x18, x18, 1 - add x18, x18, x1 + and x20, x20, 1 + add x20, x20, x1 # Prefetch dataset data - prfm pldl2strm, [x18] + prfm pldl2strm, [x20] # mx <-> ma ror x9, x9, 32 @@ -335,17 +333,17 @@ DECL(randomx_program_aarch64_cacheline_align_mask2): DECL(randomx_program_aarch64_xor_with_dataset_line): rx_program_xor_with_dataset_line: # xor integer registers with dataset data - ldp x18, x19, [x10] - eor x4, x4, x18 + ldp x20, x19, [x10] + eor x4, x4, x20 eor x5, x5, x19 - ldp x18, x19, [x10, 16] - eor x6, x6, x18 + ldp x20, x19, [x10, 16] + eor x6, x6, x20 eor x7, x7, x19 - ldp x18, x19, [x10, 32] - eor x12, x12, x18 + ldp x20, x19, [x10, 32] + eor x12, x12, x20 eor x13, x13, x19 - ldp x18, x19, [x10, 48] - eor x14, x14, x18 + ldp x20, x19, [x10, 48] + eor x14, x14, x20 eor x15, x15, x19 DECL(randomx_program_aarch64_update_spMix1): @@ -388,7 +386,7 @@ DECL(randomx_program_aarch64_update_spMix1): # Restore callee-saved registers ldp x16, x17, [sp] - ldp x18, x19, [sp, 16] + ldr x19, [sp, 16] ldp x20, x21, [sp, 32] ldp x22, x23, [sp, 48] ldp x24, x25, [sp, 64] @@ -409,7 +407,7 @@ DECL(randomx_program_aarch64_vm_instructions_end_light): stp x2, x30, [sp, 80] # mx ^= r[readReg2] ^ r[readReg3]; - eor x9, x9, x18 + eor x9, x9, x20 # mx <-> ma ror x9, x9, 32 @@ -451,8 +449,8 @@ DECL(randomx_program_aarch64_light_dataset_offset): # x3 -> end item DECL(randomx_init_dataset_aarch64): - # Save x30 (return address) - str x30, [sp, -16]! + # Save x20 (used as temporary, but must be saved to not break ABI) and x30 (return address) + stp x20, x30, [sp, -16]! 
# Load pointer to cache memory ldr x0, [x0] @@ -464,8 +462,8 @@ DECL(randomx_init_dataset_aarch64_main_loop): cmp x2, x3 bne DECL(randomx_init_dataset_aarch64_main_loop) - # Restore x30 (return address) - ldr x30, [sp], 16 + # Restore x20 and x30 + ldp x20, x30, [sp], 16 ret From 8f91d31b8b055bbece3f88b9d88cfe8c9cfc8133 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Fri, 20 Oct 2023 09:04:35 +0200 Subject: [PATCH 09/13] Fixed UB in ARM64 JIT compiler Fixed unaligned memory writes --- src/jit_compiler_a64.cpp | 3 ++- src/jit_compiler_a64.hpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index 0c557662..75ea8ccd 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -709,7 +709,8 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t); literalPos -= sizeof(uint64_t); - *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); + const uint64_t randomx_reciprocal = (q << shift) + ((r << shift) / divisor); + memcpy(code + literalPos, &randomx_reciprocal, sizeof(randomx_reciprocal)); if (literal_id < 12) { diff --git a/src/jit_compiler_a64.hpp b/src/jit_compiler_a64.hpp index a02824ff..f8484c08 100644 --- a/src/jit_compiler_a64.hpp +++ b/src/jit_compiler_a64.hpp @@ -81,7 +81,7 @@ namespace randomx { static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos) { - *(uint64_t*)(code + codePos) = val; + memcpy(code + codePos, &val, sizeof(val)); codePos += sizeof(val); } From 5c49ab12a071df4d0fd4b5f0d91c4c0c3180b74d Mon Sep 17 00:00:00 2001 From: SChernykh Date: Fri, 20 Oct 2023 10:54:25 +0200 Subject: [PATCH 10/13] Optimized randomx_reciprocal Also limited it to 32 bit because it's supposed to work only with 32-bit values, according to the specs. 
--- src/assembly_generator_x86.cpp | 2 +- src/bytecode_machine.cpp | 2 +- src/jit_compiler_a64.cpp | 19 ++++--------------- src/jit_compiler_rv64.cpp | 2 +- src/jit_compiler_x86.cpp | 2 +- src/reciprocal.c | 34 +++++++++++++--------------------- src/reciprocal.h | 4 ++-- src/tests/perf-simulation.cpp | 2 +- 8 files changed, 24 insertions(+), 43 deletions(-) diff --git a/src/assembly_generator_x86.cpp b/src/assembly_generator_x86.cpp index e7e5258b..1ce31dd5 100644 --- a/src/assembly_generator_x86.cpp +++ b/src/assembly_generator_x86.cpp @@ -445,7 +445,7 @@ namespace randomx { } void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { registerUsage[instr.dst] = i; asmCode << "\tmov rax, " << randomx_reciprocal(divisor) << std::endl; diff --git a/src/bytecode_machine.cpp b/src/bytecode_machine.cpp index 7d8e902d..1d00d095 100644 --- a/src/bytecode_machine.cpp +++ b/src/bytecode_machine.cpp @@ -243,7 +243,7 @@ namespace randomx { } if (opcode < ceil_IMUL_RCP) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { auto dst = instr.dst % RegistersCount; ibc.type = InstructionType::IMUL_R; diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index 75ea8ccd..5be8f6e4 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -686,7 +686,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos) void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) { - const uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (isZeroOrPowerOf2(divisor)) return; @@ -695,22 +695,11 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) constexpr uint32_t tmp_reg = 20; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint64_t N = 1ULL << 63; - const uint64_t q = N / divisor; - const uint64_t r = N % divisor; -#ifdef __GNUC__ - const uint64_t shift = 64 - __builtin_clzll(divisor); -#else - uint64_t shift = 32; - for (uint64_t k = 1U << 31; (k & divisor) == 0; k >>= 1) - --shift; -#endif - const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t); - literalPos -= sizeof(uint64_t); - const uint64_t randomx_reciprocal = (q << shift) + ((r << shift) / divisor); - memcpy(code + literalPos, &randomx_reciprocal, sizeof(randomx_reciprocal)); + + const uint64_t reciprocal = randomx_reciprocal_fast(divisor); + memcpy(code + literalPos, &reciprocal, sizeof(reciprocal)); if (literal_id < 12) { diff --git a/src/jit_compiler_rv64.cpp b/src/jit_compiler_rv64.cpp index 301c294c..6f0842e5 100644 --- a/src/jit_compiler_rv64.cpp +++ b/src/jit_compiler_rv64.cpp @@ -776,7 +776,7 @@ namespace randomx { } static void v1_IMUL_RCP(HANDLER_ARGS) { - uint64_t divisor = isn.getImm32(); + const uint32_t divisor = isn.getImm32(); if (!isZeroOrPowerOf2(divisor)) { state.registerUsage[isn.dst] = i; if (state.rcpCount < 4) { diff --git a/src/jit_compiler_x86.cpp b/src/jit_compiler_x86.cpp index 96c6492f..785ce5f5 100644 --- a/src/jit_compiler_x86.cpp +++ b/src/jit_compiler_x86.cpp @@ -618,7 +618,7 @@ namespace randomx { } void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { registerUsage[instr.dst] = i; emit(MOV_RAX_I); diff --git a/src/reciprocal.c b/src/reciprocal.c index 22620f53..074d1846 100644 
--- a/src/reciprocal.c +++ b/src/reciprocal.c @@ -44,36 +44,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ret */ -uint64_t randomx_reciprocal(uint64_t divisor) { +uint64_t randomx_reciprocal(uint32_t divisor) { assert(divisor != 0); const uint64_t p2exp63 = 1ULL << 63; + const uint64_t q = p2exp63 / divisor; + const uint64_t r = p2exp63 % divisor; + +#ifdef __GNUC__ + const uint32_t shift = 64 - __builtin_clzll(divisor); +#else + uint32_t shift = 32; + for (uint32_t k = 1U << 31; (k & divisor) == 0; k >>= 1) + --shift; +#endif - uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor; - - unsigned bsr = 0; //highest set bit in divisor - - for (uint64_t bit = divisor; bit > 0; bit >>= 1) - bsr++; - - for (unsigned shift = 0; shift < bsr; shift++) { - if (remainder >= divisor - remainder) { - quotient = quotient * 2 + 1; - remainder = remainder * 2 - divisor; - } - else { - quotient = quotient * 2; - remainder = remainder * 2; - } - } - - return quotient; + return (q << shift) + ((r << shift) / divisor); } #if !RANDOMX_HAVE_FAST_RECIPROCAL -uint64_t randomx_reciprocal_fast(uint64_t divisor) { +uint64_t randomx_reciprocal_fast(uint32_t divisor) { return randomx_reciprocal(divisor); } diff --git a/src/reciprocal.h b/src/reciprocal.h index 8858df2b..90bd9b6b 100644 --- a/src/reciprocal.h +++ b/src/reciprocal.h @@ -40,8 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. extern "C" { #endif -uint64_t randomx_reciprocal(uint64_t); -uint64_t randomx_reciprocal_fast(uint64_t); +uint64_t randomx_reciprocal(uint32_t); +uint64_t randomx_reciprocal_fast(uint32_t); #if defined(__cplusplus) } diff --git a/src/tests/perf-simulation.cpp b/src/tests/perf-simulation.cpp index 1068a40e..27f34d8c 100644 --- a/src/tests/perf-simulation.cpp +++ b/src/tests/perf-simulation.cpp @@ -477,7 +477,7 @@ int analyze(randomx::Program& p) { } if (opcode < randomx::ceil_IMUL_RCP) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!randomx::isZeroOrPowerOf2(divisor)) { instr.dst = instr.dst % randomx::RegistersCount; instr.opcode |= DST_INT; From 06a7cc1c3346609ebce92f91811c2974df08f474 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 20 Oct 2023 16:16:51 +0200 Subject: [PATCH 11/13] Update README and benchmark version --- README.md | 5 +++-- src/tests/benchmark.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4c1dabb6..2c9bdd31 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ RandomX is written in C++11 and builds a static library with a C API provided by ### Linux -Build dependencies: `cmake` (minimum 2.8.7) and `gcc` (minimum version 4.8, but version 7+ is recommended). +Build dependencies: `cmake` (minimum 3.5) and `gcc` (minimum version 4.8, but version 7+ is recommended). To build optimized binaries for your machine, run: ``` @@ -82,7 +82,7 @@ Intel Core i7-8550U|16G DDR4-2400|Windows 10|hw|200 (4T)|1700 (4T)|350 (8T)| Intel Core i3-3220|4G DDR3-1333|Ubuntu 16.04|soft|42 (4T)|510 (4T)|150 (4T)| Raspberry Pi 3|1G LPDDR2|Ubuntu 16.04|soft|3.5 (4T)|-|20 (4T)| -Note that RandomX currently includes a JIT compiler for x86-64 and ARM64. Other architectures have to use the portable interpreter, which is much slower. +Note that RandomX currently includes a JIT compiler for x86-64, ARM64 and RISCV64. Other architectures have to use the portable interpreter, which is much slower. 
### GPU performance @@ -129,6 +129,7 @@ The reference implementation has been validated on the following platforms: * ARMv7+VFPv3 (32-bit, little-endian) * ARMv8 (64-bit, little-endian) * PPC64 (64-bit, big-endian) +* RISCV64 (64-bit, little-endian) ### Can FPGAs mine RandomX? diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index d25d0c2c..627b0d42 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -180,7 +180,7 @@ int main(int argc, char** argv) { store32(&seed, seedValue); - std::cout << "RandomX benchmark v1.1.12" << std::endl; + std::cout << "RandomX benchmark v1.2.0" << std::endl; if (help) { printUsage(argv[0]); From e895d451a3dff110dc10e378d31e3ea507a9006c Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 20 Oct 2023 19:29:35 +0200 Subject: [PATCH 12/13] Avoid `zext.b` --- src/jit_compiler_rv64_static.S | 64 +++++++++++++++++----------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/src/jit_compiler_rv64_static.S b/src/jit_compiler_rv64_static.S index 5ecb4815..240bbf5f 100644 --- a/src/jit_compiler_rv64_static.S +++ b/src/jit_compiler_rv64_static.S @@ -745,7 +745,7 @@ softaes_enc: #endif /* byte 0 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 addi x12, x13, -2048 #ifdef __riscv_zba @@ -757,7 +757,7 @@ softaes_enc: lwu x14, -2048(x14) /* byte 1 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -769,7 +769,7 @@ softaes_enc: xor x8, x8, x14 /* byte 2 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -781,7 +781,7 @@ softaes_enc: xor x11, x11, x15 /* byte 3 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x13 @@ -793,7 +793,7 @@ softaes_enc: xor x10, x10, x14 /* byte 4 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -805,7 +805,7 @@ softaes_enc: xor x9, x9, x15 /* byte 5 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -817,7 +817,7 @@ softaes_enc: xor x9, x9, x14 /* byte 6 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -829,7 +829,7 @@ softaes_enc: xor x8, x8, x15 /* byte 7 */ - zext.b x15, x30 + andi x15, x30, 255 #ifdef __riscv_zba sh2add x15, x15, x13 #else @@ -840,7 +840,7 @@ softaes_enc: xor x11, x11, x14 /* byte 8 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -852,7 +852,7 @@ softaes_enc: xor x10, x10, x15 /* byte 9 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -864,7 +864,7 @@ softaes_enc: xor x10, x10, x14 /* byte 10 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -876,7 +876,7 @@ softaes_enc: xor x9, x9, x15 /* byte 11 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x13 @@ -888,7 +888,7 @@ softaes_enc: xor x8, x8, x14 /* byte 12 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -900,7 +900,7 @@ softaes_enc: xor x11, x11, x15 /* byte 13 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -912,7 +912,7 @@ softaes_enc: xor x11, x11, x14 /* byte 14 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -924,7 +924,7 @@ softaes_enc: xor x10, x10, x15 
/* byte 15 */ - zext.b x15, x31 + andi x15, x31, 255 #ifdef __riscv_zba sh2add x15, x15, x13 #else @@ -960,7 +960,7 @@ softaes_dec: #endif /* byte 0 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 addi x12, x13, -2048 #ifdef __riscv_zba @@ -972,7 +972,7 @@ softaes_dec: lwu x14, -2048(x14) /* byte 1 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -984,7 +984,7 @@ softaes_dec: xor x8, x8, x14 /* byte 2 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -996,7 +996,7 @@ softaes_dec: xor x9, x9, x15 /* byte 3 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x13 @@ -1008,7 +1008,7 @@ softaes_dec: xor x10, x10, x14 /* byte 4 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -1020,7 +1020,7 @@ softaes_dec: xor x11, x11, x15 /* byte 5 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -1032,7 +1032,7 @@ softaes_dec: xor x9, x9, x14 /* byte 6 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -1044,7 +1044,7 @@ softaes_dec: xor x10, x10, x15 /* byte 7 */ - zext.b x15, x30 + andi x15, x30, 255 #ifdef __riscv_zba sh2add x15, x15, x13 #else @@ -1055,7 +1055,7 @@ softaes_dec: xor x11, x11, x14 /* byte 8 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -1067,7 +1067,7 @@ softaes_dec: xor x8, x8, x15 /* byte 9 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -1079,7 +1079,7 @@ softaes_dec: xor x10, x10, x14 /* byte 10 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -1091,7 +1091,7 @@ softaes_dec: xor x11, x11, x15 /* byte 11 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x13 @@ -1103,7 +1103,7 @@ softaes_dec: xor x8, x8, x14 /* byte 12 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -1115,7 +1115,7 @@ softaes_dec: xor x9, x9, x15 /* byte 13 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -1127,7 +1127,7 @@ softaes_dec: xor x11, x11, x14 /* byte 14 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -1139,7 +1139,7 @@ softaes_dec: xor x8, x8, x15 /* byte 15 */ - zext.b x15, x31 + andi x15, x31, 255 #ifdef __riscv_zba sh2add x15, x15, x13 #else From 102f8acf90a7649ada410de5499a7ec62e49e1da Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 20 Oct 2023 20:42:55 +0200 Subject: [PATCH 13/13] bump benchmark version to 1.2.1 --- src/tests/benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index 627b0d42..148521a5 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -180,7 +180,7 @@ int main(int argc, char** argv) { store32(&seed, seedValue); - std::cout << "RandomX benchmark v1.2.0" << std::endl; + std::cout << "RandomX benchmark v1.2.1" << std::endl; if (help) { printUsage(argv[0]);
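
Note on [PATCH 12/13] "Avoid `zext.b`" above: `zext.b rd, rs` is only an assembler alias that expands to the base-ISA instruction `andi rd, rs, 255`, so the substitution does not change the emitted machine code; presumably it was made so that jit_compiler_rv64_static.S also assembles with older binutils/clang releases that do not recognize the alias. A two-line illustration (RISC-V assembly, not part of the patch):

    zext.b x14, x30        /* pseudoinstruction; requires a recent assembler */
    andi   x14, x30, 255   /* same encoding, accepted by any RV64I assembler */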