From db273f267142b062442c034ea367fd81901ba926 Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 22 Apr 2023 19:14:46 +0200 Subject: [PATCH 01/13] introduce RandomX v2 with double-hashing --- src/randomx.cpp | 11 +++++++++++ src/randomx.h | 11 +++++++++++ src/tests/benchmark.cpp | 32 +++++++++++++++++++++++++------- src/tests/tests.cpp | 24 ++++++++++++++++++++++++ vcxproj/randomx.vcxproj | 2 +- vcxproj/randomx.vcxproj.filters | 2 +- 6 files changed, 73 insertions(+), 9 deletions(-) diff --git a/src/randomx.cpp b/src/randomx.cpp index 7daaa46d..537dde2e 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -400,4 +400,15 @@ extern "C" { machine->run(machine->tempHash); machine->getFinalResult(output, RANDOMX_HASH_SIZE); } + + void randomx_calculate_hash_v2(const void* input, size_t inputSize, const void* v1_in, void* v2_out) { + assert(inputSize == 0 || input != nullptr); + assert(v1_in != nullptr); + assert(v2_out != nullptr); + blake2b_state state; + blake2b_init(&state, RANDOMX_HASH_SIZE); + blake2b_update(&state, input, inputSize); + blake2b_update(&state, v1_in, RANDOMX_HASH_SIZE); + blake2b_final(&state, v2_out, RANDOMX_HASH_SIZE); + } } diff --git a/src/randomx.h b/src/randomx.h index 64d18068..d7e2d998 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -260,6 +260,17 @@ RANDOMX_EXPORT void randomx_calculate_hash_first(randomx_vm* machine, const void RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, const void* nextInput, size_t nextInputSize, void* output); RANDOMX_EXPORT void randomx_calculate_hash_last(randomx_vm* machine, void* output); +/** + * Calculate V2 hash from the V1 hash and its input. + * + * @param input is a pointer to memory that was hashed by V1. Must not be NULL. + * @param inputSize is the number of bytes in the input. + * @param v1_in is the V1 hash (RANDOMX_HASH_SIZE bytes). + * @param output is a pointer to memory where the V2 hash will be stored. Must not + * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. +*/ +RANDOMX_EXPORT void randomx_calculate_hash_v2(const void* input, size_t inputSize, const void* v1_in, void* v2_out); + #if defined(__cplusplus) } #endif diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index 36b0259b..df371682 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -96,6 +96,7 @@ void printUsage(const char* executable) { std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl; std::cout << " --auto select the best options for the current CPU" << std::endl; std::cout << " --noBatch calculate hashes one by one (default: batch)" << std::endl; + std::cout << " --v2 calculate v2 hashes (default: v1)" << std::endl; } struct MemoryException : public std::exception { @@ -113,7 +114,7 @@ struct DatasetAllocException : public MemoryException { using MineFunc = void(randomx_vm * vm, std::atomic & atomicNonce, AtomicHash & result, uint32_t noncesCount, int thread, int cpuid); -template +template void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, int cpuid = -1) { if (cpuid >= 0) { int rc = set_thread_affinity(cpuid); @@ -138,6 +139,9 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } store32(noncePtr, nonce); (batch ? 
randomx_calculate_hash_next : randomx_calculate_hash)(vm, blockTemplate, sizeof(blockTemplate), &hash); + if (v2) { + randomx_calculate_hash_v2(blockTemplate, sizeof(blockTemplate), &hash, &hash); + } result.xorWith(hash); if (!batch) { nonce = atomicNonce.fetch_add(1); @@ -146,7 +150,7 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } int main(int argc, char** argv) { - bool softAes, miningMode, verificationMode, help, largePages, jit, secure; + bool softAes, miningMode, verificationMode, help, largePages, jit, secure, v2; bool ssse3, avx2, autoFlags, noBatch; int noncesCount, threadCount, initThreadCount; uint64_t threadAffinity; @@ -172,10 +176,11 @@ int main(int argc, char** argv) { readOption("--avx2", argc, argv, avx2); readOption("--auto", argc, argv, autoFlags); readOption("--noBatch", argc, argv, noBatch); + readOption("--v2", argc, argv, v2); store32(&seed, seedValue); - std::cout << "RandomX benchmark v1.1.11" << std::endl; + std::cout << "RandomX benchmark v1.1.12" << std::endl; if (help) { printUsage(argv[0]); @@ -280,11 +285,24 @@ int main(int argc, char** argv) { MineFunc* func; if (noBatch) { - func = &mine; + if (v2) { + std::cout << " - v2 hashes" << std::endl; + func = &mine; + } + else { + func = &mine; + } } else { - func = &mine; - std::cout << " - batch mode" << std::endl; + if (v2) { + //TODO: support batch mode with v2 + std::cout << " - v2 hashes" << std::endl; + func = &mine; + } + else { + std::cout << " - batch mode" << std::endl; + func = &mine; + } } std::cout << "Initializing"; @@ -376,7 +394,7 @@ int main(int argc, char** argv) { randomx_release_cache(cache); std::cout << "Calculated result: "; result.print(std::cout); - if (noncesCount == 1000 && seedValue == 0) + if (noncesCount == 1000 && seedValue == 0 && !v2) std::cout << "Reference result: 10b649a3f15c7c7f88277812f2e74b337a0f20ce909af09199cccb960771cfa1" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / noncesCount << " ms per hash" << std::endl; diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp index 412585b1..8df2d901 100644 --- a/src/tests/tests.cpp +++ b/src/tests/tests.cpp @@ -34,6 +34,14 @@ void calcStringHash(const char(&key)[K], const char(&input)[H], void* output) { randomx_calculate_hash(vm, input, H - 1, output); } +template +void calcStringHashV2(const char(&key)[K], const char(&input)[H], void* output) { + initCache(key); + assert(vm != nullptr); + randomx_calculate_hash(vm, input, H - 1, output); + randomx_calculate_hash_v2(input, H - 1, output, output); +} + template void calcHexHash(const char(&key)[K], const char(&hex)[H], void* output) { initCache(key); @@ -1082,6 +1090,22 @@ int main() { assert(rx_get_rounding_mode() == RoundToNearest); }); + if (RANDOMX_HAVE_COMPILER) { + randomx_destroy_vm(vm); + vm = nullptr; +#ifdef RANDOMX_FORCE_SECURE + vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT | RANDOMX_FLAG_SECURE, cache, nullptr); +#else + vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr); +#endif + } + + runTest("RandomX v2 hash test", stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { + char hash[RANDOMX_HASH_SIZE]; + calcStringHashV2("test key 000", "This is a test", &hash); + assert(equalsHex(hash, "d53ccf348b75291b7be76f0a7ac8208bbced734b912f6fca60539ab6f86be919")); + }); + randomx_destroy_vm(vm); vm = nullptr; diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj index e0625c88..fcc66c99 100644 --- a/vcxproj/randomx.vcxproj +++ b/vcxproj/randomx.vcxproj @@ -156,7 +156,7 @@ SET ERRORLEVEL = 0 - + 
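As a quick illustration of the double-hashing flow introduced by this patch (the function is renamed to randomx_calculate_commitment later in this series), here is a minimal caller-side sketch: it computes the regular RandomX hash first, then derives the second hash as Blake2b-256 over (input || hash). The key and input strings below are placeholders, not values used anywhere in the patch, and error handling is omitted for brevity.

#include "randomx.h"

int main() {
	const char key[] = "RandomX example key";      //illustrative key
	const char input[] = "RandomX example input";  //illustrative input

	randomx_cache* cache = randomx_alloc_cache(RANDOMX_FLAG_DEFAULT);
	randomx_init_cache(cache, key, sizeof(key) - 1);
	randomx_vm* vm = randomx_create_vm(RANDOMX_FLAG_DEFAULT, cache, nullptr);

	char hash[RANDOMX_HASH_SIZE];
	char commitment[RANDOMX_HASH_SIZE];
	//first pass: the unchanged v1 hash
	randomx_calculate_hash(vm, input, sizeof(input) - 1, hash);
	//second pass: Blake2b-256 over (input || v1 hash)
	randomx_calculate_hash_v2(input, sizeof(input) - 1, hash, commitment);

	randomx_destroy_vm(vm);
	randomx_release_cache(cache);
	return 0;
}
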
diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters index eb4462a5..eef048a3 100644 --- a/vcxproj/randomx.vcxproj.filters +++ b/vcxproj/randomx.vcxproj.filters @@ -72,7 +72,7 @@ Source Files - + Source Files From 3f69ad7b79c94e2dcef4515c598e6007ae15ba0b Mon Sep 17 00:00:00 2001 From: SChernykh Date: Wed, 23 Aug 2023 09:47:27 +0200 Subject: [PATCH 02/13] Added CI tests - Compile RandomX on a wide variety of OS and architectures - Fixed broken x86 (32-bit) builds - Don't use broken `fesetenv` in msys2 builds: https://sourceforge.net/p/mingw-w64/bugs/541/ --- .github/workflows/c-cpp.yml | 223 ++++++++++++++++++++++++++++++++++++ CMakeLists.txt | 2 +- src/randomx.cpp | 17 +++ 3 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/c-cpp.yml diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml new file mode 100644 index 00000000..22151c33 --- /dev/null +++ b/.github/workflows/c-cpp.yml @@ -0,0 +1,223 @@ +name: C/C++ CI + +on: [push, pull_request] + +jobs: + build-alpine: + + timeout-minutes: 15 + runs-on: ubuntu-22.04 + + strategy: + matrix: + config: + - {arch: x86_64, branch: latest-stable} + - {arch: x86, branch: latest-stable} + - {arch: aarch64, branch: latest-stable} + - {arch: armhf, branch: latest-stable} + - {arch: armv7, branch: latest-stable} + - {arch: ppc64le, branch: latest-stable} + - {arch: riscv64, branch: edge} + - {arch: s390x, branch: latest-stable} + + steps: + - name: Setup Alpine Linux + uses: jirutka/setup-alpine@v1 + with: + arch: ${{ matrix.config.arch }} + branch: ${{ matrix.config.branch }} + + - name: Install dependencies + shell: alpine.sh --root {0} + run: | + apk add git cmake gcc g++ make + + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Build RandomX + shell: alpine.sh {0} + run: | + mkdir build + cd build + cmake .. + make -j$(nproc) + + - name: Run tests + shell: alpine.sh {0} + run: | + build/randomx-tests + + build-ubuntu: + + timeout-minutes: 5 + runs-on: ${{ matrix.config.os }} + + strategy: + matrix: + config: + - {os: ubuntu-20.04, c: gcc-11, cpp: g++-11} + - {os: ubuntu-22.04, c: gcc-12, cpp: g++-12} + + steps: + - name: Install dependencies + run: | + sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test + sudo apt update + sudo apt install -y git build-essential cmake ${{ matrix.config.c }} ${{ matrix.config.cpp }} + + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: true + + - name: Build RandomX + run: | + mkdir build + cd build + cmake .. + make -j$(nproc) + + - name: Run tests + run: | + build/randomx-tests + + build-windows-msys2: + + timeout-minutes: 15 + runs-on: windows-latest + + strategy: + matrix: + config: + - {c: "gcc", cxx: "g++"} + - {c: "clang", cxx: "clang++"} + + defaults: + run: + shell: msys2 {0} + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Setup MSYS2 + uses: eine/setup-msys2@v2 + with: + update: true + install: mingw-w64-x86_64-toolchain mingw-w64-x86_64-clang mingw-w64-x86_64-lld mingw-w64-x86_64-cmake make + + - name: Build RandomX + run: | + mkdir build + cd build + cmake .. 
-G "Unix Makefiles" -DCMAKE_C_COMPILER=${{ matrix.config.c }} -DCMAKE_CXX_COMPILER=${{ matrix.config.cxx }} + make -j$(nproc) + + - name: Run tests + run: | + build/randomx-tests.exe + + build-windows-msbuild: + + timeout-minutes: 5 + runs-on: windows-${{ matrix.config.os }} + + strategy: + matrix: + config: + - {arch: x64, os: 2019, vs: Visual Studio 16 2019, msbuild: "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\MSBuild\\Current\\Bin\\amd64\\"} + - {arch: x64, os: 2022, vs: Visual Studio 17 2022, msbuild: "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\Msbuild\\Current\\Bin\\amd64\\"} + - {arch: Win32, os: 2019, vs: Visual Studio 16 2019, msbuild: "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Enterprise\\MSBuild\\Current\\Bin\\"} + - {arch: Win32, os: 2022, vs: Visual Studio 17 2022, msbuild: "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\Msbuild\\Current\\Bin\\"} + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Setup cmake + uses: lukka/get-cmake@latest + + - name: Build RandomX + run: | + mkdir build + cd build + cmake .. -G "${{ matrix.config.vs }}" -A ${{ matrix.config.arch }} + & "${{ matrix.config.msbuild }}msbuild" -v:m /m /p:Configuration=Release randomx-tests.vcxproj + + - name: Run tests + run: | + build/Release/randomx-tests.exe + + build-macos: + + timeout-minutes: 5 + runs-on: ${{ matrix.os }} + + strategy: + matrix: + os: [macos-11, macos-12, macos-13] + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Install dependencies + run: HOMEBREW_NO_AUTO_UPDATE=1 brew install cmake + + - name: Build RandomX + run: | + mkdir build + cd build + cmake .. + make -j3 + + - name: Run tests + run: | + build/randomx-tests + + build-freebsd: + + timeout-minutes: 15 + runs-on: ${{ matrix.os.host }} + + strategy: + matrix: + os: + - name: freebsd + architecture: x86-64 + version: '13.2' + host: ubuntu-22.04 + + - name: freebsd + architecture: arm64 + version: '13.2' + host: ubuntu-22.04 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Build RandomX + uses: cross-platform-actions/action@v0.19.0 + with: + operating_system: ${{ matrix.os.name }} + architecture: ${{ matrix.os.architecture }} + version: ${{ matrix.os.version }} + shell: bash + run: | + sudo pkg install -y cmake + mkdir build && cd build + cmake .. + make -j2 + ./randomx-tests diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b6ba9e6..5ffbe011 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,7 +96,7 @@ function(add_flag flag) endfunction() # x86-64 -if(ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "amd64") +if ((CMAKE_SIZEOF_VOID_P EQUAL 8) AND (ARCH_ID STREQUAL "x86_64" OR ARCH_ID STREQUAL "x86-64" OR ARCH_ID STREQUAL "amd64")) list(APPEND randomx_sources src/jit_compiler_x86.cpp) diff --git a/src/randomx.cpp b/src/randomx.cpp index 7daaa46d..5d77ef36 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -36,7 +36,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cpu.hpp" #include #include + +#if defined(__SSE__) || defined(__SSE2__) || (defined(_M_IX86_FP) && (_M_IX86_FP > 0)) +#define USE_CSR_INTRINSICS +#include +#else #include +#endif extern "C" { @@ -356,8 +362,14 @@ extern "C" { assert(machine != nullptr); assert(inputSize == 0 || input != nullptr); assert(output != nullptr); + +#ifdef USE_CSR_INTRINSICS + const unsigned int fpstate = _mm_getcsr(); +#else fenv_t fpstate; fegetenv(&fpstate); +#endif + alignas(16) uint64_t tempHash[8]; int blakeResult = blake2b(tempHash, sizeof(tempHash), input, inputSize, nullptr, 0); assert(blakeResult == 0); @@ -370,7 +382,12 @@ extern "C" { } machine->run(&tempHash); machine->getFinalResult(output, RANDOMX_HASH_SIZE); + +#ifdef USE_CSR_INTRINSICS + _mm_setcsr(fpstate); +#else fesetenv(&fpstate); +#endif } void randomx_calculate_hash_first(randomx_vm* machine, const void* input, size_t inputSize) { From e372827fae8058d7e881e0bb8c81d066604a5db9 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 8 Sep 2023 22:36:45 +0200 Subject: [PATCH 03/13] fix vcxproj files --- vcxproj/randomx-dll.vcxproj | 4 ++-- vcxproj/randomx-dll.vcxproj.filters | 4 ++-- vcxproj/randomx.vcxproj | 4 ++-- vcxproj/randomx.vcxproj.filters | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vcxproj/randomx-dll.vcxproj b/vcxproj/randomx-dll.vcxproj index 8b8ea8c0..4eaae9be 100644 --- a/vcxproj/randomx-dll.vcxproj +++ b/vcxproj/randomx-dll.vcxproj @@ -43,7 +43,7 @@ - + @@ -74,7 +74,7 @@ - + diff --git a/vcxproj/randomx-dll.vcxproj.filters b/vcxproj/randomx-dll.vcxproj.filters index 68e1b855..5b51f9f7 100644 --- a/vcxproj/randomx-dll.vcxproj.filters +++ b/vcxproj/randomx-dll.vcxproj.filters @@ -87,7 +87,7 @@ Header Files - + Header Files @@ -151,7 +151,7 @@ Source Files - + Source Files diff --git a/vcxproj/randomx.vcxproj b/vcxproj/randomx.vcxproj index e0625c88..cefdc8fb 100644 --- a/vcxproj/randomx.vcxproj +++ b/vcxproj/randomx.vcxproj @@ -156,7 +156,7 @@ SET ERRORLEVEL = 0 - + @@ -198,7 +198,7 @@ SET ERRORLEVEL = 0 - + diff --git a/vcxproj/randomx.vcxproj.filters b/vcxproj/randomx.vcxproj.filters index eb4462a5..7f055b5b 100644 --- a/vcxproj/randomx.vcxproj.filters +++ b/vcxproj/randomx.vcxproj.filters @@ -72,7 +72,7 @@ Source Files - + Source Files @@ -164,7 +164,7 @@ Header Files - + Header Files From 07a413b9f00b64d18cef310582427bf738abd94d Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 8 Sep 2023 22:57:09 +0200 Subject: [PATCH 04/13] rename 'hash v2' to 'commitment' --- src/randomx.cpp | 10 +++++----- src/randomx.h | 10 +++++----- src/tests/benchmark.cpp | 24 ++++++++++++------------ src/tests/tests.cpp | 8 ++++---- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/src/randomx.cpp b/src/randomx.cpp index 537dde2e..c963e211 100644 --- a/src/randomx.cpp +++ b/src/randomx.cpp @@ -401,14 +401,14 @@ extern "C" { machine->getFinalResult(output, RANDOMX_HASH_SIZE); } - void randomx_calculate_hash_v2(const void* input, size_t inputSize, const void* v1_in, void* v2_out) { + void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out) { assert(inputSize == 0 || input != nullptr); - assert(v1_in != nullptr); - assert(v2_out != nullptr); + assert(hash_in != nullptr); + assert(com_out != nullptr); blake2b_state state; blake2b_init(&state, RANDOMX_HASH_SIZE); blake2b_update(&state, input, inputSize); - blake2b_update(&state, v1_in, RANDOMX_HASH_SIZE); - blake2b_final(&state, v2_out, RANDOMX_HASH_SIZE); + blake2b_update(&state, hash_in, RANDOMX_HASH_SIZE); + 
blake2b_final(&state, com_out, RANDOMX_HASH_SIZE); } } diff --git a/src/randomx.h b/src/randomx.h index d7e2d998..313bcd2e 100644 --- a/src/randomx.h +++ b/src/randomx.h @@ -261,15 +261,15 @@ RANDOMX_EXPORT void randomx_calculate_hash_next(randomx_vm* machine, const void* RANDOMX_EXPORT void randomx_calculate_hash_last(randomx_vm* machine, void* output); /** - * Calculate V2 hash from the V1 hash and its input. + * Calculate a RandomX commitment from a RandomX hash and its input. * - * @param input is a pointer to memory that was hashed by V1. Must not be NULL. + * @param input is a pointer to memory that was hashed. Must not be NULL. * @param inputSize is the number of bytes in the input. - * @param v1_in is the V1 hash (RANDOMX_HASH_SIZE bytes). - * @param output is a pointer to memory where the V2 hash will be stored. Must not + * @param hash_in is the output from randomx_calculate_hash* (RANDOMX_HASH_SIZE bytes). + * @param com_out is a pointer to memory where the commitment will be stored. Must not * be NULL and at least RANDOMX_HASH_SIZE bytes must be available for writing. */ -RANDOMX_EXPORT void randomx_calculate_hash_v2(const void* input, size_t inputSize, const void* v1_in, void* v2_out); +RANDOMX_EXPORT void randomx_calculate_commitment(const void* input, size_t inputSize, const void* hash_in, void* com_out); #if defined(__cplusplus) } diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index df371682..d25d0c2c 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -96,7 +96,7 @@ void printUsage(const char* executable) { std::cout << " --avx2 use optimized Argon2 for AVX2 CPUs" << std::endl; std::cout << " --auto select the best options for the current CPU" << std::endl; std::cout << " --noBatch calculate hashes one by one (default: batch)" << std::endl; - std::cout << " --v2 calculate v2 hashes (default: v1)" << std::endl; + std::cout << " --commit calculate commitments instead of hashes (default: hashes)" << std::endl; } struct MemoryException : public std::exception { @@ -114,7 +114,7 @@ struct DatasetAllocException : public MemoryException { using MineFunc = void(randomx_vm * vm, std::atomic & atomicNonce, AtomicHash & result, uint32_t noncesCount, int thread, int cpuid); -template +template void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result, uint32_t noncesCount, int thread, int cpuid = -1) { if (cpuid >= 0) { int rc = set_thread_affinity(cpuid); @@ -139,8 +139,8 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } store32(noncePtr, nonce); (batch ? 
randomx_calculate_hash_next : randomx_calculate_hash)(vm, blockTemplate, sizeof(blockTemplate), &hash); - if (v2) { - randomx_calculate_hash_v2(blockTemplate, sizeof(blockTemplate), &hash, &hash); + if (commit) { + randomx_calculate_commitment(blockTemplate, sizeof(blockTemplate), &hash, &hash); } result.xorWith(hash); if (!batch) { @@ -150,7 +150,7 @@ void mine(randomx_vm* vm, std::atomic& atomicNonce, AtomicHash& result } int main(int argc, char** argv) { - bool softAes, miningMode, verificationMode, help, largePages, jit, secure, v2; + bool softAes, miningMode, verificationMode, help, largePages, jit, secure, commit; bool ssse3, avx2, autoFlags, noBatch; int noncesCount, threadCount, initThreadCount; uint64_t threadAffinity; @@ -176,7 +176,7 @@ int main(int argc, char** argv) { readOption("--avx2", argc, argv, avx2); readOption("--auto", argc, argv, autoFlags); readOption("--noBatch", argc, argv, noBatch); - readOption("--v2", argc, argv, v2); + readOption("--commit", argc, argv, commit); store32(&seed, seedValue); @@ -285,8 +285,8 @@ int main(int argc, char** argv) { MineFunc* func; if (noBatch) { - if (v2) { - std::cout << " - v2 hashes" << std::endl; + if (commit) { + std::cout << " - hash commitments" << std::endl; func = &mine; } else { @@ -294,9 +294,9 @@ int main(int argc, char** argv) { } } else { - if (v2) { - //TODO: support batch mode with v2 - std::cout << " - v2 hashes" << std::endl; + if (commit) { + //TODO: support batch mode with commitments + std::cout << " - hash commitments" << std::endl; func = &mine; } else { @@ -394,7 +394,7 @@ int main(int argc, char** argv) { randomx_release_cache(cache); std::cout << "Calculated result: "; result.print(std::cout); - if (noncesCount == 1000 && seedValue == 0 && !v2) + if (noncesCount == 1000 && seedValue == 0 && !commit) std::cout << "Reference result: 10b649a3f15c7c7f88277812f2e74b337a0f20ce909af09199cccb960771cfa1" << std::endl; if (!miningMode) { std::cout << "Performance: " << 1000 * elapsed / noncesCount << " ms per hash" << std::endl; diff --git a/src/tests/tests.cpp b/src/tests/tests.cpp index 8df2d901..5e1b41a3 100644 --- a/src/tests/tests.cpp +++ b/src/tests/tests.cpp @@ -35,11 +35,11 @@ void calcStringHash(const char(&key)[K], const char(&input)[H], void* output) { } template -void calcStringHashV2(const char(&key)[K], const char(&input)[H], void* output) { +void calcStringCommitment(const char(&key)[K], const char(&input)[H], void* output) { initCache(key); assert(vm != nullptr); randomx_calculate_hash(vm, input, H - 1, output); - randomx_calculate_hash_v2(input, H - 1, output, output); + randomx_calculate_commitment(input, H - 1, output, output); } template @@ -1100,9 +1100,9 @@ int main() { #endif } - runTest("RandomX v2 hash test", stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { + runTest("Commitment test", stringsEqual(RANDOMX_ARGON_SALT, "RandomX\x03"), []() { char hash[RANDOMX_HASH_SIZE]; - calcStringHashV2("test key 000", "This is a test", &hash); + calcStringCommitment("test key 000", "This is a test", &hash); assert(equalsHex(hash, "d53ccf348b75291b7be76f0a7ac8208bbced734b912f6fca60539ab6f86be919")); }); From e322218fb7f2f2d888ecfb9ab2e8293141e67db6 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 9 Oct 2023 18:38:25 +0200 Subject: [PATCH 05/13] Fixed casts from const to non-const pointers --- src/intrin_portable.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/intrin_portable.h b/src/intrin_portable.h index 8c09ae88..50020c3e 100644 --- a/src/intrin_portable.h 
+++ b/src/intrin_portable.h @@ -349,7 +349,7 @@ FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const *p) { #if defined(NATIVE_LITTLE_ENDIAN) return *p; #else - uint32_t* ptr = (uint32_t*)p; + const uint32_t* ptr = (const uint32_t*)p; vec_u c; c.u32[0] = load32(ptr + 0); c.u32[1] = load32(ptr + 1); @@ -375,8 +375,8 @@ FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b) { FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { vec_u x; - x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); - x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); + x.d64[0] = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 0)); + x.d64[1] = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 4)); return (rx_vec_f128)x.d; } @@ -684,7 +684,7 @@ FORCE_INLINE rx_vec_i128 rx_load_vec_i128(rx_vec_i128 const* p) { #if defined(NATIVE_LITTLE_ENDIAN) return *p; #else - uint32_t* ptr = (uint32_t*)p; + const uint32_t* ptr = (const uint32_t*)p; rx_vec_i128 c; c.u32[0] = load32(ptr + 0); c.u32[1] = load32(ptr + 1); @@ -708,8 +708,8 @@ FORCE_INLINE void rx_store_vec_i128(rx_vec_i128 *p, rx_vec_i128 b) { FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) { rx_vec_f128 x; - x.lo = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0)); - x.hi = (double)unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4)); + x.lo = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 0)); + x.hi = (double)unsigned32ToSigned2sCompl(load32((const uint8_t*)addr + 4)); return x; } From 48fa275d04a4fc5e9666d206b50c337a5cfcfe7a Mon Sep 17 00:00:00 2001 From: tevador Date: Mon, 9 Oct 2023 19:14:51 +0200 Subject: [PATCH 06/13] Avoid redundant CI runs --- .github/workflows/c-cpp.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 22151c33..47ade398 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -1,6 +1,9 @@ name: C/C++ CI -on: [push, pull_request] +on: + push: + branches: [ master ] + pull_request: jobs: build-alpine: From 027ecb85769d6ebeff0fcb5ddcbafb027209debd Mon Sep 17 00:00:00 2001 From: tevador Date: Sat, 7 Oct 2023 12:51:19 +0200 Subject: [PATCH 07/13] JIT compiler for RISC-V --- CMakeLists.txt | 36 + src/common.hpp | 7 + src/jit_compiler.hpp | 42 +- src/jit_compiler_rv64.cpp | 1175 ++++++++++++++++++++++++++++ src/jit_compiler_rv64.hpp | 69 ++ src/jit_compiler_rv64_static.S | 1235 ++++++++++++++++++++++++++++++ src/jit_compiler_rv64_static.hpp | 53 ++ src/tests/riscv64_zba.s | 9 + src/tests/riscv64_zbb.s | 9 + 9 files changed, 2633 insertions(+), 2 deletions(-) create mode 100644 src/jit_compiler_rv64.cpp create mode 100644 src/jit_compiler_rv64.hpp create mode 100644 src/jit_compiler_rv64_static.S create mode 100644 src/jit_compiler_rv64_static.hpp create mode 100644 src/tests/riscv64_zba.s create mode 100644 src/tests/riscv64_zbb.s diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ffbe011..ebbdff2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -173,6 +173,42 @@ if(ARM_ID STREQUAL "aarch64" OR ARM_ID STREQUAL "arm64" OR ARM_ID STREQUAL "armv endif() endif() +# RISC-V +if(ARCH_ID STREQUAL "riscv64") + list(APPEND randomx_sources + src/jit_compiler_rv64_static.S + src/jit_compiler_rv64.cpp) + # cheat because cmake and ccache hate each other + set_property(SOURCE src/jit_compiler_rv64_static.S PROPERTY LANGUAGE C) + set_property(SOURCE src/jit_compiler_rv64_static.S PROPERTY 
XCODE_EXPLICIT_FILE_TYPE sourcecode.asm) + + # default build uses the RV64GC baseline + set(RVARCH "rv64gc") + + # for native builds, enable Zba and Zbb if supported by the CPU + if(ARCH STREQUAL "native") + enable_language(ASM) + try_run(RANDOMX_ZBA_RUN_FAIL + RANDOMX_ZBA_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/tests/riscv64_zba.s + COMPILE_DEFINITIONS "-march=rv64gc_zba") + if (RANDOMX_ZBA_COMPILE_OK AND NOT RANDOMX_ZBA_RUN_FAIL) + set(RVARCH "${RVARCH}_zba") + endif() + try_run(RANDOMX_ZBB_RUN_FAIL + RANDOMX_ZBB_COMPILE_OK + ${CMAKE_CURRENT_BINARY_DIR}/ + ${CMAKE_CURRENT_SOURCE_DIR}/src/tests/riscv64_zbb.s + COMPILE_DEFINITIONS "-march=rv64gc_zbb") + if (RANDOMX_ZBB_COMPILE_OK AND NOT RANDOMX_ZBB_RUN_FAIL) + set(RVARCH "${RVARCH}_zbb") + endif() + endif() + + add_flag("-march=${RVARCH}") +endif() + set(RANDOMX_INCLUDE "${CMAKE_CURRENT_SOURCE_DIR}/src" CACHE STRING "RandomX Include path") add_library(randomx ${randomx_sources}) diff --git a/src/common.hpp b/src/common.hpp index a77feb3b..f4b85342 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -116,12 +116,19 @@ namespace randomx { #if defined(_M_X64) || defined(__x86_64__) #define RANDOMX_HAVE_COMPILER 1 + #define RANDOMX_COMPILER_X86 class JitCompilerX86; using JitCompiler = JitCompilerX86; #elif defined(__aarch64__) #define RANDOMX_HAVE_COMPILER 1 + #define RANDOMX_COMPILER_A64 class JitCompilerA64; using JitCompiler = JitCompilerA64; +#elif defined(__riscv) && __riscv_xlen == 64 + #define RANDOMX_HAVE_COMPILER 1 + #define RANDOMX_COMPILER_RV64 + class JitCompilerRV64; + using JitCompiler = JitCompilerRV64; #else #define RANDOMX_HAVE_COMPILER 0 class JitCompilerFallback; diff --git a/src/jit_compiler.hpp b/src/jit_compiler.hpp index 17fdad4e..5b76fa5f 100644 --- a/src/jit_compiler.hpp +++ b/src/jit_compiler.hpp @@ -28,10 +28,48 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma once -#if defined(_M_X64) || defined(__x86_64__) +#include "common.hpp" + +namespace randomx { + + struct CodeBuffer { + uint8_t* code; + int32_t codePos; + int32_t rcpCount; + + void emit(const uint8_t* src, int32_t len) { + memcpy(&code[codePos], src, len); + codePos += len; + } + + template + void emit(T src) { + memcpy(&code[codePos], &src, sizeof(src)); + codePos += sizeof(src); + } + + void emitAt(int32_t codePos, const uint8_t* src, int32_t len) { + memcpy(&code[codePos], src, len); + } + + template + void emitAt(int32_t codePos, T src) { + memcpy(&code[codePos], &src, sizeof(src)); + } + }; + + struct CompilerState : public CodeBuffer { + int32_t instructionOffsets[RANDOMX_PROGRAM_SIZE]; + int registerUsage[RegistersCount]; + }; +} + +#if defined(RANDOMX_COMPILER_X86) #include "jit_compiler_x86.hpp" -#elif defined(__aarch64__) +#elif defined(RANDOMX_COMPILER_A64) #include "jit_compiler_a64.hpp" +#elif defined(RANDOMX_COMPILER_RV64) +#include "jit_compiler_rv64.hpp" #else #include "jit_compiler_fallback.hpp" #endif diff --git a/src/jit_compiler_rv64.cpp b/src/jit_compiler_rv64.cpp new file mode 100644 index 00000000..301c294c --- /dev/null +++ b/src/jit_compiler_rv64.cpp @@ -0,0 +1,1175 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include "jit_compiler_rv64.hpp" +#include "jit_compiler_rv64_static.hpp" +#include "superscalar.hpp" +#include "program.hpp" +#include "reciprocal.h" +#include "virtual_memory.h" + + +namespace { +#define HANDLER_ARGS randomx::CompilerState& state, randomx::Instruction isn, int i + using InstructionHandler = void(HANDLER_ARGS); + extern InstructionHandler* opcodeMap1[256]; +} + +namespace rv64 { + constexpr uint16_t C_LUI = 0x6001; + constexpr uint32_t LUI = 0x00000037; + constexpr uint16_t C_ADDI = 0x0001; + constexpr uint32_t ADDI = 0x00000013; + constexpr uint32_t ADDIW = 0x0000001b; + constexpr uint16_t C_ADD = 0x9002; + constexpr uint32_t ADD = 0x00000033; + constexpr uint32_t SHXADD = 0x20000033; //Zba + constexpr uint32_t SLL = 0x00001033; + constexpr uint32_t SRL = 0x00005033; + constexpr uint32_t SLLI = 0x00001013; + constexpr uint32_t C_SLLI = 0x0002; + constexpr uint32_t SRLI = 0x00005013; + constexpr uint32_t AND = 0x00007033; + constexpr uint32_t ANDI = 0x00007013; + constexpr uint16_t C_AND = 0x8c61; + constexpr uint16_t C_ANDI = 0x8801; + constexpr uint32_t OR = 0x00006033; + constexpr uint16_t C_OR = 0x8c41; + constexpr uint32_t XOR = 0x00004033; + constexpr uint16_t C_XOR = 0x8c21; + constexpr uint32_t LD = 0x00003003; + constexpr uint16_t C_LD = 0x6000; + constexpr uint16_t C_LW = 0x4000; + constexpr uint32_t SD = 0x00003023; + constexpr uint32_t SUB = 0x40000033; + constexpr uint16_t C_SUB = 0x8c01; + constexpr uint32_t MUL = 0x02000033; + constexpr uint32_t MULHU = 0x02003033; + constexpr uint32_t MULH = 0x02001033; + constexpr uint16_t C_MV = 0x8002; + constexpr uint32_t ROR = 0x60005033; //Zbb + constexpr uint32_t RORI = 0x60005013; //Zbb + constexpr uint32_t ROL = 0x60001033; //Zbb + constexpr uint32_t FMV_X_D = 0xe2000053; + constexpr uint32_t FMV_D_X = 0xf2000053; + constexpr uint32_t FMV_D = 0x22000053; + constexpr uint32_t FADD_D = 0x02007053; + constexpr uint32_t FSUB_D = 0x0a007053; + constexpr uint32_t FMUL_D = 0x12007053; + constexpr uint32_t FDIV_D = 0x1a007053; + constexpr uint32_t FSQRT_D = 0x5a007053; + constexpr uint32_t FCVT_D_W = 0xd2000053; + constexpr uint32_t FSRM = 0x00201073; + constexpr uint16_t C_BEQZ = 0xc001; + constexpr uint32_t BEQ = 0x00000063; + constexpr uint16_t C_BNEZ = 0xe001; + constexpr uint32_t JAL = 0x0000006f; + constexpr uint16_t 
C_RET = 0x8082; +} + +namespace randomx { + + constexpr size_t MaxRandomXInstrCodeSize = 56; //FDIV_M requires 56 bytes of rv64 code + constexpr size_t MaxSuperscalarInstrSize = 12; //IXOR_C requires 12 bytes of rv64 code + constexpr size_t SuperscalarProgramHeader = 136; //overhead per superscalar program + constexpr size_t CodeAlign = 4096; //align code size to a multiple of 4 KiB + constexpr size_t LiteralPoolSize = CodeAlign; + constexpr size_t SuperscalarLiteraPoolSize = RANDOMX_CACHE_ACCESSES * CodeAlign; + constexpr size_t ReserveCodeSize = CodeAlign; //prologue, epilogue + reserve + + constexpr size_t RandomXCodeSize = alignSize(LiteralPoolSize + ReserveCodeSize + MaxRandomXInstrCodeSize * RANDOMX_PROGRAM_SIZE, CodeAlign); + constexpr size_t SuperscalarSize = alignSize(SuperscalarLiteraPoolSize + ReserveCodeSize + (SuperscalarProgramHeader + MaxSuperscalarInstrSize * SuperscalarMaxSize) * RANDOMX_CACHE_ACCESSES, CodeAlign); + + static_assert(RandomXCodeSize < INT32_MAX / 2, "RandomXCodeSize is too large"); + static_assert(SuperscalarSize < INT32_MAX / 2, "SuperscalarSize is too large"); + + constexpr uint32_t CodeSize = RandomXCodeSize + SuperscalarSize; + constexpr uint32_t ExecutableSize = CodeSize - LiteralPoolSize; + + constexpr int32_t LiteralPoolOffset = LiteralPoolSize / 2; + constexpr int32_t SuperScalarLiteralPoolOffset = RandomXCodeSize; + constexpr int32_t SuperScalarLiteralPoolRefOffset = RandomXCodeSize + (RANDOMX_CACHE_ACCESSES - 1) * LiteralPoolSize + LiteralPoolOffset; + constexpr int32_t SuperScalarHashOffset = SuperScalarLiteralPoolOffset + SuperscalarLiteraPoolSize; + + constexpr int maskLog2(uint32_t x, int prev) { + return x == 1 ? prev : maskLog2(x >> 1, prev + 1); + } + + constexpr int32_t unsigned32ToSigned2sCompl(uint32_t x) { + return (-1 == ~0) ? (int32_t)x : (x > INT32_MAX ? 
(-(int32_t)(UINT32_MAX - x) - 1) : (int32_t)x); + } + + constexpr int MaskL1Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L1, 0); + constexpr int MaskL2Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L2, 0); + constexpr int MaskL3Shift = 32 - maskLog2(RANDOMX_SCRATCHPAD_L3, 0); + + constexpr int RcpLiteralsOffset = 144; + + constexpr int LiteralPoolReg = 3; //x3 + constexpr int SpadReg = 5; //x5 + constexpr int DataReg = 6; //x6 + constexpr int SuperscalarReg = 7; //x7 + constexpr int SshTmp1Reg = 28; //x28 + constexpr int SshTmp2Reg = 29; //x29 + constexpr int SshPoolReg = 30; //x30 + constexpr int SshRcpReg = 31; //x31 + constexpr int Tmp1Reg = 8; //x8 + constexpr int Tmp2Reg = 9; //x9 + constexpr int Tmp1RegF = 24; //f24 + constexpr int Tmp2RegF = 25; //f25 + constexpr int MaskL1Reg = 10; //x10 + constexpr int MaskL2Reg = 11; //x11 + constexpr int MaskFscalReg = 12; //x12 + constexpr int MaskEclear = 13; //x13 + constexpr int MaskEsetLo = 14; //x14 + constexpr int MaskEsetHi = 15; //x15 + constexpr int MaskL3Reg = 1; //x1 + constexpr int ReturnReg = 1; //x1 + constexpr int SpAddr0Reg = 26; //x26 + constexpr int OffsetXC = -8; //x8-x15 + constexpr int OffsetR = 16; //x16-x23 + constexpr int OffsetF = 0; //f0-f7 + constexpr int OffsetE = 8; //f8-f15 + constexpr int OffsetA = 16; //f16-f23 + constexpr int OffsetRcp = 28; //x28-x31 + constexpr int OffsetRcpF = 22; //f26-f31 + constexpr int OffsetSsh = 8; //x8-x15 + + //destination register (bit 7+) + constexpr int rvrd(int reg) { + return reg << 7; + } + + //first source register (bit 15+) + constexpr int rvrs1(int reg) { + return reg << 15; + } + + //second source register (bit 20+) + constexpr int rvrs2(int reg) { + return reg << 20; + } + + //compressed source register (bit 2+) + constexpr int rvcrs(int reg) { + return reg << 2; + } + + //base instruction: {op} x{rd}, x{rs1}, x{rs2} + constexpr uint32_t rvi(uint32_t op, int rd, int rs1, int rs2 = 0) { + return op | rvrs2(rs2) | rvrs1(rs1) | rvrd(rd); + } + + //compressed instruction: op x{rd}, x{rs} + constexpr uint16_t rvc(uint16_t op, int rd, int rs) { + return op | rvrd(rd) | rvcrs(rs); + } + + //compressed instruction: op x{rd}, imm6 + constexpr uint16_t rvc(uint16_t op, int imm5, int rd, int imm40) { + return op | (imm5 << 12) | rvrd(rd) | (imm40 << 2); + } + + constexpr int regR(int reg) { + return reg + OffsetR; + } + + constexpr int regLoA(int reg) { + return 2 * reg + OffsetA; + } + + constexpr int regHiA(int reg) { + return 2 * reg + OffsetA + 1; + } + + constexpr int regLoF(int reg) { + return 2 * reg + OffsetF; + } + + constexpr int regHiF(int reg) { + return 2 * reg + OffsetF + 1; + } + + constexpr int regLoE(int reg) { + return 2 * reg + OffsetE; + } + + constexpr int regHiE(int reg) { + return 2 * reg + OffsetE + 1; + } + + constexpr int regRcp(int reg) { + return reg + OffsetRcp; + } + + constexpr int regRcpF(int reg) { + return reg + OffsetRcpF; + } + + constexpr int regSS(int reg) { + return reg + OffsetSsh; + } + + static const uint8_t* codeLiterals = (uint8_t*)&randomx_riscv64_literals; + static const uint8_t* codeLiteralsEnd = (uint8_t*)&randomx_riscv64_literals_end; + static const uint8_t* codeDataInit = (uint8_t*)&randomx_riscv64_data_init; + static const uint8_t* codeFixDataCall = (uint8_t*)&randomx_riscv64_fix_data_call; + static const uint8_t* codePrologue = (uint8_t*)&randomx_riscv64_prologue; + static const uint8_t* codeLoopBegin = (uint8_t*)&randomx_riscv64_loop_begin; + static const uint8_t* codeDataRead = (uint8_t*)&randomx_riscv64_data_read; + static const uint8_t* 
codeDataReadLight = (uint8_t*)&randomx_riscv64_data_read_light; + static const uint8_t* codeFixLoopCall = (uint8_t*)&randomx_riscv64_fix_loop_call; + static const uint8_t* codeSpadStore = (uint8_t*)&randomx_riscv64_spad_store; + static const uint8_t* codeSpadStoreHardAes = (uint8_t*)&randomx_riscv64_spad_store_hardaes; + static const uint8_t* codeSpadStoreSoftAes = (uint8_t*)&randomx_riscv64_spad_store_softaes; + static const uint8_t* codeLoopEnd = (uint8_t*)&randomx_riscv64_loop_end; + static const uint8_t* codeFixContinueLoop = (uint8_t*)&randomx_riscv64_fix_continue_loop; + static const uint8_t* codeEpilogue = (uint8_t*)&randomx_riscv64_epilogue; + static const uint8_t* codeSoftAes = (uint8_t*)&randomx_riscv64_softaes; + static const uint8_t* codeProgramEnd = (uint8_t*)&randomx_riscv64_program_end; + static const uint8_t* codeSshInit = (uint8_t*)&randomx_riscv64_ssh_init; + static const uint8_t* codeSshLoad = (uint8_t*)&randomx_riscv64_ssh_load; + static const uint8_t* codeSshPrefetch = (uint8_t*)&randomx_riscv64_ssh_prefetch; + static const uint8_t* codeSshEnd = (uint8_t*)&randomx_riscv64_ssh_end; + + static const int32_t sizeLiterals = codeLiteralsEnd - codeLiterals; + static const int32_t sizeDataInit = codePrologue - codeDataInit; + static const int32_t sizePrologue = codeLoopBegin - codePrologue; + static const int32_t sizeLoopBegin = codeDataRead - codeLoopBegin; + static const int32_t sizeDataRead = codeDataReadLight - codeDataRead; + static const int32_t sizeDataReadLight = codeSpadStore - codeDataReadLight; + static const int32_t sizeSpadStore = codeSpadStoreHardAes - codeSpadStore; + static const int32_t sizeSpadStoreSoftAes = codeLoopEnd - codeSpadStoreSoftAes; + static const int32_t sizeLoopEnd = codeEpilogue - codeLoopEnd; + static const int32_t sizeEpilogue = codeSoftAes - codeEpilogue; + static const int32_t sizeSoftAes = codeProgramEnd - codeSoftAes; + static const int32_t sizeSshInit = codeSshLoad - codeSshInit; + static const int32_t sizeSshLoad = codeSshPrefetch - codeSshLoad; + static const int32_t sizeSshPrefetch = codeSshEnd - codeSshPrefetch; + + static const int32_t offsetFixDataCall = codeFixDataCall - codeDataInit; + static const int32_t offsetFixLoopCall = codeFixLoopCall - codeDataReadLight; + static const int32_t offsetFixContinueLoop = codeFixContinueLoop - codeLoopEnd; + + static const int32_t LoopTopPos = LiteralPoolSize + sizeDataInit + sizePrologue; + static const int32_t RandomXCodePos = LoopTopPos + sizeLoopBegin; + + static void clearCache(CodeBuffer& buf) { +#ifdef __GNUC__ + __builtin___clear_cache((char*)buf.code, (char*)(buf.code + CodeSize)); +#endif + } + + //emits code to calculate: x{dst} = x{src} + {imm32} + //takes 1-3 isns, 2-10 bytes + static void emitImm32(CodeBuffer& buf, int32_t imm, int dst, int src = 0, int tmp = 0) { + + //lower 12 bits + int32_t limm = (imm << 20) >> 20; + //upper 20 bits + int32_t uimm = (imm >> 12) + (limm < 0); + + //If there are no upper bits, the whole thing + //can be done with a single instruction. + if (uimm == 0) { + //addi x{dst}, x{src}, {limm} + buf.emit(rvi(rv64::ADDI, dst, src, limm)); + return; + } + + //dst1 is the register where imm will be materialized + int dst1 = src != dst ? dst : tmp; + assert(dst1 != 0); + //src1 is the register that will be added to the result + int src1 = src != dst ? 
src : dst1; + + //load upper bits + if (uimm >= -32 && uimm <= 31) { + //c.lui x{dst1}, {uimm} + buf.emit(rvc(rv64::C_LUI, (uimm < 0), dst1, (uimm & 31))); + } + else { + //lui x{dst1}, {uimm} + buf.emit(rv64::LUI | (uimm << 12) | rvrd(dst1)); + } + //load lower bits + if (limm != 0) { + //Note: this must be addiw NOT addi, otherwise the upper 32 bits + //of the 64-bit register will be incorrect. + //addiw x{dst1}, x{dst1}, {limm} + buf.emit(rvi(rv64::ADDIW, dst1, dst1, limm)); + } + //add src + if (src1 != 0) { + //c.add x{dst}, x{src1} + buf.emit(rvc(rv64::C_ADD, dst, src1)); + } + } + + //x9 = &Scratchpad[isn.imm] + //takes 3 isns, 10 bytes + static void genAddressRegImm(CodeBuffer& buf, const Instruction& isn) { + //signed offset 8-byte aligned + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()) & ScratchpadL3Mask; + //x9 = x5 + {imm} + emitImm32(buf, imm, Tmp2Reg, SpadReg, Tmp1Reg); + } + + //x9 = &Scratchpad[isn.src + isn.imm] (for reading) + //takes 5 isns, 12 bytes + static void genAddressReg(CodeBuffer& buf, const Instruction& isn) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{src} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.src), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, (Tmp2Reg + OffsetXC), (maskReg + OffsetXC))); + //c.add x9, x{spadReg} + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + + //x8 = Scratchpad[isn] + static void loadFromScratchpad(CodeBuffer& buf, const Instruction& isn) { + if (isn.src != isn.dst) { + //x9 = &Scratchpad[isn.src + isn.imm] + genAddressReg(buf, isn); + } + else { + ///x9 = &Scratchpad[isn.imm] + genAddressRegImm(buf, isn); + } + //c.ld x8, 0(x9) + buf.emit(rvc(rv64::C_LD, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + } + + //x9 = &Scratchpad[isn.dst + isn.imm32] (for writing) + //takes 5 isns, 12-16 bytes + static void genAddressRegDst(CodeBuffer& buf, const Instruction& isn) { + if (isn.getModCond() < StoreL3Condition) { + int shift, maskReg; + if (isn.getModMem()) { + shift = MaskL1Shift; + maskReg = MaskL1Reg; + } + else { + shift = MaskL2Shift; + maskReg = MaskL2Reg; + } + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //c.and x9, x{maskReg} + buf.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, maskReg + OffsetXC)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + else { + int shift = MaskL3Shift; + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm = (imm << shift) >> shift; + //x9 = x{dst} + {imm} + emitImm32(buf, imm, Tmp2Reg, regR(isn.dst), Tmp1Reg); + //and x9, x9, x1 + buf.emit(rvi(rv64::AND, Tmp2Reg, Tmp2Reg, MaskL3Reg)); + //c.add x9, x5 + buf.emit(rvc(rv64::C_ADD, Tmp2Reg, SpadReg)); + } + } + + static void emitRcpLiteral1(CodeBuffer& buf, uint64_t literal) { + //first 238 at positive offsets + if (buf.rcpCount < 238) { + buf.emitAt(LiteralPoolOffset + RcpLiteralsOffset + buf.rcpCount * 8, literal); + buf.rcpCount++; + } + //next 256 at negative offsets + else if (buf.rcpCount < 494) { + buf.emitAt(buf.rcpCount * 8 - (2048 - RcpLiteralsOffset), literal); + buf.rcpCount++; + } + else { + //checked at compile time, but double-check here + throw std::runtime_error("Literal pool overflow"); + } + } + + static void emitRcpLiteral2(CodeBuffer& buf, 
uint64_t literal, int32_t numLiterals) { + //store the current literal in the pool + int32_t offset = 2040 - buf.rcpCount * 8; + buf.emitAt(SuperScalarLiteralPoolRefOffset + offset, literal); + buf.rcpCount++; + if (buf.rcpCount >= numLiterals) { + return; + } + //load the next literal + offset -= 8; + int32_t imm = offset & 0xfff; + //ld x31, {offset}(x30) + buf.emit(rvi(rv64::LD, SshRcpReg, SshPoolReg, imm)); + if (imm == 0x800) { + //move pool pointer back 4KB + //c.lui x29, 0xfffff + buf.emit(rvc(rv64::C_LUI, 1, SshTmp2Reg, 31)); + //c.add x30, x29 + buf.emit(rvc(rv64::C_ADD, SshPoolReg, SshTmp2Reg)); + } + } + + static void emitJump(CodeBuffer& buf, int dst, int32_t codePos, int32_t targetPos) { + int32_t imm = targetPos - codePos; + int32_t imm20 = (imm < 0) << 11; + int32_t imm1912 = (imm >> 7) & 8160; + int32_t imm11 = (imm >> 11) & 1; + int32_t imm101 = imm & 2046; + //jal x{dst}, {imm} + buf.emitAt(codePos, rvi(rv64::JAL, dst + imm1912, 0, imm20 + imm101 + imm11)); + } + + static void emitInstruction(CompilerState& state, Instruction isn, int i) { + state.instructionOffsets[i] = state.codePos; + opcodeMap1[isn.opcode](state, isn, i); + } + + static void emitProgramPrefix(CompilerState& state, Program& prog, ProgramConfiguration& pcfg) { + state.codePos = RandomXCodePos; + state.rcpCount = 0; + state.emitAt(LiteralPoolOffset + sizeLiterals, pcfg.eMask[0]); + state.emitAt(LiteralPoolOffset + sizeLiterals + 8, pcfg.eMask[1]); + for (unsigned i = 0; i < RegistersCount; ++i) { + state.registerUsage[i] = -1; + } + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + instr.src %= RegistersCount; + instr.dst %= RegistersCount; + emitInstruction(state, instr, i); + } + } + + static void emitProgramSuffix(CompilerState& state, ProgramConfiguration& pcfg) { + state.emit(codeSpadStore, sizeSpadStore); + int32_t fixPos = state.codePos; + state.emit(codeLoopEnd, sizeLoopEnd); + //xor x26, x{readReg0}, x{readReg1} + state.emitAt(fixPos, rvi(rv64::XOR, SpAddr0Reg, regR(pcfg.readReg0), regR(pcfg.readReg1))); + fixPos += offsetFixContinueLoop; + //j LoopTop + emitJump(state, 0, fixPos, LoopTopPos); + state.emit(codeEpilogue, sizeEpilogue); + } + + static void generateSuperscalarCode(CodeBuffer& buf, Instruction isn, const std::vector& reciprocalCache) { + switch ((SuperscalarInstructionType)isn.opcode) + { + case randomx::SuperscalarInstructionType::ISUB_R: + //c.sub x{dst}, x{src} + buf.emit(rvc(rv64::C_SUB, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IXOR_R: + //c.xor x{dst}, x{src} + buf.emit(rvc(rv64::C_XOR, regSS(isn.dst) + OffsetXC, regSS(isn.src) + OffsetXC)); + break; + case randomx::SuperscalarInstructionType::IADD_RS: + { + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), regSS(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add x{dst}, x{src}, x{dst} + buf.emit(rv64::SHXADD | rvrs2(regSS(isn.dst)) | rvrs1(regSS(isn.src)) | (shift << 13) | rvrd(regSS(isn.dst))); +#else + //slli x28, x{src}, {shift} + buf.emit(rvi(rv64::SLLI, SshTmp1Reg, regSS(isn.src), shift)); + //c.add x{dst}, x28 + buf.emit(rvc(rv64::C_ADD, regSS(isn.dst), SshTmp1Reg)); +#endif + } + } + break; + case randomx::SuperscalarInstructionType::IMUL_R: + //mul x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IROR_C: + { +#ifdef __riscv_zbb + int32_t imm 
= isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + buf.emit(rvi(rv64::RORI, regSS(isn.dst), regSS(isn.dst), imm)); +#else + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x28, x{dst}, {immr} + buf.emit(rvi(rv64::SRLI, SshTmp1Reg, regSS(isn.dst), immr)); + //c.slli x{dst}, {imml} + buf.emit(rvc(rv64::C_SLLI, imml5, regSS(isn.dst), imml40)); + //or x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::OR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); +#endif + } + break; + case randomx::SuperscalarInstructionType::IADD_C7: + case randomx::SuperscalarInstructionType::IADD_C8: + case randomx::SuperscalarInstructionType::IADD_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(buf, imm, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg); + } + break; + case randomx::SuperscalarInstructionType::IXOR_C7: + case randomx::SuperscalarInstructionType::IXOR_C8: + case randomx::SuperscalarInstructionType::IXOR_C9: + { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x28 = {imm} + emitImm32(buf, imm, SshTmp1Reg); + //xor x{dst}, x{dst}, x28 + buf.emit(rvi(rv64::XOR, regSS(isn.dst), regSS(isn.dst), SshTmp1Reg)); + } + break; + case randomx::SuperscalarInstructionType::IMULH_R: + //mulhu x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULHU, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::ISMULH_R: + //mulh x{dst}, x{dst}, x{src} + buf.emit(rvi(rv64::MULH, regSS(isn.dst), regSS(isn.dst), regSS(isn.src))); + break; + case randomx::SuperscalarInstructionType::IMUL_RCP: + //mul x{dst}, x{dst}, x31 + buf.emit(rvi(rv64::MUL, regSS(isn.dst), regSS(isn.dst), SshRcpReg)); + //load the next literal into x31 + emitRcpLiteral2(buf, reciprocalCache[isn.getImm32()], reciprocalCache.size()); + break; + default: + UNREACHABLE; + } + } + + size_t JitCompilerRV64::getCodeSize() { + return CodeSize; + } + + JitCompilerRV64::JitCompilerRV64() { + state.code = (uint8_t*)allocMemoryPages(CodeSize); + if (state.code == nullptr) + throw std::runtime_error("allocMemoryPages"); + state.emitAt(LiteralPoolOffset, codeLiterals, sizeLiterals); + state.emitAt(LiteralPoolSize, codeDataInit, sizeDataInit + sizePrologue + sizeLoopBegin); + entryDataInit = state.code + LiteralPoolSize; + entryProgram = state.code + LiteralPoolSize + sizeDataInit; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, LiteralPoolSize + offsetFixDataCall, SuperScalarHashOffset); + } + + JitCompilerRV64::~JitCompilerRV64() { + freePagedMemory(state.code, CodeSize); + } + + void JitCompilerRV64::enableAll() { + setPagesRWX(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::enableWriting() { + setPagesRW(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::enableExecution() { + setPagesRX(entryDataInit, ExecutableSize); + } + + void JitCompilerRV64::generateProgram(Program& prog, ProgramConfiguration& pcfg) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataRead, sizeDataRead); + //xor x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + void JitCompilerRV64::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) { + emitProgramPrefix(state, prog, pcfg); + int32_t fixPos = state.codePos; + state.emit(codeDataReadLight, sizeDataReadLight); + //xor 
x8, x{readReg2}, x{readReg3} + state.emitAt(fixPos, rvi(rv64::XOR, Tmp1Reg, regR(pcfg.readReg2), regR(pcfg.readReg3))); + int32_t imm = datasetOffset / CacheLineSize; + int32_t limm = (imm << 20) >> 20; + int32_t uimm = (imm >> 12) + (limm < 0); + //lui x9, {uimm} + state.emitAt(fixPos + 4, rv64::LUI | (uimm << 12) | rvrd(Tmp2Reg)); + //addi x9, x9, {limm} + state.emitAt(fixPos + 8, rvi(rv64::ADDI, Tmp2Reg, Tmp2Reg, limm)); + fixPos += offsetFixLoopCall; + //jal x1, SuperscalarHash + emitJump(state, ReturnReg, fixPos, SuperScalarHashOffset); + emitProgramSuffix(state, pcfg); + clearCache(state); + } + + void JitCompilerRV64::generateSuperscalarHash(SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES], std::vector& reciprocalCache) { + state.codePos = SuperScalarHashOffset; + state.rcpCount = 0; + state.emit(codeSshInit, sizeSshInit); + for (unsigned j = 0; j < RANDOMX_CACHE_ACCESSES; ++j) { + SuperscalarProgram& prog = programs[j]; + for (unsigned i = 0; i < prog.getSize(); ++i) { + Instruction instr = prog(i); + generateSuperscalarCode(state, instr, reciprocalCache); + } + state.emit(codeSshLoad, sizeSshLoad); + if (j < RANDOMX_CACHE_ACCESSES - 1) { + int32_t fixPos = state.codePos; + state.emit(codeSshPrefetch, sizeSshPrefetch); + //and x7, x{addrReg}, x7 + state.emitAt(fixPos, rvi(rv64::AND, SuperscalarReg, regSS(prog.getAddressRegister()), SuperscalarReg)); + } + } + state.emit(rvc(rv64::C_RET, 0, 0)); + clearCache(state); + } + + static void v1_IADD_RS(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + int shift = isn.getModShift(); + if (shift == 0) { + //c.add x{dst}, x{src} + state.emit(rvc(rv64::C_ADD, regR(isn.dst), regR(isn.src))); + } + else { +#ifdef __riscv_zba + //sh{1,2,3}add x{dst}, x{src}, x{dst} + state.emit(rv64::SHXADD | rvrs2(regR(isn.dst)) | rvrs1(regR(isn.src)) | (shift << 13) | rvrd(regR(isn.dst))); +#else + //slli x8, x{src}, {shift} + state.emit(rvi(rv64::SLLI, Tmp1Reg, regR(isn.src), shift)); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); +#endif + } + if (isn.dst == RegisterNeedsDisplacement) { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x{dst} = x{dst} + {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + static void v1_IADD_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //c.add x{dst}, x8 + state.emit(rvc(rv64::C_ADD, regR(isn.dst), Tmp1Reg)); + } + + static void v1_ISUB_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //sub x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(-isn.getImm32()); //convert to add + //x{dst} = x{dst} + {-imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp1Reg); + } + } + + static void v1_ISUB_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //sub x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SUB, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMUL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //mul x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + static void v1_IMUL_M(HANDLER_ARGS) { + 
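+ //IMUL_M: dst = dst * [mem] (low 64 bits of the product)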
state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulhu x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + static void v1_IMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mulhu x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULHU, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_ISMULH_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //mulh x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + + static void v1_ISMULH_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //mulh x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MULH, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IMUL_RCP(HANDLER_ARGS) { + uint64_t divisor = isn.getImm32(); + if (!isZeroOrPowerOf2(divisor)) { + state.registerUsage[isn.dst] = i; + if (state.rcpCount < 4) { + //mul x{dst}, x{dst}, x{rcp} + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), regRcp(state.rcpCount))); + } + else if (state.rcpCount < 10) { + //fmv.x.d x8, f{rcp} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regRcpF(state.rcpCount))); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + else { + int32_t offset = RcpLiteralsOffset + state.rcpCount * 8; + //ld x8, {offset}(x3) + state.emit(rvi(rv64::LD, Tmp1Reg, LiteralPoolReg, offset)); + //mul x{dst}, x{dst}, x8 + state.emit(rvi(rv64::MUL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + emitRcpLiteral1(state, randomx_reciprocal_fast(divisor)); + } + } + + static void v1_INEG_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + //sub x{dst}, x0, x{dst} + state.emit(rvi(rv64::SUB, regR(isn.dst), 0, regR(isn.dst))); + } + + static void v1_IXOR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + if (isn.src != isn.dst) { + //xor x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + //x8 = {imm} + emitImm32(state, imm, Tmp1Reg); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + } + + static void v1_IXOR_M(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; + loadFromScratchpad(state, isn); + //xor x{dst}, x{dst}, x8 + state.emit(rvi(rv64::XOR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } + + static void v1_IROR_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //ror x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROR, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, Tmp1Reg, 0, regR(isn.src))); + //srl x9, x{dst}, x{src} + state.emit(rvi(rv64::SRL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //sll x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SLL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t immr = isn.getImm32() & 63; + int32_t imml = -immr & 63; + int32_t imml5 = imml >> 5; + 
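+ //rotate right by immr == (x >> immr) | (x << imml) with imml = (64 - immr) & 63
+ //C.SLLI encodes the 6-bit shift amount as imm[5] (imml5) and imm[4:0] (imml40)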
int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + static void v1_IROL_R(HANDLER_ARGS) { + state.registerUsage[isn.dst] = i; +#ifdef __riscv_zbb + if (isn.src != isn.dst) { + //rol x{dst}, x{dst}, x{src} + state.emit(rvi(rv64::ROL, regR(isn.dst), regR(isn.dst), regR(isn.src))); + } + else { + int32_t imm = -isn.getImm32() & 63; + //rori x{dst}, x{dst}, {imm} + state.emit(rvi(rv64::RORI, regR(isn.dst), regR(isn.dst), imm)); + } +#else + if (isn.src != isn.dst) { + //sub x8, x0, x{src} + state.emit(rvi(rv64::SUB, Tmp1Reg, 0, regR(isn.src))); + //sll x9, x{dst}, x{src} + state.emit(rvi(rv64::SLL, Tmp2Reg, regR(isn.dst), regR(isn.src))); + //srl x{dst}, x{dst}, x8 + state.emit(rvi(rv64::SRL, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + //or x{dst}, x{dst}, x9 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp2Reg)); + } + else { + int32_t imml = isn.getImm32() & 63; + int32_t immr = -imml & 63; + int32_t imml5 = imml >> 5; + int32_t imml40 = imml & 31; + //srli x8, x{dst}, {immr} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.dst), immr)); + //c.slli x{dst}, {imml} + state.emit(rvc(rv64::C_SLLI, imml5, regR(isn.dst), imml40)); + //or x{dst}, x{dst}, x8 + state.emit(rvi(rv64::OR, regR(isn.dst), regR(isn.dst), Tmp1Reg)); + } +#endif + } + + static void v1_ISWAP_R(HANDLER_ARGS) { + if (isn.src != isn.dst) { + state.registerUsage[isn.dst] = i; + state.registerUsage[isn.src] = i; + //c.mv x8, x{dst} + state.emit(rvc(rv64::C_MV, Tmp1Reg, regR(isn.dst))); + //c.mv x{dst}, x{src} + state.emit(rvc(rv64::C_MV, regR(isn.dst), regR(isn.src))); + //c.mv x{src}, x8 + state.emit(rvc(rv64::C_MV, regR(isn.src), Tmp1Reg)); + } + } + + static void v1_FSWAP_R(HANDLER_ARGS) { + //fmv.d f24, f{dst_lo} + state.emit(rvi(rv64::FMV_D, Tmp1RegF, regLoF(isn.dst), regLoF(isn.dst))); + //fmv.d f{dst_lo}, f{dst_hi} + state.emit(rvi(rv64::FMV_D, regLoF(isn.dst), regHiF(isn.dst), regHiF(isn.dst))); + //fmv.d f{dst_hi}, f24 + state.emit(rvi(rv64::FMV_D, regHiF(isn.dst), Tmp1RegF, Tmp1RegF)); + } + + static void v1_FADD_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fadd.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fadd.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + static void v1_FADD_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fadd.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FADD_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fadd.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FADD_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + static void v1_FSUB_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fsub.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), regLoA(isn.src))); + //fsub.d 
f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), regHiA(isn.src))); + } + + static void v1_FSUB_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //c.lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //c.lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fsub.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FSUB_D, regLoF(isn.dst), regLoF(isn.dst), Tmp1RegF)); + //fsub.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FSUB_D, regHiF(isn.dst), regHiF(isn.dst), Tmp2RegF)); + } + + static void v1_FSCAL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fmv.x.d x8, f{dst_lo} + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, regLoF(isn.dst))); + //fmv.x.d x9, f{dst_hi} + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, regHiF(isn.dst))); + //c.xor x8, x12 + state.emit(rvc(rv64::C_XOR, Tmp1Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //c.xor x9, x12 + state.emit(rvc(rv64::C_XOR, Tmp2Reg + OffsetXC, MaskFscalReg + OffsetXC)); + //fmv.d.x f{dst_lo}, x8 + state.emit(rvi(rv64::FMV_D_X, regLoF(isn.dst), Tmp1Reg)); + //fmv.d.x f{dst_hi}, x9 + state.emit(rvi(rv64::FMV_D_X, regHiF(isn.dst), Tmp2Reg)); + } + + static void v1_FMUL_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + isn.src %= RegisterCountFlt; + //fmul.d f{dst_lo}, f{dst_lo}, f{src_lo} + state.emit(rvi(rv64::FMUL_D, regLoE(isn.dst), regLoE(isn.dst), regLoA(isn.src))); + //fmul.d f{dst_hi}, f{dst_hi}, f{src_hi} + state.emit(rvi(rv64::FMUL_D, regHiE(isn.dst), regHiE(isn.dst), regHiA(isn.src))); + } + + static void v1_FDIV_M(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //x9 = mem + genAddressReg(state, isn); + //lw x8, 0(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, Tmp1Reg + OffsetXC)); + //lw x9, 4(x9) + state.emit(rvc(rv64::C_LW, Tmp2Reg + OffsetXC, 16 + Tmp2Reg + OffsetXC)); + //fcvt.d.w f24, x8 + state.emit(rvi(rv64::FCVT_D_W, Tmp1RegF, Tmp1Reg)); + //fcvt.d.w f25, x9 + state.emit(rvi(rv64::FCVT_D_W, Tmp2RegF, Tmp2Reg)); + //fmv.x.d x8, f24 + state.emit(rvi(rv64::FMV_X_D, Tmp1Reg, Tmp1RegF)); + //fmv.x.d x9, f25 + state.emit(rvi(rv64::FMV_X_D, Tmp2Reg, Tmp2RegF)); + //c.and x8, x13 + state.emit(rvc(rv64::C_AND, Tmp1Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.and x9, x13 + state.emit(rvc(rv64::C_AND, Tmp2Reg + OffsetXC, MaskEclear + OffsetXC)); + //c.or x8, x14 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, MaskEsetLo + OffsetXC)); + //c.or x9, x15 + state.emit(rvc(rv64::C_OR, Tmp2Reg + OffsetXC, MaskEsetHi + OffsetXC)); + //fmv.d.x f24, x8 + state.emit(rvi(rv64::FMV_D_X, Tmp1RegF, Tmp1Reg)); + //fmv.d.x f25, x9 + state.emit(rvi(rv64::FMV_D_X, Tmp2RegF, Tmp2Reg)); + //fdiv.d f{dst_lo}, f{dst_lo}, f24 + state.emit(rvi(rv64::FDIV_D, regLoE(isn.dst), regLoE(isn.dst), Tmp1RegF)); + //fdiv.d f{dst_hi}, f{dst_hi}, f25 + state.emit(rvi(rv64::FDIV_D, regHiE(isn.dst), regHiE(isn.dst), Tmp2RegF)); + } + + static void v1_FSQRT_R(HANDLER_ARGS) { + isn.dst %= RegisterCountFlt; + //fsqrt.d f{dst_lo}, f{dst_lo} + state.emit(rvi(rv64::FSQRT_D, regLoE(isn.dst), regLoE(isn.dst))); + //fsqrt.d f{dst_hi}, f{dst_hi} + state.emit(rvi(rv64::FSQRT_D, regHiE(isn.dst), regHiE(isn.dst))); + } + + static void v1_CBRANCH(HANDLER_ARGS) { + int reg = isn.dst; + int target = state.registerUsage[reg] + 1; + int shift = isn.getModCond() + ConditionOffset; + 
int32_t imm = unsigned32ToSigned2sCompl(isn.getImm32()); + imm |= (1UL << shift); + if (ConditionOffset > 0 || shift > 0) + imm &= ~(1UL << (shift - 1)); + //x8 = branchMask + emitImm32(state, (int32_t)ConditionMask << shift, Tmp1Reg); + //x{dst} += {imm} + emitImm32(state, imm, regR(isn.dst), regR(isn.dst), Tmp2Reg); + //and x8, x8, x{dst} + state.emit(rvi(rv64::AND, Tmp1Reg, Tmp1Reg, regR(isn.dst))); + int32_t targetPos = state.instructionOffsets[target]; + int offset = targetPos - state.codePos; + if (offset >= -256) { //C.BEQZ only has a range of 256B + //c.beqz x8, {offset} + int imm8 = 1; //sign bit is always 1 + int imm21 = offset & 6; //offset[2:1] + int imm5 = (offset >> 5) & 1; //offset[5] + int imm43 = offset & 24; //offset[4:3] + int imm76 = (offset >> 3) & 24; //offset[7:6] + state.emit(rvc(rv64::C_BEQZ, imm8, imm43 + (Tmp1Reg + OffsetXC), imm76 + imm21 + imm5)); + } + else if (offset >= -4096) { //BEQ only has a range of 4KB + //beq x8, x0, offset + int imm12 = 1 << 11; //sign bit is always 1 + int imm105 = offset & 2016; //offset[10:5] + int imm41 = offset & 30; //offset[4:1] + int imm11 = (offset >> 11) & 1; //offset[11] + state.emit(rvi(rv64::BEQ, imm41 + imm11, Tmp1Reg, imm12 + imm105)); + } + else { + //c.bnez x8, +6 + state.emit(rvc(rv64::C_BNEZ, Tmp1Reg + OffsetXC, 6)); + //j targetPos + emitJump(state, 0, state.codePos, targetPos); + state.codePos += 4; + } + //mark all registers as used + for (unsigned j = 0; j < RegistersCount; ++j) { + state.registerUsage[j] = i; + } + } + + static void v1_CFROUND(HANDLER_ARGS) { + int32_t imm = (isn.getImm32() - 2) & 63; //-2 to avoid a later left shift to multiply by 4 + if (imm != 0) { +#ifdef __riscv_zbb + //rori x8, x{src}, {imm} + state.emit(rvi(rv64::RORI, Tmp1Reg, regR(isn.src), imm)); +#else + int32_t imml = -imm & 63; + //srli x8, x{src}, {imm} + state.emit(rvi(rv64::SRLI, Tmp1Reg, regR(isn.src), imm)); + //slli x9, x{src}, {imml} + state.emit(rvi(rv64::SLLI, Tmp2Reg, regR(isn.src), imml)); + //c.or x8, x9 + state.emit(rvc(rv64::C_OR, Tmp1Reg + OffsetXC, Tmp2Reg + OffsetXC)); +#endif + //c.andi x8, 12 + state.emit(rvc(rv64::C_ANDI, Tmp1Reg + OffsetXC, 12)); + } + else { + //and x8, x{src}, 12 + state.emit(rvi(rv64::ANDI, Tmp1Reg, regR(isn.src), 12)); + } + //c.add x8, x3 + state.emit(rvc(rv64::C_ADD, Tmp1Reg, LiteralPoolReg)); + //c.lw x8, 64(x8) + state.emit(rvc(rv64::C_LW, Tmp1Reg + OffsetXC, 8 + Tmp1Reg + OffsetXC)); + //fsrm x8 + state.emit(rvi(rv64::FSRM, 0, Tmp1Reg, 0)); + } + + static void v1_ISTORE(HANDLER_ARGS) { + genAddressRegDst(state, isn); + //sd x{src}, 0(x9) + state.emit(rvi(rv64::SD, 0, Tmp2Reg, regR(isn.src))); + } + + static void v1_NOP(HANDLER_ARGS) { + } +} + +#include "instruction_weights.hpp" + +namespace { + +#define INST_HANDLE1(x) REPN(&randomx::v1_##x, WT(x)) +#define INST_HANDLE2(x) REPN(&randomx::v2_##x, WT(x)) + + InstructionHandler* opcodeMap1[256] = { + INST_HANDLE1(IADD_RS) + INST_HANDLE1(IADD_M) + INST_HANDLE1(ISUB_R) + INST_HANDLE1(ISUB_M) + INST_HANDLE1(IMUL_R) + INST_HANDLE1(IMUL_M) + INST_HANDLE1(IMULH_R) + INST_HANDLE1(IMULH_M) + INST_HANDLE1(ISMULH_R) + INST_HANDLE1(ISMULH_M) + INST_HANDLE1(IMUL_RCP) + INST_HANDLE1(INEG_R) + INST_HANDLE1(IXOR_R) + INST_HANDLE1(IXOR_M) + INST_HANDLE1(IROR_R) + INST_HANDLE1(IROL_R) + INST_HANDLE1(ISWAP_R) + INST_HANDLE1(FSWAP_R) + INST_HANDLE1(FADD_R) + INST_HANDLE1(FADD_M) + INST_HANDLE1(FSUB_R) + INST_HANDLE1(FSUB_M) + INST_HANDLE1(FSCAL_R) + INST_HANDLE1(FMUL_R) + INST_HANDLE1(FDIV_M) + INST_HANDLE1(FSQRT_R) + INST_HANDLE1(CBRANCH) + 
INST_HANDLE1(CFROUND) + INST_HANDLE1(ISTORE) + INST_HANDLE1(NOP) + }; +} \ No newline at end of file diff --git a/src/jit_compiler_rv64.hpp b/src/jit_compiler_rv64.hpp new file mode 100644 index 00000000..aaae57e3 --- /dev/null +++ b/src/jit_compiler_rv64.hpp @@ -0,0 +1,69 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +#include +#include +#include +#include "jit_compiler.hpp" + +namespace randomx { + + class Program; + struct ProgramConfiguration; + class SuperscalarProgram; + class Instruction; + + class JitCompilerRV64 { + public: + JitCompilerRV64(); + ~JitCompilerRV64(); + void generateProgram(Program&, ProgramConfiguration&); + void generateProgramLight(Program&, ProgramConfiguration&, uint32_t); + void generateSuperscalarHash(SuperscalarProgram programs[RANDOMX_CACHE_ACCESSES], std::vector&); + void generateDatasetInitCode() {} + ProgramFunc* getProgramFunc() { + return (ProgramFunc*)entryProgram; + } + DatasetInitFunc* getDatasetInitFunc() { + return (DatasetInitFunc*)entryDataInit; + } + uint8_t* getCode() { + return state.code; + } + size_t getCodeSize(); + void enableWriting(); + void enableExecution(); + void enableAll(); + private: + CompilerState state; + void* entryDataInit; + void* entryProgram; + }; +} diff --git a/src/jit_compiler_rv64_static.S b/src/jit_compiler_rv64_static.S new file mode 100644 index 00000000..5ecb4815 --- /dev/null +++ b/src/jit_compiler_rv64_static.S @@ -0,0 +1,1235 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define DECL(x) x + +.text +.option rvc + +#include "configuration.h" + +.global DECL(randomx_riscv64_literals) +.global DECL(randomx_riscv64_literals_end) +.global DECL(randomx_riscv64_data_init) +.global DECL(randomx_riscv64_fix_data_call) +.global DECL(randomx_riscv64_prologue) +.global DECL(randomx_riscv64_loop_begin) +.global DECL(randomx_riscv64_data_read) +.global DECL(randomx_riscv64_data_read_light) +.global DECL(randomx_riscv64_fix_loop_call) +.global DECL(randomx_riscv64_spad_store) +.global DECL(randomx_riscv64_spad_store_hardaes) +.global DECL(randomx_riscv64_spad_store_softaes) +.global DECL(randomx_riscv64_loop_end) +.global DECL(randomx_riscv64_fix_continue_loop) +.global DECL(randomx_riscv64_epilogue) +.global DECL(randomx_riscv64_softaes) +.global DECL(randomx_riscv64_program_end) +.global DECL(randomx_riscv64_ssh_init) +.global DECL(randomx_riscv64_ssh_load) +.global DECL(randomx_riscv64_ssh_prefetch) +.global DECL(randomx_riscv64_ssh_end) + +/* The literal pool can fit at most 494 IMUL_RCP literals */ +#if RANDOMX_PROGRAM_SIZE > 494 + #error RANDOMX_PROGRAM_SIZE larger than 494 is not supported. +#endif + +#define RANDOMX_CACHE_MASK (RANDOMX_ARGON_MEMORY*16-1) + +/* shared literal pool: 4 KB */ + /* space for 256 IMUL_RCP literals -2048 */ + /* filled by JIT compiler */ +DECL(randomx_riscv64_literals): +literal_pool: + /* SuperscalarHash constants +0 */ + .dword 6364136223846793005 + .dword 9298411001130361340 + .dword 12065312585734608966 + .dword 9306329213124626780 + .dword 5281919268842080866 + .dword 10536153434571861004 + .dword 3398623926847679864 + .dword 9549104520008361294 + /* CFROUND lookup table +64 */ + .word 0x00000000 /* RTN */ + .word 0x00000002 /* RDN */ + .word 0x00000003 /* RUP */ + .word 0x00000001 /* RTZ */ + /* mask literals +80,+84,+88,+92,+96,+104 */ + .word (RANDOMX_SCRATCHPAD_L1-8) + .word (RANDOMX_SCRATCHPAD_L2-8) + .word (RANDOMX_SCRATCHPAD_L3-64) + .word (RANDOMX_DATASET_BASE_SIZE-64) + .dword 0x80f0000000000000 + .dword 0x00ffffffffffffff +DECL(randomx_riscv64_literals_end): + /* E reg. 
set masks, +112,+120 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* soft AES table addresses, +128,+136 */ + .dword 0 /* filled by JIT compiler */ + .dword 0 /* filled by JIT compiler */ + /* space for 238 IMUL_RCP literals, +144 */ + .fill 238,8,0 /* filled by JIT compiler */ + +/* ================================= */ +/* Dataset init function entry point */ +/* ================================= */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/return address + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> dataset pointer + x6 -> cache pointer + x7 -> temp/itemNumber + x8-x15 -> SuperscalarHash registers + x16 -> itemNumber + x17 -> endItem + x28-x31 -> temp + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> saved x3 + 16 -> saved x8-x9 + 32 -> caller stack +*/ +DECL(randomx_riscv64_data_init): + addi sp, sp, -32 + /* dataset ptr */ + mv x5, x11 + /* cache->memory */ + ld x6, 0(x10) + /* callee saved registers */ + sd x1, 0(sp) + sd x3, 8(sp) + /* literal pool */ + lla x3, literal_pool + sd x8, 16(sp) + sd x9, 24(sp) + /* startItem */ + mv x16, x12 + /* endItem */ + mv x17, x13 +init_item: + mv x7, x16 +DECL(randomx_riscv64_fix_data_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + sd x8, 0(x5) + sd x9, 8(x5) + sd x10, 16(x5) + sd x11, 24(x5) + sd x12, 32(x5) + sd x13, 40(x5) + sd x14, 48(x5) + sd x15, 56(x5) + addi x5, x5, 64 + addi x16, x16, 1 + bltu x16, x17, init_item + ld x1, 0(sp) + ld x3, 8(sp) + ld x8, 16(sp) + ld x9, 24(sp) + addi sp, sp, 32 + ret + +/* ====================================== */ +/* Program execution function entry point */ +/* ====================================== */ + +/* Register allocation: + ---------------------- + x0 -> zero + x1 -> temp/scratchpad L3 mask + x2 -> stack pointer (sp) + x3 -> literal pool pointer + x5 -> scratchpad pointer + x6 -> dataset/cache pointer + x7 -> temp/next dataset access + x8 -> temp + x9 -> temp + x10 -> scratchpad L1 mask (0x0000000000003ff8) + x11 -> scratchpad L2 mask (0x000000000003fff8) + x12 -> FSCAL_R mask (0x80f0000000000000) + x13 -> E reg. clear mask (0x00ffffffffffffff) + x14 -> E reg. set mask (0x3*00000000******) + x15 -> E reg. set mask (0x3*00000000******) + x16-x23 -> VM registers "r0"-"r7" + x24 -> iteration counter "ic" + x25 -> VM registers "mx", "ma" + x26 -> spAddr0 + x27 -> spAddr1 + x28-x31 -> temp/literals for IMUL_RCP (4x) + + (Note: We avoid using x4 because it breaks debugging with gdb.) 
+ + f0-f7 -> VM registers "f0"-"f3" + f8-f15 -> VM registers "e0"-"e3" + f16-f23 -> VM registers "a0"-"a3" + f24-f25 -> temp + f26-f31 -> literals for IMUL_RCP (6x) + + Stack layout: + ------------------------ + sp+ + 0 -> return address + 8 -> register file ptr + 16 -> saved x3-x4 + 32 -> saved x8-x9 + 48 -> saved x18-x27 + 128 -> saved f8-f9 + 144 -> saved f18-f27 + 224 -> caller stack +*/ + +DECL(randomx_riscv64_prologue): + addi sp, sp, -224 + /* scratchpad pointer */ + mv x5, x12 + /* register file pointer */ + sd x10, 8(sp) + /* callee saved registers */ + sd x3, 16(sp) + sd x8, 32(sp) + sd x9, 40(sp) + sd x18, 48(sp) + sd x19, 56(sp) + sd x20, 64(sp) + sd x21, 72(sp) + sd x22, 80(sp) + sd x23, 88(sp) + sd x24, 96(sp) + sd x25, 104(sp) + sd x26, 112(sp) + sd x27, 120(sp) + fsd f8, 128(sp) + fsd f9, 136(sp) + fsd f18, 144(sp) + fsd f19, 152(sp) + fsd f20, 160(sp) + fsd f21, 168(sp) + fsd f22, 176(sp) + fsd f23, 184(sp) + fsd f24, 192(sp) + fsd f25, 200(sp) + fsd f26, 208(sp) + fsd f27, 216(sp) + /* iteration counter */ + mv x24, x13 + /* return address */ + sd x1, 0(sp) + /* literal pool */ + lla x3, literal_pool + /* load (ma, mx) */ + ld x25, 0(x11) + /* dataset ptr */ + ld x6, 8(x11) + /* load dataset mask */ + lwu x1, 92(x3) + /* zero registers r0-r3, load a0-a1 */ + li x16, 0 + fld f16, 192(x10) + li x17, 0 + fld f17, 200(x10) + srli x7, x25, 32 /* x7 = ma */ + li x18, 0 + fld f18, 208(x10) + mv x27, x7 /* x27 = ma */ + li x19, 0 + fld f19, 216(x10) + /* set dataset read address */ + and x7, x7, x1 + add x7, x7, x6 + /* zero registers r4-r7, load a2-a3 */ + li x20, 0 + fld f20, 224(x10) + li x21, 0 + fld f21, 232(x10) + li x22, 0 + fld f22, 240(x10) + li x23, 0 + fld f23, 248(x10) + /* load L3 mask */ + lwu x1, 88(x3) + /* load scratchpad masks */ + lwu x10, 80(x3) + lwu x11, 84(x3) + /* set spAddr0, spAddr1 */ + and x26, x25, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* FSCAL, E reg. 
masks */ + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + /* IMUL_RCP literals */ + fld f26, 176(x3) + fld f27, 184(x3) + fld f28, 192(x3) + fld f29, 200(x3) + fld f30, 208(x3) + fld f31, 216(x3) + +.balign 4 +DECL(randomx_riscv64_loop_begin): +loop_begin: + /* mix integer registers */ + ld x8, 0(x26) + ld x9, 8(x26) + ld x30, 16(x26) + ld x31, 24(x26) + xor x16, x16, x8 + ld x8, 32(x26) + xor x17, x17, x9 + ld x9, 40(x26) + xor x18, x18, x30 + ld x30, 48(x26) + xor x19, x19, x31 + ld x31, 56(x26) + xor x20, x20, x8 + lw x8, 0(x27) + xor x21, x21, x9 + lw x9, 4(x27) + xor x22, x22, x30 + lw x30, 8(x27) + xor x23, x23, x31 + lw x31, 12(x27) + /* load F registers */ + fcvt.d.w f0, x8 + lw x8, 16(x27) + fcvt.d.w f1, x9 + lw x9, 20(x27) + fcvt.d.w f2, x30 + lw x30, 24(x27) + fcvt.d.w f3, x31 + lw x31, 28(x27) + fcvt.d.w f4, x8 + lw x8, 32(x27) + fcvt.d.w f5, x9 + lw x9, 36(x27) + fcvt.d.w f6, x30 + lw x30, 40(x27) + fcvt.d.w f7, x31 + lw x31, 44(x27) + /* load E registers */ + fcvt.d.w f8, x8 + lw x8, 48(x27) + fcvt.d.w f9, x9 + lw x9, 52(x27) + fcvt.d.w f10, x30 + lw x30, 56(x27) + fcvt.d.w f11, x31 + lw x31, 60(x27) + fcvt.d.w f12, x8 + fmv.x.d x8, f8 + fcvt.d.w f13, x9 + fmv.x.d x9, f9 + fcvt.d.w f14, x30 + fmv.x.d x30, f10 + fcvt.d.w f15, x31 + fmv.x.d x31, f11 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f8, x8 + fmv.d.x f9, x9 + fmv.d.x f10, x30 + fmv.d.x f11, x31 + fmv.x.d x8, f12 + fmv.x.d x9, f13 + fmv.x.d x30, f14 + fmv.x.d x31, f15 + and x8, x8, x13 + and x9, x9, x13 + or x8, x8, x14 + or x9, x9, x15 + fmv.d.x f12, x8 + fmv.d.x f13, x9 + and x30, x30, x13 + and x31, x31, x13 + or x30, x30, x14 + or x31, x31, x15 + fmv.d.x f14, x30 + fmv.d.x f15, x31 + /* reload clobbered IMUL_RCP regs */ + ld x28, 144(x3) + ld x29, 152(x3) + ld x30, 160(x3) + ld x31, 168(x3) + +DECL(randomx_riscv64_data_read): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset mask */ + lwu x1, 92(x3) + /* zero-extend x8 */ +#ifdef __riscv_zba + zext.w x8, x8 +#else + slli x8, x8, 32 + srli x8, x8, 32 +#endif + /* update "mx" */ + xor x25, x25, x8 + /* read dataset and update registers */ + ld x8, 0(x7) + ld x9, 8(x7) + ld x30, 16(x7) + ld x31, 24(x7) + xor x16, x16, x8 + ld x8, 32(x7) + xor x17, x17, x9 + ld x9, 40(x7) + xor x18, x18, x30 + ld x30, 48(x7) + xor x19, x19, x31 + ld x31, 56(x7) + xor x20, x20, x8 + /* calculate the next dataset address */ + and x7, x25, x1 + xor x21, x21, x9 + add x7, x7, x6 + xor x22, x22, x30 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + xor x23, x23, x31 + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x9, x25, 32 + slli x25, x25, 32 + or x25, x25, x9 +#endif + +DECL(randomx_riscv64_data_read_light): + xor x8, x20, x22 /* JIT compiler will adjust the registers */ + /* load dataset offset */ + lui x9, 0x02000 /* JIT compiler will adjust the immediate */ + addi x9, x9, -64 + /* load dataset mask */ + lwu x1, 92(x3) + /* swap mx <-> ma */ +#ifdef __riscv_zbb + rori x25, x25, 32 +#else + srli x31, x25, 32 + slli x25, x25, 32 + or x25, x25, x31 +#endif + slli x8, x8, 32 + /* update "mx" */ + xor x25, x25, x8 + /* the next dataset item */ + and x7, x25, x1 + srli x7, x7, 6 + add x7, x7, x9 +DECL(randomx_riscv64_fix_loop_call): + jal superscalar_hash /* JIT compiler will adjust the offset */ + xor x16, x16, x8 + xor x17, x17, x9 + xor x18, x18, x10 + xor x19, x19, x11 
+ xor x20, x20, x12 + xor x21, x21, x13 + xor x22, x22, x14 + xor x23, x23, x15 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_spad_store): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* XOR and store f0,e0 */ + fmv.x.d x8, f0 + fmv.x.d x9, f8 + fmv.x.d x30, f1 + fmv.x.d x31, f9 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 0(x26) + fmv.d.x f0, x8 + sd x30, 8(x26) + fmv.d.x f1, x30 + /* XOR and store f1,e1 */ + fmv.x.d x8, f2 + fmv.x.d x9, f10 + fmv.x.d x30, f3 + fmv.x.d x31, f11 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 16(x26) + fmv.d.x f2, x8 + sd x30, 24(x26) + fmv.d.x f3, x30 + /* XOR and store f2,e2 */ + fmv.x.d x8, f4 + fmv.x.d x9, f12 + fmv.x.d x30, f5 + fmv.x.d x31, f13 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 32(x26) + fmv.d.x f4, x8 + sd x30, 40(x26) + fmv.d.x f5, x30 + /* XOR and store f3,e3 */ + fmv.x.d x8, f6 + fmv.x.d x9, f14 + fmv.x.d x30, f7 + fmv.x.d x31, f15 + xor x8, x8, x9 + xor x30, x30, x31 + sd x8, 48(x26) + fmv.d.x f6, x8 + sd x30, 56(x26) + fmv.d.x f7, x30 + +DECL(randomx_riscv64_spad_store_hardaes): + nop /* not implemented */ + +DECL(randomx_riscv64_spad_store_softaes): + /* store integer registers */ + sd x16, 0(x27) + sd x17, 8(x27) + sd x18, 16(x27) + sd x19, 24(x27) + sd x20, 32(x27) + sd x21, 40(x27) + sd x22, 48(x27) + sd x23, 56(x27) + /* process f0 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f0 + fmv.x.d x31, f1 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 0(x26) + fmv.d.x f0, x30 + sd x31, 8(x26) + fmv.d.x f1, x31 + /* process f1 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f2 + fmv.x.d x31, f3 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 16(x26) + fmv.d.x f2, x30 + sd x31, 24(x26) + fmv.d.x f3, x31 + /* process f2 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f4 + fmv.x.d x31, f5 + jal softaes_enc + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_enc + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_enc + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_enc + sd x30, 32(x26) + fmv.d.x f4, x30 + sd x31, 40(x26) + fmv.d.x f5, x31 + /* process f3 with 4 AES rounds */ + fmv.x.d x8, f8 + fmv.x.d x10, f9 + fmv.x.d x30, f6 + fmv.x.d x31, f7 + jal softaes_dec + fmv.x.d x8, f10 + fmv.x.d x10, f11 + jal softaes_dec + fmv.x.d x8, f12 + fmv.x.d x10, f13 + jal softaes_dec + fmv.x.d x8, f14 + fmv.x.d x10, f15 + jal softaes_dec + sd x30, 48(x26) + fmv.d.x f6, x30 + sd x31, 56(x26) + fmv.d.x f7, x31 + /* restore clobbered registers */ + lwu x10, 80(x3) + lwu x11, 84(x3) + ld x12, 96(x3) + ld x13, 104(x3) + ld x14, 112(x3) + ld x15, 120(x3) + +DECL(randomx_riscv64_loop_end): + xor x26, x16, x18 /* JIT compiler will adjust the registers */ + /* load L3 mask */ + lwu x1, 88(x3) + addi x24, x24, -1 + srli x27, x26, 32 + /* set spAddr0, spAddr1 */ + and x26, x26, x1 + and x27, x27, x1 + add x26, x26, x5 + add x27, x27, x5 + /* align L3 mask */ + addi x1, x1, 56 + /* conditional branch doesn't have sufficient range */ + j condition_check 
+DECL(randomx_riscv64_fix_continue_loop): +continue_loop: + .word 0 /* JIT compiler will write a jump to loop_begin */ +condition_check: + bnez x24, continue_loop + +DECL(randomx_riscv64_epilogue): + /* restore callee saved registers */ + ld x10, 8(sp) + ld x1, 0(sp) + ld x3, 16(sp) + ld x8, 32(sp) + ld x9, 40(sp) + ld x24, 96(sp) + ld x25, 104(sp) + ld x26, 112(sp) + ld x27, 120(sp) + fld f18, 144(sp) + fld f19, 152(sp) + fld f20, 160(sp) + fld f21, 168(sp) + fld f22, 176(sp) + fld f23, 184(sp) + fld f24, 192(sp) + fld f25, 200(sp) + fld f26, 208(sp) + fld f27, 216(sp) + /* save VM registers */ + sd x16, 0(x10) + sd x17, 8(x10) + sd x18, 16(x10) + sd x19, 24(x10) + sd x20, 32(x10) + sd x21, 40(x10) + sd x22, 48(x10) + sd x23, 56(x10) + fsd f0, 64(x10) + fsd f1, 72(x10) + fsd f2, 80(x10) + fsd f3, 88(x10) + fsd f4, 96(x10) + fsd f5, 104(x10) + fsd f6, 112(x10) + fsd f7, 120(x10) + fsd f8, 128(x10) + fsd f9, 136(x10) + fsd f10, 144(x10) + fsd f11, 152(x10) + fsd f12, 160(x10) + fsd f13, 168(x10) + fsd f14, 176(x10) + fsd f15, 184(x10) + /* restore callee saved registers */ + ld x18, 48(sp) + ld x19, 56(sp) + ld x20, 64(sp) + ld x21, 72(sp) + ld x22, 80(sp) + ld x23, 88(sp) + fld f8, 128(sp) + fld f9, 136(sp) + /* restore stack pointer */ + addi sp, sp, 224 + /* return */ + ret + +/* + Soft AES subroutines + in: + x3 = literal pool + x8, x10 = round key + x30, x31 = plaintext + out: + x30, x31 = ciphertext + clobbers: + x8-x11 (limbs) + x12-x13 (LUTs) + x14-x15 (temp) +*/ +DECL(randomx_riscv64_softaes): +softaes_enc: + /* enc. lookup table */ + ld x13, 128(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + zext.b x14, x30 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 3 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 5 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 7 */ + zext.b x15, x30 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 9 */ + zext.b x15, x31 + srli 
x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 10 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 11 */ + zext.b x15, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 13 */ + zext.b x15, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 15 */ + zext.b x15, x31 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x30, x30, x15 + + ret + +softaes_dec: + /* dec. lookup table */ + ld x13, 136(x3) + + /* load the round key into x8, x9, x10, x11 */ + srli x9, x8, 32 + srli x11, x10, 32 +#ifdef __riscv_zba + zext.w x8, x8 + zext.w x10, x10 +#else + slli x8, x8, 32 + slli x10, x10, 32 + srli x8, x8, 32 + srli x10, x10, 32 +#endif + + /* byte 0 */ + zext.b x14, x30 + srli x30, x30, 8 + addi x12, x13, -2048 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, -2048(x14) + + /* byte 1 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 2 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 3 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x10, x10, x14 + + /* byte 4 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 5 */ + zext.b x15, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + /* byte 6 */ + zext.b x14, x30 + srli x30, x30, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x10, x10, x15 + + /* byte 7 */ + zext.b x15, x30 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 8 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 9 */ + zext.b x15, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x10, x10, 
x14 + + /* byte 10 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x11, x11, x15 + + /* byte 11 */ + zext.b x15, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x8, x8, x14 + + /* byte 12 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x12 +#else + slli x14, x14, 2 + add x14, x14, x12 +#endif + lwu x14, 0(x14) + xor x9, x9, x15 + + /* byte 13 */ + zext.b x15, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x15, x15, x12 +#else + slli x15, x15, 2 + add x15, x15, x12 +#endif + lwu x15, 1024(x15) + xor x11, x11, x14 + + /* byte 14 */ + zext.b x14, x31 + srli x31, x31, 8 +#ifdef __riscv_zba + sh2add x14, x14, x13 +#else + slli x14, x14, 2 + add x14, x14, x13 +#endif + lwu x14, 0(x14) + xor x8, x8, x15 + + /* byte 15 */ + zext.b x15, x31 +#ifdef __riscv_zba + sh2add x15, x15, x13 +#else + slli x15, x15, 2 + add x15, x15, x13 +#endif + lwu x15, 1024(x15) + xor x9, x9, x14 + + slli x11, x11, 32 + slli x9, x9, 32 + or x30, x8, x9 + or x31, x10, x11 + xor x31, x31, x15 + + ret + +DECL(randomx_riscv64_program_end): + nop + + +/* literal pool for SuperscalarHash */ + /* space for remaining IMUL_RCP literals */ +ssh_literal_pool: + /* space for 256 IMUL_RCP literals */ + .fill 256,8,0 + +/* + SuperscalarHash subroutine + in: + x3 = literal pool + x6 = cache + x7 = itemNumber + out: + x8-x15 = 64-byte hash + clobbers: + x7, x28-x31 +*/ +DECL(randomx_riscv64_ssh_init): +superscalar_hash: + ld x30, 0(x3) /* superscalarMul0 */ + addi x8, x7, 1 + ld x9, 8(x3) + li x31, RANDOMX_CACHE_MASK + ld x10, 16(x3) + ld x11, 24(x3) + mul x8, x8, x30 + ld x12, 32(x3) + ld x13, 40(x3) + lla x30, ssh_literal_pool + ld x14, 48(x3) + and x7, x7, x31 + ld x15, 56(x3) + slli x7, x7, 6 + xor x9, x9, x8 + add x7, x7, x6 + xor x10, x10, x8 + /* load the first IMUL_RCP literal */ + ld x31, 2040(x30) + xor x11, x11, x8 + xor x12, x12, x8 + xor x13, x13, x8 + xor x14, x14, x8 + xor x15, x15, x8 + +DECL(randomx_riscv64_ssh_load): + ld x28, 0(x7) + ld x29, 8(x7) + xor x8, x8, x28 + ld x28, 16(x7) + xor x9, x9, x29 + ld x29, 24(x7) + xor x10, x10, x28 + ld x28, 32(x7) + xor x11, x11, x29 + ld x29, 40(x7) + xor x12, x12, x28 + ld x28, 48(x7) + xor x13, x13, x29 + ld x29, 56(x7) + xor x14, x14, x28 + li x7, RANDOMX_CACHE_MASK + xor x15, x15, x29 + +DECL(randomx_riscv64_ssh_prefetch): + and x7, x8, x7 /* JIT compiler will adjust the register */ + slli x7, x7, 6 + add x7, x7, x6 + /* prefetch - doesn't seem to have any effect */ + /* ld x0, 0(x7) */ + +DECL(randomx_riscv64_ssh_end): + nop diff --git a/src/jit_compiler_rv64_static.hpp b/src/jit_compiler_rv64_static.hpp new file mode 100644 index 00000000..656623c7 --- /dev/null +++ b/src/jit_compiler_rv64_static.hpp @@ -0,0 +1,53 @@ +/* +Copyright (c) 2023 tevador + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#pragma once + +extern "C" { + void randomx_riscv64_literals(); + void randomx_riscv64_literals_end(); + void randomx_riscv64_data_init(); + void randomx_riscv64_fix_data_call(); + void randomx_riscv64_prologue(); + void randomx_riscv64_loop_begin(); + void randomx_riscv64_data_read(); + void randomx_riscv64_data_read_light(); + void randomx_riscv64_fix_loop_call(); + void randomx_riscv64_spad_store(); + void randomx_riscv64_spad_store_hardaes(); + void randomx_riscv64_spad_store_softaes(); + void randomx_riscv64_loop_end(); + void randomx_riscv64_fix_continue_loop(); + void randomx_riscv64_epilogue(); + void randomx_riscv64_softaes(); + void randomx_riscv64_program_end(); + void randomx_riscv64_ssh_init(); + void randomx_riscv64_ssh_load(); + void randomx_riscv64_ssh_prefetch(); + void randomx_riscv64_ssh_end(); +} diff --git a/src/tests/riscv64_zba.s b/src/tests/riscv64_zba.s new file mode 100644 index 00000000..e1947e7a --- /dev/null +++ b/src/tests/riscv64_zba.s @@ -0,0 +1,9 @@ +/* RISC-V - test if the Zba extension is present */ + +.text +.global main + +main: + sh1add x6, x6, x7 + li x10, 0 + ret diff --git a/src/tests/riscv64_zbb.s b/src/tests/riscv64_zbb.s new file mode 100644 index 00000000..d922043f --- /dev/null +++ b/src/tests/riscv64_zbb.s @@ -0,0 +1,9 @@ +/* RISC-V - test if the Zbb extension is present */ + +.text +.global main + +main: + ror x6, x6, x7 + li x10, 0 + ret From f72101aa2c54cf15dba8f7aebb24fa5ce2961de1 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Thu, 19 Oct 2023 15:16:47 +0200 Subject: [PATCH 08/13] ARM64 JIT: don't use `x18` register --- src/jit_compiler_a64.cpp | 54 +++++++++---------- src/jit_compiler_a64_static.S | 98 +++++++++++++++++------------------ 2 files changed, 75 insertions(+), 77 deletions(-) diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index 91e31d64..0c557662 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -130,8 +130,8 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con // and w16, w10, ScratchpadL3Mask64 emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); - // and w17, w18, ScratchpadL3Mask64 - emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + // and w17, w20, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (20 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); codePos = PrologueSize; literalPos = ImulRcpLiteralsEnd; @@ -149,16 +149,16 @@ void JitCompilerA64::generateProgram(Program& program, 
ProgramConfiguration& con } // Update spMix2 - // eor w18, config.readReg2, config.readReg3 - emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + // eor w20, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); // Jump back to the main loop const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end) - ((uint8_t*)randomx_program_aarch64)) - codePos; emit32(ARMV8A::B | (offset / 4), code, codePos); - // and w18, w18, CacheLineAlignMask + // and w20, w20, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask1) - ((uint8_t*)randomx_program_aarch64)); - emit32(0x121A0000 | 18 | (18 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); + emit32(0x121A0000 | 20 | (20 << 5) | ((Log2(RANDOMX_DATASET_BASE_SIZE) - 7) << 10), code, codePos); // and w10, w10, CacheLineAlignMask codePos = (((uint8_t*)randomx_program_aarch64_cacheline_align_mask2) - ((uint8_t*)randomx_program_aarch64)); @@ -181,8 +181,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration // and w16, w10, ScratchpadL3Mask64 emit32(0x121A0000 | 16 | (10 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); - // and w17, w18, ScratchpadL3Mask64 - emit32(0x121A0000 | 17 | (18 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); + // and w17, w20, ScratchpadL3Mask64 + emit32(0x121A0000 | 17 | (20 << 5) | ((Log2(RANDOMX_SCRATCHPAD_L3) - 7) << 10), code, codePos); codePos = PrologueSize; literalPos = ImulRcpLiteralsEnd; @@ -200,8 +200,8 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration } // Update spMix2 - // eor w18, config.readReg2, config.readReg3 - emit32(ARMV8A::EOR32 | 18 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); + // eor w20, config.readReg2, config.readReg3 + emit32(ARMV8A::EOR32 | 20 | (IntRegMap[config.readReg2] << 5) | (IntRegMap[config.readReg3] << 16), code, codePos); // Jump back to the main loop const uint32_t offset = (((uint8_t*)randomx_program_aarch64_vm_instructions_end_light) - ((uint8_t*)randomx_program_aarch64)) - codePos; @@ -434,7 +434,7 @@ void JitCompilerA64::emitAddImmediate(uint32_t dst, uint32_t src, uint32_t imm, } else { - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMovImmediate(tmp_reg, imm, code, k); // add dst, src, tmp_reg @@ -483,7 +483,7 @@ void JitCompilerA64::emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* co uint32_t k = codePos; uint32_t imm = instr.getImm32(); - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 19; imm &= instr.getModMem() ? 
(RANDOMX_SCRATCHPAD_L1 - 1) : (RANDOMX_SCRATCHPAD_L2 - 1); emitAddImmediate(tmp_reg, src, imm, code, k); @@ -537,7 +537,7 @@ void JitCompilerA64::h_IADD_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // add dst, dst, tmp_reg @@ -575,7 +575,7 @@ void JitCompilerA64::h_ISUB_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // sub dst, dst, tmp_reg @@ -594,7 +594,7 @@ void JitCompilerA64::h_IMUL_R(Instruction& instr, uint32_t& codePos) if (src == dst) { - src = 18; + src = 20; emitMovImmediate(src, instr.getImm32(), code, k); } @@ -612,7 +612,7 @@ void JitCompilerA64::h_IMUL_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // sub dst, dst, tmp_reg @@ -643,7 +643,7 @@ void JitCompilerA64::h_IMULH_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // umulh dst, dst, tmp_reg @@ -674,7 +674,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // smulh dst, dst, tmp_reg @@ -692,7 +692,7 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) uint32_t k = codePos; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; const uint32_t dst = IntRegMap[instr.dst]; constexpr uint64_t N = 1ULL << 63; @@ -711,9 +711,9 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) literalPos -= sizeof(uint64_t); *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); - if (literal_id < 13) + if (literal_id < 12) { - static constexpr uint32_t literal_regs[13] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 20 << 16, 11 << 16, 0 }; + static constexpr uint32_t literal_regs[12] = { 30 << 16, 29 << 16, 28 << 16, 27 << 16, 26 << 16, 25 << 16, 24 << 16, 23 << 16, 22 << 16, 21 << 16, 11 << 16, 0 }; // mul dst, dst, literal_reg emit32(ARMV8A::MUL | dst | (dst << 5) | literal_regs[literal_id], code, k); @@ -751,7 +751,7 @@ void JitCompilerA64::h_IXOR_R(Instruction& instr, uint32_t& codePos) if (src == dst) { - src = 18; + src = 20; emitMovImmediate(src, instr.getImm32(), code, k); } @@ -769,7 +769,7 @@ void JitCompilerA64::h_IXOR_M(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emitMemLoad(dst, src, instr, code, k); // eor dst, dst, tmp_reg @@ -807,7 +807,7 @@ void JitCompilerA64::h_IROL_R(Instruction& instr, uint32_t& codePos) if (src != dst) { - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; // sub tmp_reg, xzr, src emit32(ARMV8A::SUB | tmp_reg | (31 << 5) | (src << 16), code, k); @@ 
-835,7 +835,7 @@ void JitCompilerA64::h_ISWAP_R(Instruction& instr, uint32_t& codePos) uint32_t k = codePos; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; emit32(ARMV8A::MOV_REG | tmp_reg | (dst << 16), code, k); emit32(ARMV8A::MOV_REG | dst | (src << 16), code, k); emit32(ARMV8A::MOV_REG | src | (tmp_reg << 16), code, k); @@ -984,7 +984,7 @@ void JitCompilerA64::h_CFROUND(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; constexpr uint32_t fpcr_tmp_reg = 8; // ror tmp_reg, src, imm @@ -1008,7 +1008,7 @@ void JitCompilerA64::h_ISTORE(Instruction& instr, uint32_t& codePos) const uint32_t src = IntRegMap[instr.src]; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint32_t tmp_reg = 18; + constexpr uint32_t tmp_reg = 20; uint32_t imm = instr.getImm32(); diff --git a/src/jit_compiler_a64_static.S b/src/jit_compiler_a64_static.S index 4886fcf3..bc146133 100644 --- a/src/jit_compiler_a64_static.S +++ b/src/jit_compiler_a64_static.S @@ -74,9 +74,9 @@ # x15 -> "r7" # x16 -> spAddr0 # x17 -> spAddr1 -# x18 -> temporary +# x18 -> unused (platform register, don't touch it) # x19 -> temporary -# x20 -> literal for IMUL_RCP +# x20 -> temporary # x21 -> literal for IMUL_RCP # x22 -> literal for IMUL_RCP # x23 -> literal for IMUL_RCP @@ -111,7 +111,7 @@ DECL(randomx_program_aarch64): # Save callee-saved registers sub sp, sp, 192 stp x16, x17, [sp] - stp x18, x19, [sp, 16] + str x19, [sp, 16] stp x20, x21, [sp, 32] stp x22, x23, [sp, 48] stp x24, x25, [sp, 64] @@ -166,7 +166,6 @@ DECL(randomx_program_aarch64): # Read literals ldr x0, literal_x0 ldr x11, literal_x11 - ldr x20, literal_x20 ldr x21, literal_x21 ldr x22, literal_x22 ldr x23, literal_x23 @@ -198,11 +197,11 @@ DECL(randomx_program_aarch64): DECL(randomx_program_aarch64_main_loop): # spAddr0 = spMix1 & ScratchpadL3Mask64; # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64; - lsr x18, x10, 32 + lsr x20, x10, 32 # Actual mask will be inserted by JIT compiler and w16, w10, 1 - and w17, w18, 1 + and w17, w20, 1 # x16 = scratchpad + spAddr0 # x17 = scratchpad + spAddr1 @@ -210,31 +209,31 @@ DECL(randomx_program_aarch64_main_loop): add x17, x17, x2 # xor integer registers with scratchpad data (spAddr0) - ldp x18, x19, [x16] - eor x4, x4, x18 + ldp x20, x19, [x16] + eor x4, x4, x20 eor x5, x5, x19 - ldp x18, x19, [x16, 16] - eor x6, x6, x18 + ldp x20, x19, [x16, 16] + eor x6, x6, x20 eor x7, x7, x19 - ldp x18, x19, [x16, 32] - eor x12, x12, x18 + ldp x20, x19, [x16, 32] + eor x12, x12, x20 eor x13, x13, x19 - ldp x18, x19, [x16, 48] - eor x14, x14, x18 + ldp x20, x19, [x16, 48] + eor x14, x14, x20 eor x15, x15, x19 # Load group F registers (spAddr1) - ldpsw x18, x19, [x17] - ins v16.d[0], x18 + ldpsw x20, x19, [x17] + ins v16.d[0], x20 ins v16.d[1], x19 - ldpsw x18, x19, [x17, 8] - ins v17.d[0], x18 + ldpsw x20, x19, [x17, 8] + ins v17.d[0], x20 ins v17.d[1], x19 - ldpsw x18, x19, [x17, 16] - ins v18.d[0], x18 + ldpsw x20, x19, [x17, 16] + ins v18.d[0], x20 ins v18.d[1], x19 - ldpsw x18, x19, [x17, 24] - ins v19.d[0], x18 + ldpsw x20, x19, [x17, 24] + ins v19.d[0], x20 ins v19.d[1], x19 scvtf v16.2d, v16.2d scvtf v17.2d, v17.2d @@ -242,17 +241,17 @@ DECL(randomx_program_aarch64_main_loop): scvtf v19.2d, v19.2d # Load group E registers (spAddr1) - ldpsw x18, x19, [x17, 32] - ins v20.d[0], x18 + ldpsw x20, x19, [x17, 32] + ins v20.d[0], x20 ins v20.d[1], x19 - ldpsw x18, x19, [x17, 40] - ins v21.d[0], x18 + ldpsw x20, x19, 
[x17, 40] + ins v21.d[0], x20 ins v21.d[1], x19 - ldpsw x18, x19, [x17, 48] - ins v22.d[0], x18 + ldpsw x20, x19, [x17, 48] + ins v22.d[0], x20 ins v22.d[1], x19 - ldpsw x18, x19, [x17, 56] - ins v23.d[0], x18 + ldpsw x20, x19, [x17, 56] + ins v23.d[0], x20 ins v23.d[1], x19 scvtf v20.2d, v20.2d scvtf v21.2d, v21.2d @@ -276,7 +275,6 @@ DECL(randomx_program_aarch64_vm_instructions): literal_x0: .fill 1,8,0 literal_x11: .fill 1,8,0 -literal_x20: .fill 1,8,0 literal_x21: .fill 1,8,0 literal_x22: .fill 1,8,0 literal_x23: .fill 1,8,0 @@ -312,17 +310,17 @@ DECL(randomx_program_aarch64_vm_instructions_end): lsr x10, x9, 32 # mx ^= r[readReg2] ^ r[readReg3]; - eor x9, x9, x18 + eor x9, x9, x20 # Calculate dataset pointer for dataset prefetch - mov w18, w9 + mov w20, w9 DECL(randomx_program_aarch64_cacheline_align_mask1): # Actual mask will be inserted by JIT compiler - and x18, x18, 1 - add x18, x18, x1 + and x20, x20, 1 + add x20, x20, x1 # Prefetch dataset data - prfm pldl2strm, [x18] + prfm pldl2strm, [x20] # mx <-> ma ror x9, x9, 32 @@ -335,17 +333,17 @@ DECL(randomx_program_aarch64_cacheline_align_mask2): DECL(randomx_program_aarch64_xor_with_dataset_line): rx_program_xor_with_dataset_line: # xor integer registers with dataset data - ldp x18, x19, [x10] - eor x4, x4, x18 + ldp x20, x19, [x10] + eor x4, x4, x20 eor x5, x5, x19 - ldp x18, x19, [x10, 16] - eor x6, x6, x18 + ldp x20, x19, [x10, 16] + eor x6, x6, x20 eor x7, x7, x19 - ldp x18, x19, [x10, 32] - eor x12, x12, x18 + ldp x20, x19, [x10, 32] + eor x12, x12, x20 eor x13, x13, x19 - ldp x18, x19, [x10, 48] - eor x14, x14, x18 + ldp x20, x19, [x10, 48] + eor x14, x14, x20 eor x15, x15, x19 DECL(randomx_program_aarch64_update_spMix1): @@ -388,7 +386,7 @@ DECL(randomx_program_aarch64_update_spMix1): # Restore callee-saved registers ldp x16, x17, [sp] - ldp x18, x19, [sp, 16] + ldr x19, [sp, 16] ldp x20, x21, [sp, 32] ldp x22, x23, [sp, 48] ldp x24, x25, [sp, 64] @@ -409,7 +407,7 @@ DECL(randomx_program_aarch64_vm_instructions_end_light): stp x2, x30, [sp, 80] # mx ^= r[readReg2] ^ r[readReg3]; - eor x9, x9, x18 + eor x9, x9, x20 # mx <-> ma ror x9, x9, 32 @@ -451,8 +449,8 @@ DECL(randomx_program_aarch64_light_dataset_offset): # x3 -> end item DECL(randomx_init_dataset_aarch64): - # Save x30 (return address) - str x30, [sp, -16]! + # Save x20 (used as temporary, but must be saved to not break ABI) and x30 (return address) + stp x20, x30, [sp, -16]! 
# Load pointer to cache memory ldr x0, [x0] @@ -464,8 +462,8 @@ DECL(randomx_init_dataset_aarch64_main_loop): cmp x2, x3 bne DECL(randomx_init_dataset_aarch64_main_loop) - # Restore x30 (return address) - ldr x30, [sp], 16 + # Restore x20 and x30 + ldp x20, x30, [sp], 16 ret From 8f91d31b8b055bbece3f88b9d88cfe8c9cfc8133 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Fri, 20 Oct 2023 09:04:35 +0200 Subject: [PATCH 09/13] Fixed UB in ARM64 JIT compiler Fixed unaligned memory writes --- src/jit_compiler_a64.cpp | 3 ++- src/jit_compiler_a64.hpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index 0c557662..75ea8ccd 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -709,7 +709,8 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t); literalPos -= sizeof(uint64_t); - *(uint64_t*)(code + literalPos) = (q << shift) + ((r << shift) / divisor); + const uint64_t randomx_reciprocal = (q << shift) + ((r << shift) / divisor); + memcpy(code + literalPos, &randomx_reciprocal, sizeof(randomx_reciprocal)); if (literal_id < 12) { diff --git a/src/jit_compiler_a64.hpp b/src/jit_compiler_a64.hpp index a02824ff..f8484c08 100644 --- a/src/jit_compiler_a64.hpp +++ b/src/jit_compiler_a64.hpp @@ -81,7 +81,7 @@ namespace randomx { static void emit64(uint64_t val, uint8_t* code, uint32_t& codePos) { - *(uint64_t*)(code + codePos) = val; + memcpy(code + codePos, &val, sizeof(val)); codePos += sizeof(val); } From 5c49ab12a071df4d0fd4b5f0d91c4c0c3180b74d Mon Sep 17 00:00:00 2001 From: SChernykh Date: Fri, 20 Oct 2023 10:54:25 +0200 Subject: [PATCH 10/13] Optimized randomx_reciprocal Also limited it to 32 bit because it's supposed to work only with 32-bit values, according to the specs. 
--- src/assembly_generator_x86.cpp | 2 +- src/bytecode_machine.cpp | 2 +- src/jit_compiler_a64.cpp | 19 ++++--------------- src/jit_compiler_rv64.cpp | 2 +- src/jit_compiler_x86.cpp | 2 +- src/reciprocal.c | 34 +++++++++++++--------------------- src/reciprocal.h | 4 ++-- src/tests/perf-simulation.cpp | 2 +- 8 files changed, 24 insertions(+), 43 deletions(-) diff --git a/src/assembly_generator_x86.cpp b/src/assembly_generator_x86.cpp index e7e5258b..1ce31dd5 100644 --- a/src/assembly_generator_x86.cpp +++ b/src/assembly_generator_x86.cpp @@ -445,7 +445,7 @@ namespace randomx { } void AssemblyGeneratorX86::h_IMUL_RCP(Instruction& instr, int i) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { registerUsage[instr.dst] = i; asmCode << "\tmov rax, " << randomx_reciprocal(divisor) << std::endl; diff --git a/src/bytecode_machine.cpp b/src/bytecode_machine.cpp index 7d8e902d..1d00d095 100644 --- a/src/bytecode_machine.cpp +++ b/src/bytecode_machine.cpp @@ -243,7 +243,7 @@ namespace randomx { } if (opcode < ceil_IMUL_RCP) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { auto dst = instr.dst % RegistersCount; ibc.type = InstructionType::IMUL_R; diff --git a/src/jit_compiler_a64.cpp b/src/jit_compiler_a64.cpp index 75ea8ccd..5be8f6e4 100644 --- a/src/jit_compiler_a64.cpp +++ b/src/jit_compiler_a64.cpp @@ -686,7 +686,7 @@ void JitCompilerA64::h_ISMULH_M(Instruction& instr, uint32_t& codePos) void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) { - const uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (isZeroOrPowerOf2(divisor)) return; @@ -695,22 +695,11 @@ void JitCompilerA64::h_IMUL_RCP(Instruction& instr, uint32_t& codePos) constexpr uint32_t tmp_reg = 20; const uint32_t dst = IntRegMap[instr.dst]; - constexpr uint64_t N = 1ULL << 63; - const uint64_t q = N / divisor; - const uint64_t r = N % divisor; -#ifdef __GNUC__ - const uint64_t shift = 64 - __builtin_clzll(divisor); -#else - uint64_t shift = 32; - for (uint64_t k = 1U << 31; (k & divisor) == 0; k >>= 1) - --shift; -#endif - const uint32_t literal_id = (ImulRcpLiteralsEnd - literalPos) / sizeof(uint64_t); - literalPos -= sizeof(uint64_t); - const uint64_t randomx_reciprocal = (q << shift) + ((r << shift) / divisor); - memcpy(code + literalPos, &randomx_reciprocal, sizeof(randomx_reciprocal)); + + const uint64_t reciprocal = randomx_reciprocal_fast(divisor); + memcpy(code + literalPos, &reciprocal, sizeof(reciprocal)); if (literal_id < 12) { diff --git a/src/jit_compiler_rv64.cpp b/src/jit_compiler_rv64.cpp index 301c294c..6f0842e5 100644 --- a/src/jit_compiler_rv64.cpp +++ b/src/jit_compiler_rv64.cpp @@ -776,7 +776,7 @@ namespace randomx { } static void v1_IMUL_RCP(HANDLER_ARGS) { - uint64_t divisor = isn.getImm32(); + const uint32_t divisor = isn.getImm32(); if (!isZeroOrPowerOf2(divisor)) { state.registerUsage[isn.dst] = i; if (state.rcpCount < 4) { diff --git a/src/jit_compiler_x86.cpp b/src/jit_compiler_x86.cpp index 96c6492f..785ce5f5 100644 --- a/src/jit_compiler_x86.cpp +++ b/src/jit_compiler_x86.cpp @@ -618,7 +618,7 @@ namespace randomx { } void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!isZeroOrPowerOf2(divisor)) { registerUsage[instr.dst] = i; emit(MOV_RAX_I); diff --git a/src/reciprocal.c b/src/reciprocal.c index 22620f53..074d1846 100644 
--- a/src/reciprocal.c +++ b/src/reciprocal.c @@ -44,36 +44,28 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ret */ -uint64_t randomx_reciprocal(uint64_t divisor) { +uint64_t randomx_reciprocal(uint32_t divisor) { assert(divisor != 0); const uint64_t p2exp63 = 1ULL << 63; + const uint64_t q = p2exp63 / divisor; + const uint64_t r = p2exp63 % divisor; + +#ifdef __GNUC__ + const uint32_t shift = 64 - __builtin_clzll(divisor); +#else + uint32_t shift = 32; + for (uint32_t k = 1U << 31; (k & divisor) == 0; k >>= 1) + --shift; +#endif - uint64_t quotient = p2exp63 / divisor, remainder = p2exp63 % divisor; - - unsigned bsr = 0; //highest set bit in divisor - - for (uint64_t bit = divisor; bit > 0; bit >>= 1) - bsr++; - - for (unsigned shift = 0; shift < bsr; shift++) { - if (remainder >= divisor - remainder) { - quotient = quotient * 2 + 1; - remainder = remainder * 2 - divisor; - } - else { - quotient = quotient * 2; - remainder = remainder * 2; - } - } - - return quotient; + return (q << shift) + ((r << shift) / divisor); } #if !RANDOMX_HAVE_FAST_RECIPROCAL -uint64_t randomx_reciprocal_fast(uint64_t divisor) { +uint64_t randomx_reciprocal_fast(uint32_t divisor) { return randomx_reciprocal(divisor); } diff --git a/src/reciprocal.h b/src/reciprocal.h index 8858df2b..90bd9b6b 100644 --- a/src/reciprocal.h +++ b/src/reciprocal.h @@ -40,8 +40,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. extern "C" { #endif -uint64_t randomx_reciprocal(uint64_t); -uint64_t randomx_reciprocal_fast(uint64_t); +uint64_t randomx_reciprocal(uint32_t); +uint64_t randomx_reciprocal_fast(uint32_t); #if defined(__cplusplus) } diff --git a/src/tests/perf-simulation.cpp b/src/tests/perf-simulation.cpp index 1068a40e..27f34d8c 100644 --- a/src/tests/perf-simulation.cpp +++ b/src/tests/perf-simulation.cpp @@ -477,7 +477,7 @@ int analyze(randomx::Program& p) { } if (opcode < randomx::ceil_IMUL_RCP) { - uint64_t divisor = instr.getImm32(); + const uint32_t divisor = instr.getImm32(); if (!randomx::isZeroOrPowerOf2(divisor)) { instr.dst = instr.dst % randomx::RegistersCount; instr.opcode |= DST_INT; From 06a7cc1c3346609ebce92f91811c2974df08f474 Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 20 Oct 2023 16:16:51 +0200 Subject: [PATCH 11/13] Update README and benchmark version --- README.md | 5 +++-- src/tests/benchmark.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4c1dabb6..2c9bdd31 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ RandomX is written in C++11 and builds a static library with a C API provided by ### Linux -Build dependencies: `cmake` (minimum 2.8.7) and `gcc` (minimum version 4.8, but version 7+ is recommended). +Build dependencies: `cmake` (minimum 3.5) and `gcc` (minimum version 4.8, but version 7+ is recommended). To build optimized binaries for your machine, run: ``` @@ -82,7 +82,7 @@ Intel Core i7-8550U|16G DDR4-2400|Windows 10|hw|200 (4T)|1700 (4T)|350 (8T)| Intel Core i3-3220|4G DDR3-1333|Ubuntu 16.04|soft|42 (4T)|510 (4T)|150 (4T)| Raspberry Pi 3|1G LPDDR2|Ubuntu 16.04|soft|3.5 (4T)|-|20 (4T)| -Note that RandomX currently includes a JIT compiler for x86-64 and ARM64. Other architectures have to use the portable interpreter, which is much slower. +Note that RandomX currently includes a JIT compiler for x86-64, ARM64 and RISCV64. Other architectures have to use the portable interpreter, which is much slower. 
### GPU performance @@ -129,6 +129,7 @@ The reference implementation has been validated on the following platforms: * ARMv7+VFPv3 (32-bit, little-endian) * ARMv8 (64-bit, little-endian) * PPC64 (64-bit, big-endian) +* RISCV64 (64-bit, little-endian) ### Can FPGAs mine RandomX? diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index d25d0c2c..627b0d42 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -180,7 +180,7 @@ int main(int argc, char** argv) { store32(&seed, seedValue); - std::cout << "RandomX benchmark v1.1.12" << std::endl; + std::cout << "RandomX benchmark v1.2.0" << std::endl; if (help) { printUsage(argv[0]); From e895d451a3dff110dc10e378d31e3ea507a9006c Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 20 Oct 2023 19:29:35 +0200 Subject: [PATCH 12/13] Avoid `zext.b` --- src/jit_compiler_rv64_static.S | 64 +++++++++++++++++----------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/src/jit_compiler_rv64_static.S b/src/jit_compiler_rv64_static.S index 5ecb4815..240bbf5f 100644 --- a/src/jit_compiler_rv64_static.S +++ b/src/jit_compiler_rv64_static.S @@ -745,7 +745,7 @@ softaes_enc: #endif /* byte 0 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 addi x12, x13, -2048 #ifdef __riscv_zba @@ -757,7 +757,7 @@ softaes_enc: lwu x14, -2048(x14) /* byte 1 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -769,7 +769,7 @@ softaes_enc: xor x8, x8, x14 /* byte 2 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -781,7 +781,7 @@ softaes_enc: xor x11, x11, x15 /* byte 3 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x13 @@ -793,7 +793,7 @@ softaes_enc: xor x10, x10, x14 /* byte 4 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -805,7 +805,7 @@ softaes_enc: xor x9, x9, x15 /* byte 5 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -817,7 +817,7 @@ softaes_enc: xor x9, x9, x14 /* byte 6 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -829,7 +829,7 @@ softaes_enc: xor x8, x8, x15 /* byte 7 */ - zext.b x15, x30 + andi x15, x30, 255 #ifdef __riscv_zba sh2add x15, x15, x13 #else @@ -840,7 +840,7 @@ softaes_enc: xor x11, x11, x14 /* byte 8 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -852,7 +852,7 @@ softaes_enc: xor x10, x10, x15 /* byte 9 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -864,7 +864,7 @@ softaes_enc: xor x10, x10, x14 /* byte 10 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -876,7 +876,7 @@ softaes_enc: xor x9, x9, x15 /* byte 11 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x13 @@ -888,7 +888,7 @@ softaes_enc: xor x8, x8, x14 /* byte 12 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -900,7 +900,7 @@ softaes_enc: xor x11, x11, x15 /* byte 13 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -912,7 +912,7 @@ softaes_enc: xor x11, x11, x14 /* byte 14 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -924,7 +924,7 @@ softaes_enc: xor x10, x10, x15 
/* byte 15 */ - zext.b x15, x31 + andi x15, x31, 255 #ifdef __riscv_zba sh2add x15, x15, x13 #else @@ -960,7 +960,7 @@ softaes_dec: #endif /* byte 0 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 addi x12, x13, -2048 #ifdef __riscv_zba @@ -972,7 +972,7 @@ softaes_dec: lwu x14, -2048(x14) /* byte 1 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -984,7 +984,7 @@ softaes_dec: xor x8, x8, x14 /* byte 2 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -996,7 +996,7 @@ softaes_dec: xor x9, x9, x15 /* byte 3 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x13 @@ -1008,7 +1008,7 @@ softaes_dec: xor x10, x10, x14 /* byte 4 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -1020,7 +1020,7 @@ softaes_dec: xor x11, x11, x15 /* byte 5 */ - zext.b x15, x30 + andi x15, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -1032,7 +1032,7 @@ softaes_dec: xor x9, x9, x14 /* byte 6 */ - zext.b x14, x30 + andi x14, x30, 255 srli x30, x30, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -1044,7 +1044,7 @@ softaes_dec: xor x10, x10, x15 /* byte 7 */ - zext.b x15, x30 + andi x15, x30, 255 #ifdef __riscv_zba sh2add x15, x15, x13 #else @@ -1055,7 +1055,7 @@ softaes_dec: xor x11, x11, x14 /* byte 8 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -1067,7 +1067,7 @@ softaes_dec: xor x8, x8, x15 /* byte 9 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -1079,7 +1079,7 @@ softaes_dec: xor x10, x10, x14 /* byte 10 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -1091,7 +1091,7 @@ softaes_dec: xor x11, x11, x15 /* byte 11 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x13 @@ -1103,7 +1103,7 @@ softaes_dec: xor x8, x8, x14 /* byte 12 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x12 @@ -1115,7 +1115,7 @@ softaes_dec: xor x9, x9, x15 /* byte 13 */ - zext.b x15, x31 + andi x15, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x15, x15, x12 @@ -1127,7 +1127,7 @@ softaes_dec: xor x11, x11, x14 /* byte 14 */ - zext.b x14, x31 + andi x14, x31, 255 srli x31, x31, 8 #ifdef __riscv_zba sh2add x14, x14, x13 @@ -1139,7 +1139,7 @@ softaes_dec: xor x8, x8, x15 /* byte 15 */ - zext.b x15, x31 + andi x15, x31, 255 #ifdef __riscv_zba sh2add x15, x15, x13 #else From 102f8acf90a7649ada410de5499a7ec62e49e1da Mon Sep 17 00:00:00 2001 From: tevador Date: Fri, 20 Oct 2023 20:42:55 +0200 Subject: [PATCH 13/13] bump benchmark version to 1.2.1 --- src/tests/benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/benchmark.cpp b/src/tests/benchmark.cpp index 627b0d42..148521a5 100644 --- a/src/tests/benchmark.cpp +++ b/src/tests/benchmark.cpp @@ -180,7 +180,7 @@ int main(int argc, char** argv) { store32(&seed, seedValue); - std::cout << "RandomX benchmark v1.2.0" << std::endl; + std::cout << "RandomX benchmark v1.2.1" << std::endl; if (help) { printUsage(argv[0]);
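
Note on [PATCH 12/13] "Avoid `zext.b`" above: `zext.b rd, rs` is only an assembler alias that expands to the base-ISA instruction `andi rd, rs, 255`, so the substitution does not change the emitted machine code; presumably it was made so that jit_compiler_rv64_static.S also assembles with older binutils/clang releases that do not recognize the alias. A two-line illustration (RISC-V assembly, not part of the patch):

    zext.b x14, x30        /* pseudoinstruction; requires a recent assembler */
    andi   x14, x30, 255   /* same encoding, accepted by any RV64I assembler */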