diff --git a/.ci/linux-x64-cpu-gcc.yml b/.ci/linux-x64-cpu-gcc.yml
index 4f138d9d080..f0bf4ce1ae1 100644
--- a/.ci/linux-x64-cpu-gcc.yml
+++ b/.ci/linux-x64-cpu-gcc.yml
@@ -117,3 +117,11 @@ jobs:
         cmake --build . -j $(nproc)
     - name: test-simplestl-simpleomp
       run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc)
+    - name: build-simplestl-simplemath
+      run: |
+        mkdir build-simplestl-simplemath && cd build-simplestl-simplemath
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
+        cmake --build . -j $(nproc)
+    - name: test-simplestl-simplemath
+      run: cd build-simplestl-simplemath && ctest --output-on-failure -j $(nproc)
+
diff --git a/.ci/test-coverage.yml b/.ci/test-coverage.yml
index a693f415883..1c5e72edc7c 100644
--- a/.ci/test-coverage.yml
+++ b/.ci/test-coverage.yml
@@ -908,3 +908,47 @@ jobs:
         lcov --list lcov.info
     - name: codecov
       run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info
+
+  linux-gcc-x64-simplemath:
+    name: linux-gcc-x64-simplemath
+
+    runs-on:
+      pool-name: docker
+      container:
+        image: bkci/ci:ubuntu
+    steps:
+    - name: checkout
+      checkout: self
+      with:
+        strategy: FRESH_CHECKOUT
+        enableSubmodule: false
+        enableGitLfs: false
+
+    - name: install-deps
+      run: |
+        apt-get update
+        apt-get install -y lcov
+        curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import
+        curl -Os https://uploader.codecov.io/latest/linux/codecov
+        curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM
+        curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig
+        gpgv codecov.SHA256SUM.sig codecov.SHA256SUM
+        shasum -a 256 -c codecov.SHA256SUM
+        chmod +x codecov
+
+    - name: build
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
+        cmake --build . -j $(nproc)
+    - name: test
+      run: cd build && ctest --output-on-failure -j $(nproc)
+    - name: lcov-collect
+      run: |
+        cd build
+        lcov -d ./src -c -o lcov.info
+        lcov -r lcov.info '/usr/*' -o lcov.info
+        lcov -r lcov.info '*/build/*' -o lcov.info
+        lcov --list lcov.info
+    - name: codecov
+      run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info
diff --git a/.github/workflows/linux-aarch64-cpu-gcc.yml b/.github/workflows/linux-aarch64-cpu-gcc.yml
index 46179097aec..a791da6c26a 100644
--- a/.github/workflows/linux-aarch64-cpu-gcc.yml
+++ b/.github/workflows/linux-aarch64-cpu-gcc.yml
@@ -86,6 +86,17 @@ jobs:
         export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
         cd build-noint8
         TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
+
+    - name: build-simplestl-simplemath
+      run: |
+        mkdir build-simplestl-simplemath && cd build-simplestl-simplemath
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu-c.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
+        cmake --build . -j 2
+    - name: test-simplestl-simplemath
+      run: |
+        export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH
+        cd build-simplestl-simplemath
+        TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2
 
   linux-gcc-arm82:
     runs-on: ubuntu-20.04
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 35a586ecda2..b6907207444 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,7 @@ option(NCNN_INSTALL_SDK "install ncnn library and headers" ON)
 option(NCNN_SIMPLEOCV "minimal opencv structure emulation" OFF)
 option(NCNN_SIMPLEOMP "minimal openmp runtime emulation" OFF)
 option(NCNN_SIMPLESTL "minimal cpp stl structure emulation" OFF)
+option(NCNN_SIMPLEMATH "minimal cmath" OFF)
 option(NCNN_THREADS "build with threads" ON)
 option(NCNN_BENCHMARK "print benchmark information for every layer" OFF)
 option(NCNN_C_API "build with C api" ON)
diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp
index 3155396fbaa..df2e8d37b94 100644
--- a/benchmark/benchncnn.cpp
+++ b/benchmark/benchncnn.cpp
@@ -25,7 +25,10 @@
 #include "datareader.h"
 #include "net.h"
 #include "gpu.h"
+
+#if !NCNN_SIMPLESTL // value test: correct whether the macro is undefined, 0 or 1
 #include <vector>
+#endif // !NCNN_SIMPLESTL
 
 class DataReaderFromEmpty : public ncnn::DataReader
 {
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 09f1b8ff48d..48154614d0a 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -39,6 +39,7 @@ set(ncnn_SRCS
     simpleocv.cpp
     simpleomp.cpp
     simplestl.cpp
+    simplemath.cpp
 )
 
 if(ANDROID)
@@ -207,7 +208,7 @@ if(NOT NCNN_SHARED_LIB)
     set_target_properties(ncnn PROPERTIES COMPILE_FLAGS -DNCNN_STATIC_DEFINE)
 endif()
 
-if(NCNN_SIMPLESTL)
+if(NCNN_SIMPLESTL AND NOT NCNN_SIMPLEMATH)
     # link math lib explicitly
     target_link_libraries(ncnn PUBLIC m)
 endif()
@@ -260,7 +261,6 @@ if(NCNN_THREADS)
     if(TARGET Threads::Threads)
         target_link_libraries(ncnn PUBLIC Threads::Threads)
     endif()
-
     if(NCNN_SIMPLEOMP OR NCNN_SIMPLESTL)
         target_link_libraries(ncnn PUBLIC pthread)
     endif()
@@ -580,6 +580,7 @@ if(NCNN_INSTALL_SDK)
         simpleocv.h
         simpleomp.h
         simplestl.h
+        simplemath.h
         vulkan_header_fix.h
         ${CMAKE_CURRENT_BINARY_DIR}/ncnn_export.h
         ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_type_enum.h
@@ -598,5 +599,4 @@ endif()
 # add ncnn and generate-spirv to a virtual project group
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 set_property(TARGET ncnn PROPERTY FOLDER "libncnn")
 set_property(TARGET ncnn-generate-spirv PROPERTY FOLDER "libncnn")
-
diff --git a/src/gpu.cpp b/src/gpu.cpp
index f32f6e20a67..72ca65bc620 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -16,7 +16,6 @@
 
 #if NCNN_VULKAN
 
-#include <math.h>
 #include <string.h>
 #include <vulkan/vulkan.h>
 
diff --git a/src/layer.cpp b/src/layer.cpp
index a4f73a5c082..562576a5493 100644
--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -16,7 +16,6 @@
 
 #include "cpu.h"
 
-#include <math.h>
 #include <string.h>
 
 #ifdef _MSC_VER
diff --git a/src/layer.h b/src/layer.h
index ae4a8430d84..f0418a9ffcd 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -21,8 +21,6 @@
 #include "paramdict.h"
 #include "platform.h"
 
-#include <math.h>
-
 #if NCNN_VULKAN
 #include "command.h"
 #include "pipeline.h"
diff --git a/src/layer/arm/binaryop_arm.cpp b/src/layer/arm/binaryop_arm.cpp
index 25bfeb55557..55fb165911e 100644
--- a/src/layer/arm/binaryop_arm.cpp
+++ b/src/layer/arm/binaryop_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "binaryop_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/binaryop_arm_asimdhp.cpp b/src/layer/arm/binaryop_arm_asimdhp.cpp
index 9d4e9b94f7c..b9a8ea2d00b 100644
--- a/src/layer/arm/binaryop_arm_asimdhp.cpp
+++ b/src/layer/arm/binaryop_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "binaryop_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/cast_arm_bf16.cpp b/src/layer/arm/cast_arm_bf16.cpp
index aaaec09f968..358b9a9d2af 100644
--- a/src/layer/arm/cast_arm_bf16.cpp
+++ b/src/layer/arm/cast_arm_bf16.cpp
@@ -14,7 +14,7 @@
 
 #include "cpu.h"
 #include "mat.h"
-#include <math.h>
+
 namespace ncnn {
 
 #include "cast_bf16.h"
diff --git a/src/layer/arm/gelu_arm.cpp b/src/layer/arm/gelu_arm.cpp
index 3ae329a3a28..80d4efba0cb 100644
--- a/src/layer/arm/gelu_arm.cpp
+++ b/src/layer/arm/gelu_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "gelu_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/gelu_arm_asimdhp.cpp b/src/layer/arm/gelu_arm_asimdhp.cpp
index 78514dbc042..ea8b159cfa8 100644
--- a/src/layer/arm/gelu_arm_asimdhp.cpp
+++ b/src/layer/arm/gelu_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "gelu_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/gru_arm.cpp b/src/layer/arm/gru_arm.cpp
index aa927d26a58..70df351a555 100644
--- a/src/layer/arm/gru_arm.cpp
+++ b/src/layer/arm/gru_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "gru_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/gru_arm_asimdhp.cpp b/src/layer/arm/gru_arm_asimdhp.cpp
index f5e74b50284..ae657fc301b 100644
--- a/src/layer/arm/gru_arm_asimdhp.cpp
+++ b/src/layer/arm/gru_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "gru_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/innerproduct_arm.h b/src/layer/arm/innerproduct_arm.h
index 1eff44c7b1d..f1eee178f9c 100644
--- a/src/layer/arm/innerproduct_arm.h
+++ b/src/layer/arm/innerproduct_arm.h
@@ -16,8 +16,6 @@
 #define LAYER_INNERPRODUCT_ARM_H
 
 #include "innerproduct.h"
-#include <cmath>
-#include <cstdlib>
 
 namespace ncnn {
 
diff --git a/src/layer/arm/interp_arm.cpp b/src/layer/arm/interp_arm.cpp
index 1ee97d57996..191499aa26b 100644
--- a/src/layer/arm/interp_arm.cpp
+++ b/src/layer/arm/interp_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "interp_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/interp_arm_asimdhp.cpp b/src/layer/arm/interp_arm_asimdhp.cpp
index c9bf14b1077..286c74fe40c 100644
--- a/src/layer/arm/interp_arm_asimdhp.cpp
+++ b/src/layer/arm/interp_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "interp_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/lrn_arm.cpp b/src/layer/arm/lrn_arm.cpp
index fdc05c3f952..f763bfb2a2f 100644
--- a/src/layer/arm/lrn_arm.cpp
+++ b/src/layer/arm/lrn_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "lrn_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/lstm_arm.cpp b/src/layer/arm/lstm_arm.cpp
index 79a0c97c917..04d7277547e 100644
--- a/src/layer/arm/lstm_arm.cpp
+++ b/src/layer/arm/lstm_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "lstm_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/lstm_arm_asimdhp.cpp b/src/layer/arm/lstm_arm_asimdhp.cpp
index a394bad4c2e..8a3ee63e40a 100644
--- a/src/layer/arm/lstm_arm_asimdhp.cpp
+++ b/src/layer/arm/lstm_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "lstm_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/mish_arm.cpp b/src/layer/arm/mish_arm.cpp
index 54757380d0c..31c9f77df63 100644
--- a/src/layer/arm/mish_arm.cpp
+++ b/src/layer/arm/mish_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "mish_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/mish_arm_asimdhp.cpp b/src/layer/arm/mish_arm_asimdhp.cpp
index e8db14d3e41..0e04883370e 100644
--- a/src/layer/arm/mish_arm_asimdhp.cpp
+++ b/src/layer/arm/mish_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "mish_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/quantize_arm.cpp b/src/layer/arm/quantize_arm.cpp
index aa2a61a3472..6e395a9bb76 100644
--- a/src/layer/arm/quantize_arm.cpp
+++ b/src/layer/arm/quantize_arm.cpp
@@ -15,8 +15,6 @@
 
 #include "quantize_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/quantize_arm_asimdhp.cpp b/src/layer/arm/quantize_arm_asimdhp.cpp
index d3a66271654..faccb907b41 100644
--- a/src/layer/arm/quantize_arm_asimdhp.cpp
+++ b/src/layer/arm/quantize_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "quantize_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/requantize_arm.cpp b/src/layer/arm/requantize_arm.cpp
index 4d4531e9438..32fdd961433 100644
--- a/src/layer/arm/requantize_arm.cpp
+++ b/src/layer/arm/requantize_arm.cpp
@@ -15,8 +15,6 @@
 
 #include "requantize_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/rnn_arm.cpp b/src/layer/arm/rnn_arm.cpp
index 87892d7ada2..19f439ea2d5 100644
--- a/src/layer/arm/rnn_arm.cpp
+++ b/src/layer/arm/rnn_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "rnn_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/rnn_arm_asimdhp.cpp b/src/layer/arm/rnn_arm_asimdhp.cpp
index 79fb0b1db1e..c34b3e8bb48 100644
--- a/src/layer/arm/rnn_arm_asimdhp.cpp
+++ b/src/layer/arm/rnn_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "rnn_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/layer/arm/sigmoid_arm.cpp b/src/layer/arm/sigmoid_arm.cpp
index fb79c4d56c1..af2b396dd5e 100644
--- a/src/layer/arm/sigmoid_arm.cpp
+++ b/src/layer/arm/sigmoid_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "sigmoid_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/sigmoid_arm_asimdhp.cpp b/src/layer/arm/sigmoid_arm_asimdhp.cpp
index 3e5e6cd830d..65c32ee3e67 100644
--- a/src/layer/arm/sigmoid_arm_asimdhp.cpp
+++ b/src/layer/arm/sigmoid_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "sigmoid_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/softmax_arm.cpp b/src/layer/arm/softmax_arm.cpp
index 81907555469..48faaf91061 100644
--- a/src/layer/arm/softmax_arm.cpp
+++ b/src/layer/arm/softmax_arm.cpp
@@ -15,7 +15,6 @@
 #include "softmax_arm.h"
 
 #include <float.h>
-#include <math.h>
 
 #if __ARM_NEON
 #include <arm_neon.h>
diff --git a/src/layer/arm/softmax_arm_asimdhp.cpp b/src/layer/arm/softmax_arm_asimdhp.cpp
index 2460a92f435..d8efaf4c3b9 100644
--- a/src/layer/arm/softmax_arm_asimdhp.cpp
+++ b/src/layer/arm/softmax_arm_asimdhp.cpp
@@ -15,7 +15,6 @@
 #include "softmax_arm.h"
 
 #include <float.h>
-#include <math.h>
 
 #if __ARM_NEON
 #include <arm_neon.h>
diff --git a/src/layer/arm/swish_arm.cpp b/src/layer/arm/swish_arm.cpp
index 8b2ff9a01e5..d68e617276c 100644
--- a/src/layer/arm/swish_arm.cpp
+++ b/src/layer/arm/swish_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "swish_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/swish_arm_asimdhp.cpp b/src/layer/arm/swish_arm_asimdhp.cpp
index 5a598f67501..4aee8a898c4 100644
--- a/src/layer/arm/swish_arm_asimdhp.cpp
+++ b/src/layer/arm/swish_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "swish_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/tanh_arm.cpp b/src/layer/arm/tanh_arm.cpp
index 0b9dd5c95e8..6e86d7ad300 100644
--- a/src/layer/arm/tanh_arm.cpp
+++ b/src/layer/arm/tanh_arm.cpp
@@ -14,8 +14,6 @@
 
 #include "tanh_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/tanh_arm_asimdhp.cpp b/src/layer/arm/tanh_arm_asimdhp.cpp
index e9297aa71a7..10f3303a1ce 100644
--- a/src/layer/arm/tanh_arm_asimdhp.cpp
+++ b/src/layer/arm/tanh_arm_asimdhp.cpp
@@ -14,8 +14,6 @@
 
 #include "tanh_arm.h"
 
-#include <math.h>
-
 #if __ARM_NEON
 #include <arm_neon.h>
 #include "neon_mathfun.h"
diff --git a/src/layer/arm/unaryop_arm.cpp b/src/layer/arm/unaryop_arm.cpp
index 5a054cc7c4d..e2dbd68c3a4 100644
--- a/src/layer/arm/unaryop_arm.cpp
+++ b/src/layer/arm/unaryop_arm.cpp
@@ -14,9 +14,7 @@
 
 #include "unaryop_arm.h"
 
-#include <fenv.h>
 #include <float.h>
-#include <math.h>
 
 #if __ARM_NEON
 #include <arm_neon.h>
diff --git a/src/layer/arm/unaryop_arm_asimdhp.cpp b/src/layer/arm/unaryop_arm_asimdhp.cpp
index 02532db4114..ac64fc708f9 100644
--- a/src/layer/arm/unaryop_arm_asimdhp.cpp
+++ b/src/layer/arm/unaryop_arm_asimdhp.cpp
@@ -14,9 +14,7 @@
 
 #include "unaryop_arm.h"
 
-#include <fenv.h>
 #include <float.h>
-#include <math.h>
 
 #if __ARM_NEON
 #include <arm_neon.h>
diff --git a/src/layer/batchnorm.cpp b/src/layer/batchnorm.cpp
index cf0f871e58f..b13e5ef2966 100644
--- a/src/layer/batchnorm.cpp
+++ b/src/layer/batchnorm.cpp
@@ -14,8 +14,6 @@
 
 #include "batchnorm.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 BatchNorm::BatchNorm()
diff --git a/src/layer/binaryop.cpp b/src/layer/binaryop.cpp
index 0ffaf80e391..52d3d083b31 100644
--- a/src/layer/binaryop.cpp
+++ b/src/layer/binaryop.cpp
@@ -14,8 +14,6 @@
 
 #include "binaryop.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 BinaryOp::BinaryOp()
diff --git a/src/layer/bnll.cpp b/src/layer/bnll.cpp
index 72c2ab16170..9341ebcfcec 100644
--- a/src/layer/bnll.cpp
+++ b/src/layer/bnll.cpp
@@ -14,8 +14,6 @@
 
 #include "bnll.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 BNLL::BNLL()
diff --git a/src/layer/celu.cpp b/src/layer/celu.cpp
index 58782f877cb..8c17244c0eb 100644
--- a/src/layer/celu.cpp
+++ b/src/layer/celu.cpp
@@ -14,8 +14,6 @@
 
 #include "celu.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 CELU::CELU()
diff --git a/src/layer/detectionoutput.cpp b/src/layer/detectionoutput.cpp
index 266beaca75a..f90b904789b 100644
--- a/src/layer/detectionoutput.cpp
+++ b/src/layer/detectionoutput.cpp
@@ -14,8 +14,6 @@
 
 #include "detectionoutput.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 DetectionOutput::DetectionOutput()
diff --git a/src/layer/dropout.cpp b/src/layer/dropout.cpp
index f64f7ea3008..9e5ddaa17b5 100644
--- a/src/layer/dropout.cpp
+++ b/src/layer/dropout.cpp
@@ -14,8 +14,6 @@
 
 #include "dropout.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Dropout::Dropout()
diff --git a/src/layer/elu.cpp b/src/layer/elu.cpp
index b14c1131b0c..e710d4f1cc5 100644
--- a/src/layer/elu.cpp
+++ b/src/layer/elu.cpp
@@ -14,8 +14,6 @@
 
 #include "elu.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 ELU::ELU()
diff --git a/src/layer/erf.cpp b/src/layer/erf.cpp
index c5f56e835f0..8b455919ab2 100644
--- a/src/layer/erf.cpp
+++ b/src/layer/erf.cpp
@@ -13,7 +13,6 @@
 // specific language governing permissions and limitations under the License.
 
 #include "erf.h"
-#include <math.h>
 
 namespace ncnn {
 
diff --git a/src/layer/exp.cpp b/src/layer/exp.cpp
index ea8bf7dbda7..83644a7934d 100644
--- a/src/layer/exp.cpp
+++ b/src/layer/exp.cpp
@@ -14,8 +14,6 @@
 
 #include "exp.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Exp::Exp()
diff --git a/src/layer/fused_activation.h b/src/layer/fused_activation.h
index a331a6df5da..275fd9e2f9a 100644
--- a/src/layer/fused_activation.h
+++ b/src/layer/fused_activation.h
@@ -15,7 +15,6 @@
 #ifndef FUSED_ACTIVATION_H
 #define FUSED_ACTIVATION_H
 
-#include <math.h>
 #include "mat.h"
 #include "layer_type.h"
 
diff --git a/src/layer/gelu.cpp b/src/layer/gelu.cpp
index 32b2b89954f..d1072653774 100644
--- a/src/layer/gelu.cpp
+++ b/src/layer/gelu.cpp
@@ -14,8 +14,6 @@
 
 #include "gelu.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 GELU::GELU()
diff --git a/src/layer/glu.cpp b/src/layer/glu.cpp
index 9555b88c645..8f8e057e9a4 100644
--- a/src/layer/glu.cpp
+++ b/src/layer/glu.cpp
@@ -14,8 +14,6 @@
 
 #include "glu.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 GLU::GLU()
diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp
index e8579cf4aef..abeec6fa5be 100644
--- a/src/layer/gridsample.cpp
+++ b/src/layer/gridsample.cpp
@@ -13,7 +13,6 @@
 // specific language governing permissions and limitations under the License.
 
 #include "gridsample.h"
-#include <math.h>
 
 namespace ncnn {
 
diff --git a/src/layer/groupnorm.cpp b/src/layer/groupnorm.cpp
index f07be96cb54..7d28024d5ab 100644
--- a/src/layer/groupnorm.cpp
+++ b/src/layer/groupnorm.cpp
@@ -14,8 +14,6 @@
 
 #include "groupnorm.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 GroupNorm::GroupNorm()
diff --git a/src/layer/gru.cpp b/src/layer/gru.cpp
index 1f7ddaef4ac..b1ef2e0da45 100644
--- a/src/layer/gru.cpp
+++ b/src/layer/gru.cpp
@@ -14,8 +14,6 @@
 
 #include "gru.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 GRU::GRU()
diff --git a/src/layer/instancenorm.cpp b/src/layer/instancenorm.cpp
index 259fd7b26e5..27dba6c2a6b 100644
--- a/src/layer/instancenorm.cpp
+++ b/src/layer/instancenorm.cpp
@@ -14,8 +14,6 @@
 
 #include "instancenorm.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 InstanceNorm::InstanceNorm()
diff --git a/src/layer/layernorm.cpp b/src/layer/layernorm.cpp
index d1361dec644..a4ff036fb15 100644
--- a/src/layer/layernorm.cpp
+++ b/src/layer/layernorm.cpp
@@ -14,8 +14,6 @@
 
 #include "layernorm.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 LayerNorm::LayerNorm()
diff --git a/src/layer/log.cpp b/src/layer/log.cpp
index 135cc4ebb38..422ebbb2207 100644
--- a/src/layer/log.cpp
+++ b/src/layer/log.cpp
@@ -14,8 +14,6 @@
 
 #include "log.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Log::Log()
diff --git a/src/layer/loongarch/binaryop_loongarch.cpp b/src/layer/loongarch/binaryop_loongarch.cpp
index 0250226dc60..33916d966aa 100644
--- a/src/layer/loongarch/binaryop_loongarch.cpp
+++ b/src/layer/loongarch/binaryop_loongarch.cpp
@@ -14,8 +14,6 @@
 
 #include "binaryop_loongarch.h"
 
-#include <math.h>
-
 #if __loongarch_sx
 #include <lsxintrin.h>
 #include "lsx_mathfun.h"
diff --git a/src/layer/loongarch/interp_loongarch.cpp b/src/layer/loongarch/interp_loongarch.cpp
index 94d25cf005e..7c47c108859 100644
--- a/src/layer/loongarch/interp_loongarch.cpp
+++ b/src/layer/loongarch/interp_loongarch.cpp
@@ -14,8 +14,6 @@
 
 #include "interp_loongarch.h"
 
-#include <math.h>
-
 #if __loongarch_sx
 #include <lsxintrin.h>
 #endif // __loongarch_sx
diff --git a/src/layer/loongarch/loongarch_usability.h b/src/layer/loongarch/loongarch_usability.h
index d3ae5dec279..0cd82e8fb45 100644
--- a/src/layer/loongarch/loongarch_usability.h
+++ b/src/layer/loongarch/loongarch_usability.h
@@ -19,7 +19,6 @@
 #include <lsxintrin.h>
 #endif // __loongarch_sx
 
-#include <math.h>
 #include <stdint.h>
 
 namespace ncnn {
diff --git a/src/layer/loongarch/mish_loongarch.cpp b/src/layer/loongarch/mish_loongarch.cpp
index 8558e2f8cb0..90e5ffe5484 100644
--- a/src/layer/loongarch/mish_loongarch.cpp
+++ b/src/layer/loongarch/mish_loongarch.cpp
@@ -19,8 +19,6 @@
 #include "lsx_mathfun.h"
 #endif // __loongarch_sx
 
-#include <math.h>
-
 namespace ncnn {
 
 Mish_loongarch::Mish_loongarch()
diff --git a/src/layer/loongarch/quantize_loongarch.cpp b/src/layer/loongarch/quantize_loongarch.cpp
index 657ff2d06bf..a0dd618771d 100644
--- a/src/layer/loongarch/quantize_loongarch.cpp
+++ b/src/layer/loongarch/quantize_loongarch.cpp
@@ -14,8 +14,6 @@
 
 #include "quantize_loongarch.h"
 
-#include <math.h>
-
 #if __loongarch_sx
 #include <lsxintrin.h>
 #endif // __loongarch_sx
diff --git a/src/layer/loongarch/requantize_loongarch.cpp b/src/layer/loongarch/requantize_loongarch.cpp
index 556d20de4f6..3399ac096b6 100644
--- a/src/layer/loongarch/requantize_loongarch.cpp
+++ b/src/layer/loongarch/requantize_loongarch.cpp
@@ -14,8 +14,6 @@
 
 #include "requantize_loongarch.h"
 
-#include <math.h>
-
 #if __loongarch_sx
 #include <lsxintrin.h>
 #endif // __loongarch_sx
diff --git a/src/layer/loongarch/sigmoid_loongarch.cpp b/src/layer/loongarch/sigmoid_loongarch.cpp
index 6d112804f26..c6f83c24708 100644
--- a/src/layer/loongarch/sigmoid_loongarch.cpp
+++ b/src/layer/loongarch/sigmoid_loongarch.cpp
@@ -21,8 +21,6 @@
 
 #include "loongarch_usability.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Sigmoid_loongarch::Sigmoid_loongarch()
diff --git a/src/layer/loongarch/softmax_loongarch.cpp b/src/layer/loongarch/softmax_loongarch.cpp
index 88b49559754..513f9a5e9ca 100644
--- a/src/layer/loongarch/softmax_loongarch.cpp
+++ b/src/layer/loongarch/softmax_loongarch.cpp
@@ -15,7 +15,6 @@
 #include "softmax_loongarch.h"
 
 #include <float.h>
-#include <math.h>
 
 #if __loongarch_sx
 #include <lsxintrin.h>
diff --git a/src/layer/loongarch/swish_loongarch.cpp b/src/layer/loongarch/swish_loongarch.cpp
index 9c9005de6fc..7e80339c937 100644
--- a/src/layer/loongarch/swish_loongarch.cpp
+++ b/src/layer/loongarch/swish_loongarch.cpp
@@ -19,8 +19,6 @@
 #include "lsx_mathfun.h"
 #endif // __loongarch_sx
 
-#include <math.h>
-
 namespace ncnn {
 
 Swish_loongarch::Swish_loongarch()
diff --git a/src/layer/loongarch/tanh_loongarch.cpp b/src/layer/loongarch/tanh_loongarch.cpp
index 13227fa71e3..b592c3f57b2 100644
--- a/src/layer/loongarch/tanh_loongarch.cpp
+++ b/src/layer/loongarch/tanh_loongarch.cpp
@@ -19,8 +19,6 @@
 #include "lsx_mathfun.h"
 #endif // __loongarch_sx
 
-#include <math.h>
-
 namespace ncnn {
 
 TanH_loongarch::TanH_loongarch()
diff --git a/src/layer/loongarch/unaryop_loongarch.cpp b/src/layer/loongarch/unaryop_loongarch.cpp
index 4d4818cb5af..95a4e9984b6 100644
--- a/src/layer/loongarch/unaryop_loongarch.cpp
+++ b/src/layer/loongarch/unaryop_loongarch.cpp
@@ -14,9 +14,7 @@
 
 #include "unaryop_loongarch.h"
 
-#include <fenv.h>
 #include <float.h>
-#include <math.h>
 
 #if __loongarch_sx
 #include <lsxintrin.h>
diff --git a/src/layer/lrn.cpp b/src/layer/lrn.cpp
index aaa8855135a..c18f1def9fb 100644
--- a/src/layer/lrn.cpp
+++ b/src/layer/lrn.cpp
@@ -14,8 +14,6 @@
 
 #include "lrn.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 LRN::LRN()
diff --git a/src/layer/lstm.cpp b/src/layer/lstm.cpp
index f2aa19f25ab..c761a98d4dd 100644
--- a/src/layer/lstm.cpp
+++ b/src/layer/lstm.cpp
@@ -14,8 +14,6 @@
 
 #include "lstm.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 LSTM::LSTM()
diff --git a/src/layer/mips/binaryop_mips.cpp b/src/layer/mips/binaryop_mips.cpp
index ab8bfe86ac3..188a0860508 100644
--- a/src/layer/mips/binaryop_mips.cpp
+++ b/src/layer/mips/binaryop_mips.cpp
@@ -14,8 +14,6 @@
 
 #include "binaryop_mips.h"
 
-#include <math.h>
-
 #if __mips_msa
 #include <msa.h>
 #include "msa_mathfun.h"
diff --git a/src/layer/mips/interp_mips.cpp b/src/layer/mips/interp_mips.cpp
index 7d77e9b9dbf..2cc3202e915 100644
--- a/src/layer/mips/interp_mips.cpp
+++ b/src/layer/mips/interp_mips.cpp
@@ -14,8 +14,6 @@
 
 #include "interp_mips.h"
 
-#include <math.h>
-
 #if __mips_msa
 #include <msa.h>
 #endif // __mips_msa
diff --git a/src/layer/mips/mips_usability.h b/src/layer/mips/mips_usability.h
index 4aee94e75a9..662320ee747 100644
--- a/src/layer/mips/mips_usability.h
+++ b/src/layer/mips/mips_usability.h
@@ -20,7 +20,6 @@
 #include <msa.h>
 #endif // __mips_msa
 
-#include <math.h>
 #include <stdint.h>
 
 namespace ncnn {
diff --git a/src/layer/mips/mish_mips.cpp b/src/layer/mips/mish_mips.cpp
index 3dc81450914..32f8a6e173c 100644
--- a/src/layer/mips/mish_mips.cpp
+++ b/src/layer/mips/mish_mips.cpp
@@ -19,8 +19,6 @@
 #include "msa_mathfun.h"
 #endif // __mips_msa
 
-#include <math.h>
-
 namespace ncnn {
 
 Mish_mips::Mish_mips()
diff --git a/src/layer/mips/quantize_mips.cpp b/src/layer/mips/quantize_mips.cpp
index a4b61601661..963d0908ce4 100644
--- a/src/layer/mips/quantize_mips.cpp
+++ b/src/layer/mips/quantize_mips.cpp
@@ -14,8 +14,6 @@
 
 #include "quantize_mips.h"
 
-#include <math.h>
-
 #if __mips_msa
 #include <msa.h>
 #endif // __mips_msa
diff --git a/src/layer/mips/requantize_mips.cpp b/src/layer/mips/requantize_mips.cpp
index 095f42084c9..44e55f89477 100644
--- a/src/layer/mips/requantize_mips.cpp
+++ b/src/layer/mips/requantize_mips.cpp
@@ -14,8 +14,6 @@
 
 #include "requantize_mips.h"
 
-#include <math.h>
-
 #if __mips_msa
 #include <msa.h>
 #endif // __mips_msa
diff --git a/src/layer/mips/sigmoid_mips.cpp b/src/layer/mips/sigmoid_mips.cpp
index af44f811364..b7f83f37bb2 100644
--- a/src/layer/mips/sigmoid_mips.cpp
+++ b/src/layer/mips/sigmoid_mips.cpp
@@ -21,8 +21,6 @@
 
 #include "mips_usability.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Sigmoid_mips::Sigmoid_mips()
diff --git a/src/layer/mips/softmax_mips.cpp b/src/layer/mips/softmax_mips.cpp
index ae35782da9f..f00b2849670 100644
--- a/src/layer/mips/softmax_mips.cpp
+++ b/src/layer/mips/softmax_mips.cpp
@@ -15,7 +15,6 @@
 #include "softmax_mips.h"
 
 #include <float.h>
-#include <math.h>
 
 #if __mips_msa
 #include <msa.h>
diff --git a/src/layer/mips/swish_mips.cpp b/src/layer/mips/swish_mips.cpp
index d3a7d032b55..6c6a368301d 100644
--- a/src/layer/mips/swish_mips.cpp
+++ b/src/layer/mips/swish_mips.cpp
@@ -19,8 +19,6 @@
 #include "msa_mathfun.h"
 #endif // __mips_msa
 
-#include <math.h>
-
 namespace ncnn {
 
 Swish_mips::Swish_mips()
diff --git a/src/layer/mips/tanh_mips.cpp b/src/layer/mips/tanh_mips.cpp
index c2197fb75d9..4546a98de63 100644
--- a/src/layer/mips/tanh_mips.cpp
+++ b/src/layer/mips/tanh_mips.cpp
@@ -19,8 +19,6 @@
 #include "msa_mathfun.h"
 #endif // __mips_msa
 
-#include <math.h>
-
 namespace ncnn {
 
 TanH_mips::TanH_mips()
diff --git a/src/layer/mips/unaryop_mips.cpp b/src/layer/mips/unaryop_mips.cpp
index b923535a2d8..cb3c115cd00 100644
--- a/src/layer/mips/unaryop_mips.cpp
+++ b/src/layer/mips/unaryop_mips.cpp
@@ -14,9 +14,7 @@
 
 #include "unaryop_mips.h"
 
-#include <fenv.h>
 #include <float.h>
-#include <math.h>
 
 #if __mips_msa
 #include <msa.h>
diff --git a/src/layer/mish.cpp b/src/layer/mish.cpp
index 8b2f16500c7..f27d112f445 100644
--- a/src/layer/mish.cpp
+++ b/src/layer/mish.cpp
@@ -14,8 +14,6 @@
 
 #include "mish.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Mish::Mish()
diff --git a/src/layer/mvn.cpp b/src/layer/mvn.cpp
index 773ace23c50..713fb1b4195 100644
--- a/src/layer/mvn.cpp
+++ b/src/layer/mvn.cpp
@@ -14,8 +14,6 @@
 
 #include "mvn.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 MVN::MVN()
diff --git a/src/layer/normalize.cpp b/src/layer/normalize.cpp
index 2aa6109b187..a86851117c9 100644
--- a/src/layer/normalize.cpp
+++ b/src/layer/normalize.cpp
@@ -14,8 +14,6 @@
 
 #include "normalize.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Normalize::Normalize()
diff --git a/src/layer/power.cpp b/src/layer/power.cpp
index a25d23bfb63..8e4ef25852b 100644
--- a/src/layer/power.cpp
+++ b/src/layer/power.cpp
@@ -14,8 +14,6 @@
 
 #include "power.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Power::Power()
diff --git a/src/layer/priorbox.cpp b/src/layer/priorbox.cpp
index 82249a55f63..6e54ba0162d 100644
--- a/src/layer/priorbox.cpp
+++ b/src/layer/priorbox.cpp
@@ -14,8 +14,6 @@
 
 #include "priorbox.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 PriorBox::PriorBox()
diff --git a/src/layer/proposal.cpp b/src/layer/proposal.cpp
index 908b60692da..a7dce35f6ee 100644
--- a/src/layer/proposal.cpp
+++ b/src/layer/proposal.cpp
@@ -14,8 +14,6 @@
 
 #include "proposal.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Proposal::Proposal()
diff --git a/src/layer/psroipooling.cpp b/src/layer/psroipooling.cpp
index ebe2ad800c6..c576e31161c 100644
--- a/src/layer/psroipooling.cpp
+++ b/src/layer/psroipooling.cpp
@@ -14,8 +14,6 @@
 
 #include "psroipooling.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 PSROIPooling::PSROIPooling()
diff --git a/src/layer/quantize.cpp b/src/layer/quantize.cpp
index 54bfb836f52..a53cebdd9a0 100644
--- a/src/layer/quantize.cpp
+++ b/src/layer/quantize.cpp
@@ -14,8 +14,6 @@
 
 #include "quantize.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Quantize::Quantize()
diff --git a/src/layer/reduction.cpp b/src/layer/reduction.cpp
index f7c9013b8f4..4d4f7fb578b 100644
--- a/src/layer/reduction.cpp
+++ b/src/layer/reduction.cpp
@@ -16,7 +16,6 @@
 
 #include <float.h>
 #include <limits.h>
-#include <math.h>
 
 namespace ncnn {
 
diff --git a/src/layer/requantize.cpp b/src/layer/requantize.cpp
index 0bcbbff879f..e11fbc6b272 100644
--- a/src/layer/requantize.cpp
+++ b/src/layer/requantize.cpp
@@ -15,8 +15,6 @@
 
 #include "requantize.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 static inline signed char float2int8(float v)
diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp
index c3d4258dd5e..da4593197f4 100644
--- a/src/layer/riscv/binaryop_riscv.cpp
+++ b/src/layer/riscv/binaryop_riscv.cpp
@@ -17,8 +17,6 @@
 
 #include "binaryop_riscv.h"
 
-#include <math.h>
-
 #if __riscv_vector
 #include <riscv_vector.h>
 #include "rvv_mathfun.h"
diff --git a/src/layer/riscv/instancenorm_riscv.cpp b/src/layer/riscv/instancenorm_riscv.cpp
index 95616866b8a..20cf5d94c7d 100644
--- a/src/layer/riscv/instancenorm_riscv.cpp
+++ b/src/layer/riscv/instancenorm_riscv.cpp
@@ -14,8 +14,6 @@
 
 #include "instancenorm_riscv.h"
 
-#include <math.h>
-
 #if __riscv_vector
 #include <riscv_vector.h>
 #endif // __riscv_vector
diff --git a/src/layer/riscv/interp_riscv.cpp b/src/layer/riscv/interp_riscv.cpp
index ea8344985ed..ac72cf9b63c 100644
--- a/src/layer/riscv/interp_riscv.cpp
+++ b/src/layer/riscv/interp_riscv.cpp
@@ -14,8 +14,6 @@
 
 #include "interp_riscv.h"
 
-#include <math.h>
-
 #if __riscv_vector
 #include <riscv_vector.h>
 #include "riscv_usability.h"
diff --git a/src/layer/riscv/mish_riscv.cpp b/src/layer/riscv/mish_riscv.cpp
index 4ddb1470006..57b17d3a732 100644
--- a/src/layer/riscv/mish_riscv.cpp
+++ b/src/layer/riscv/mish_riscv.cpp
@@ -20,8 +20,6 @@
 #include "rvv_mathfun_fp16s.h"
 #endif // __riscv_vector
 
-#include <math.h>
-
 namespace ncnn {
 
 Mish_riscv::Mish_riscv()
diff --git a/src/layer/riscv/sigmoid_riscv.cpp b/src/layer/riscv/sigmoid_riscv.cpp
index 6c10582c668..14770f95e78 100644
--- a/src/layer/riscv/sigmoid_riscv.cpp
+++ b/src/layer/riscv/sigmoid_riscv.cpp
@@ -20,8 +20,6 @@
 #include "rvv_mathfun_fp16s.h"
 #endif // __riscv_vector
 
-#include <math.h>
-
 namespace ncnn {
 
 Sigmoid_riscv::Sigmoid_riscv()
diff --git a/src/layer/riscv/swish_riscv.cpp b/src/layer/riscv/swish_riscv.cpp
index 17493d7db69..7e2e2488c42 100644
--- a/src/layer/riscv/swish_riscv.cpp
+++ b/src/layer/riscv/swish_riscv.cpp
@@ -20,8 +20,6 @@
 #include "rvv_mathfun_fp16s.h"
 #endif // __riscv_vector
 
-#include <math.h>
-
 namespace ncnn {
 
 Swish_riscv::Swish_riscv()
diff --git a/src/layer/riscv/tanh_riscv.cpp b/src/layer/riscv/tanh_riscv.cpp
index d47de61dc59..0c147b15bd6 100644
--- a/src/layer/riscv/tanh_riscv.cpp
+++ b/src/layer/riscv/tanh_riscv.cpp
@@ -20,8 +20,6 @@
 #include "rvv_mathfun_fp16s.h"
 #endif // __riscv_vector
 
-#include <math.h>
-
 namespace ncnn {
 
 TanH_riscv::TanH_riscv()
diff --git a/src/layer/riscv/unaryop_riscv.cpp b/src/layer/riscv/unaryop_riscv.cpp
index 4c7c2fabc7f..b6acf25e438 100644
--- a/src/layer/riscv/unaryop_riscv.cpp
+++ b/src/layer/riscv/unaryop_riscv.cpp
@@ -20,8 +20,6 @@
 #include "rvv_mathfun_fp16s.h"
 #endif // __riscv_vector
 
-#include <math.h>
-
 namespace ncnn {
 
 UnaryOp_riscv::UnaryOp_riscv()
diff --git a/src/layer/rnn.cpp b/src/layer/rnn.cpp
index d1856ce6fa9..6cc8ba5c9bd 100644
--- a/src/layer/rnn.cpp
+++ b/src/layer/rnn.cpp
@@ -14,8 +14,6 @@
 
 #include "rnn.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 RNN::RNN()
diff --git a/src/layer/roialign.cpp b/src/layer/roialign.cpp
index 3d1c14538ce..a344f67f79d 100644
--- a/src/layer/roialign.cpp
+++ b/src/layer/roialign.cpp
@@ -15,7 +15,6 @@
 #include "roialign.h"
 
 #include <assert.h>
-#include <math.h>
 
 namespace ncnn {
 
diff --git a/src/layer/roipooling.cpp b/src/layer/roipooling.cpp
index 96b43d3850f..9fd843737a3 100644
--- a/src/layer/roipooling.cpp
+++ b/src/layer/roipooling.cpp
@@ -14,8 +14,6 @@
 
 #include "roipooling.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 ROIPooling::ROIPooling()
diff --git a/src/layer/selu.cpp b/src/layer/selu.cpp
index faa7e199825..42a4ff2a813 100644
--- a/src/layer/selu.cpp
+++ b/src/layer/selu.cpp
@@ -14,8 +14,6 @@
 
 #include "selu.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 SELU::SELU()
diff --git a/src/layer/sigmoid.cpp b/src/layer/sigmoid.cpp
index 963c0f98f5a..4ed0dab5e81 100644
--- a/src/layer/sigmoid.cpp
+++ b/src/layer/sigmoid.cpp
@@ -14,8 +14,6 @@
 
 #include "sigmoid.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Sigmoid::Sigmoid()
diff --git a/src/layer/softmax.cpp b/src/layer/softmax.cpp
index a948f07f354..2768a82c20f 100644
--- a/src/layer/softmax.cpp
+++ b/src/layer/softmax.cpp
@@ -15,7 +15,6 @@
 #include "softmax.h"
 
 #include <float.h>
-#include <math.h>
 
 namespace ncnn {
 
diff --git a/src/layer/softplus.cpp b/src/layer/softplus.cpp
index 615496037c4..4910aad2949 100644
--- a/src/layer/softplus.cpp
+++ b/src/layer/softplus.cpp
@@ -14,8 +14,6 @@
 
 #include "softplus.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Softplus::Softplus()
diff --git a/src/layer/spp.cpp b/src/layer/spp.cpp
index a2678a32a8b..b7070955cb8 100644
--- a/src/layer/spp.cpp
+++ b/src/layer/spp.cpp
@@ -14,8 +14,6 @@
 
 #include "spp.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 SPP::SPP()
diff --git a/src/layer/statisticspooling.cpp b/src/layer/statisticspooling.cpp
index 1947b61c875..9ed6d22f417 100644
--- a/src/layer/statisticspooling.cpp
+++ b/src/layer/statisticspooling.cpp
@@ -14,7 +14,6 @@
 
 #include <float.h>
 #include <limits.h>
-#include <math.h>
 
 namespace ncnn {
 
diff --git a/src/layer/swish.cpp b/src/layer/swish.cpp
index 3d8f3e3d65f..2816230c180 100644
--- a/src/layer/swish.cpp
+++ b/src/layer/swish.cpp
@@ -14,8 +14,6 @@
 
 #include "swish.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Swish::Swish()
diff --git a/src/layer/tanh.cpp b/src/layer/tanh.cpp
index a7d0249e1b9..c4b68352af6 100644
--- a/src/layer/tanh.cpp
+++ b/src/layer/tanh.cpp
@@ -14,8 +14,6 @@
 
 #include "tanh.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 TanH::TanH()
diff --git a/src/layer/unaryop.cpp b/src/layer/unaryop.cpp
index 2fe77717ed3..b05add15cfb 100644
--- a/src/layer/unaryop.cpp
+++ b/src/layer/unaryop.cpp
@@ -14,9 +14,8 @@
 
 #include "unaryop.h"
 
-#include <fenv.h>
+// #include <fenv.h>
 #include <float.h>
-#include <math.h>
 
 namespace ncnn {
 
diff --git a/src/layer/vulkan/binaryop_vulkan.cpp b/src/layer/vulkan/binaryop_vulkan.cpp
index 3c0ad7299b5..37c0bb79e51 100644
--- a/src/layer/vulkan/binaryop_vulkan.cpp
+++ b/src/layer/vulkan/binaryop_vulkan.cpp
@@ -16,8 +16,6 @@
 
 #include "layer_shader_type.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 BinaryOp_vulkan::BinaryOp_vulkan()
diff --git a/src/layer/vulkan/priorbox_vulkan.cpp b/src/layer/vulkan/priorbox_vulkan.cpp
index ba41fc96e59..5cfe341cd78 100644
--- a/src/layer/vulkan/priorbox_vulkan.cpp
+++ b/src/layer/vulkan/priorbox_vulkan.cpp
@@ -17,8 +17,6 @@
 #include "layer_shader_type.h"
 #include "platform.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 PriorBox_vulkan::PriorBox_vulkan()
diff --git a/src/layer/x86/binaryop_x86.cpp b/src/layer/x86/binaryop_x86.cpp
index d3f62e09d36..14ad9d5f638 100644
--- a/src/layer/x86/binaryop_x86.cpp
+++ b/src/layer/x86/binaryop_x86.cpp
@@ -26,8 +26,6 @@
 #endif // __AVX__
 #endif // __SSE2__
 
-#include <math.h>
-
 namespace ncnn {
 
 BinaryOp_x86::BinaryOp_x86()
diff --git a/src/layer/x86/bnll_x86.cpp b/src/layer/x86/bnll_x86.cpp
index e082d79fc48..e2eb995d095 100644
--- a/src/layer/x86/bnll_x86.cpp
+++ b/src/layer/x86/bnll_x86.cpp
@@ -25,7 +25,6 @@
 #endif // __AVX512F__
 #endif // __AVX__
 #endif // __SSE2__
-#include <math.h>
 
 namespace ncnn {
 
diff --git a/src/layer/x86/interp_x86.cpp b/src/layer/x86/interp_x86.cpp
index 193fbe99a2d..f08b6bb9aff 100644
--- a/src/layer/x86/interp_x86.cpp
+++ b/src/layer/x86/interp_x86.cpp
@@ -14,8 +14,6 @@
 
 #include "interp_x86.h"
 
-#include <math.h>
-
 #if __SSE2__
 #include <emmintrin.h>
 #if __AVX__
diff --git a/src/layer/x86/layernorm_x86.cpp b/src/layer/x86/layernorm_x86.cpp
index ba293fb95c6..21840c6b3d2 100644
--- a/src/layer/x86/layernorm_x86.cpp
+++ b/src/layer/x86/layernorm_x86.cpp
@@ -14,7 +14,7 @@
 
 #include "layernorm_x86.h"
 #include "x86_usability.h"
-#include <math.h>
+
 #include <cpu.h>
 
 #if __SSE2__
diff --git a/src/layer/x86/lrn_x86.cpp b/src/layer/x86/lrn_x86.cpp
index cfcc8777b45..b05c75996a1 100644
--- a/src/layer/x86/lrn_x86.cpp
+++ b/src/layer/x86/lrn_x86.cpp
@@ -18,8 +18,6 @@
 #include "avx_mathfun.h"
 #endif // __AVX__
 
-#include <math.h>
-
 namespace ncnn {
 
 int LRN_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
diff --git a/src/layer/x86/lstm_x86.cpp b/src/layer/x86/lstm_x86.cpp
index 21f528361e2..6ba218e53d3 100644
--- a/src/layer/x86/lstm_x86.cpp
+++ b/src/layer/x86/lstm_x86.cpp
@@ -24,7 +24,6 @@
 #include "x86_activation.h"
 #include "x86_usability.h"
 
-#include <math.h>
 #include "layer_type.h"
 
 namespace ncnn {
diff --git a/src/layer/x86/mish_x86.cpp b/src/layer/x86/mish_x86.cpp
index 2a45cabd2d9..e55a5e1f808 100644
--- a/src/layer/x86/mish_x86.cpp
+++ b/src/layer/x86/mish_x86.cpp
@@ -16,8 +16,6 @@
 
 #include "x86_activation.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 Mish_x86::Mish_x86()
diff --git a/src/layer/x86/quantize_x86.cpp b/src/layer/x86/quantize_x86.cpp
index e4a9157cd24..8f7ee993673 100644
--- a/src/layer/x86/quantize_x86.cpp
+++ b/src/layer/x86/quantize_x86.cpp
@@ -14,8 +14,6 @@
 
 #include "quantize_x86.h"
 
-#include <math.h>
-
 #if __SSE2__
 #include <emmintrin.h>
 #if __AVX__
diff --git a/src/layer/x86/roialign_x86.cpp b/src/layer/x86/roialign_x86.cpp
index 7c5be4b751e..0519376770f 100644
--- a/src/layer/x86/roialign_x86.cpp
+++ b/src/layer/x86/roialign_x86.cpp
@@ -14,8 +14,6 @@
 
 #include "roialign_x86.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 // adapted from detectron2
diff --git a/src/layer/x86/sigmoid_x86.cpp b/src/layer/x86/sigmoid_x86.cpp
index ed55d20859b..0cf44f84591 100644
--- a/src/layer/x86/sigmoid_x86.cpp
+++ b/src/layer/x86/sigmoid_x86.cpp
@@ -26,8 +26,6 @@
 #endif // __AVX__
 #endif // __SSE2__
 
-#include <math.h>
-
 namespace ncnn {
 
 Sigmoid_x86::Sigmoid_x86()
diff --git a/src/layer/x86/softmax_x86.cpp b/src/layer/x86/softmax_x86.cpp
index 07e7c535af2..41e5bd25d2e 100644
--- a/src/layer/x86/softmax_x86.cpp
+++ b/src/layer/x86/softmax_x86.cpp
@@ -15,7 +15,6 @@
 #include "softmax_x86.h"
 
 #include <float.h>
-#include <math.h>
 
 #if __SSE2__
 #include <emmintrin.h>
diff --git a/src/layer/x86/swish_x86.cpp b/src/layer/x86/swish_x86.cpp
index 73a074fb9ad..d8ae2695016 100644
--- a/src/layer/x86/swish_x86.cpp
+++ b/src/layer/x86/swish_x86.cpp
@@ -26,8 +26,6 @@
 #endif // __AVX__
 #endif // __SSE2__
 
-#include <math.h>
-
 namespace ncnn {
 
 Swish_x86::Swish_x86()
diff --git a/src/layer/x86/tanh_x86.cpp b/src/layer/x86/tanh_x86.cpp
index 2cebf19c2d3..bf94450e9fb 100644
--- a/src/layer/x86/tanh_x86.cpp
+++ b/src/layer/x86/tanh_x86.cpp
@@ -16,8 +16,6 @@
 
 #include "x86_activation.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 TanH_x86::TanH_x86()
diff --git a/src/layer/x86/unaryop_x86.cpp b/src/layer/x86/unaryop_x86.cpp
index 8629ab2093b..1ccd50d601a 100644
--- a/src/layer/x86/unaryop_x86.cpp
+++ b/src/layer/x86/unaryop_x86.cpp
@@ -14,9 +14,8 @@
 
 #include "unaryop_x86.h"
 
-#include <fenv.h>
+// #include <fenv.h>
 #include <float.h>
-#include <math.h>
 
 #if __SSE2__
 #include <emmintrin.h>
diff --git a/src/layer/x86/x86_activation.h b/src/layer/x86/x86_activation.h
index b02b8ee9a46..691bc65ee4c 100644
--- a/src/layer/x86/x86_activation.h
+++ b/src/layer/x86/x86_activation.h
@@ -15,7 +15,6 @@
 #ifndef X86_ACTIVATION_H
 #define X86_ACTIVATION_H
 
-#include <math.h>
 #include "mat.h"
 #include "fused_activation.h"
 #include "x86_usability.h"
diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h
index 1571cdf4928..9cb826fa2b1 100644
--- a/src/layer/x86/x86_usability.h
+++ b/src/layer/x86/x86_usability.h
@@ -15,7 +15,6 @@
 #ifndef X86_USABILITY_H
 #define X86_USABILITY_H
 
-#include <math.h>
 #if __SSE2__
 #include <emmintrin.h>
 #if __SSE4_1__
diff --git a/src/layer/x86/yolov3detectionoutput_x86.cpp b/src/layer/x86/yolov3detectionoutput_x86.cpp
index 10f26945004..175d7343524 100644
--- a/src/layer/x86/yolov3detectionoutput_x86.cpp
+++ b/src/layer/x86/yolov3detectionoutput_x86.cpp
@@ -18,7 +18,6 @@
 #include "yolov3detectionoutput_x86.h"
 
 #include <float.h>
-#include <math.h>
 
 namespace ncnn {
 
diff --git a/src/layer/yolodetectionoutput.cpp b/src/layer/yolodetectionoutput.cpp
index 967b14751f8..9b9ba7dc289 100644
--- a/src/layer/yolodetectionoutput.cpp
+++ b/src/layer/yolodetectionoutput.cpp
@@ -16,8 +16,6 @@
 
 #include "layer_type.h"
 
-#include <math.h>
-
 namespace ncnn {
 
 YoloDetectionOutput::YoloDetectionOutput()
diff --git a/src/layer/yolov3detectionoutput.cpp b/src/layer/yolov3detectionoutput.cpp
index 0cda9616746..494fb6d186a 100644
--- a/src/layer/yolov3detectionoutput.cpp
+++ b/src/layer/yolov3detectionoutput.cpp
@@ -17,7 +17,6 @@
 #include "layer_type.h"
 
 #include <float.h>
-#include <math.h>
 
 namespace ncnn {
 
diff --git a/src/mat.cpp b/src/mat.cpp
index 6e1cd702522..f758df41d40 100644
--- a/src/mat.cpp
+++ b/src/mat.cpp
@@ -21,8 +21,6 @@
 #include "layer.h"
 #include "layer_type.h"
 
-#include <math.h>
-
 #if NCNN_VULKAN
 #if NCNN_PLATFORM_API
 #if __ANDROID_API__ >= 26
diff --git a/src/mat_pixel.cpp b/src/mat_pixel.cpp
index ce9d4c479e0..221c7e5b2f8 100644
--- a/src/mat_pixel.cpp
+++ b/src/mat_pixel.cpp
@@ -15,7 +15,7 @@
 #include "mat.h"
 
 #include <limits.h>
-#include <math.h>
+
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/mat_pixel_affine.cpp b/src/mat_pixel_affine.cpp
index c2abe363d96..934fe22b1d5 100644
--- a/src/mat_pixel_affine.cpp
+++ b/src/mat_pixel_affine.cpp
@@ -17,7 +17,7 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 #include <limits.h>
-#include <math.h>
+
 #include "platform.h"
 
 namespace ncnn {
diff --git a/src/mat_pixel_resize.cpp b/src/mat_pixel_resize.cpp
index 7d171338469..e8f138d2a54 100644
--- a/src/mat_pixel_resize.cpp
+++ b/src/mat_pixel_resize.cpp
@@ -15,7 +15,7 @@
 #include "mat.h"
 
 #include <limits.h>
-#include <math.h>
+
 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON
diff --git a/src/pipeline.cpp b/src/pipeline.cpp
index efdaec80bde..8aed60e4803 100644
--- a/src/pipeline.cpp
+++ b/src/pipeline.cpp
@@ -19,8 +19,6 @@
 #include "pipelinecache.h"
 #include "option.h"
 
-#include <math.h>
-
 #if __ANDROID_API__ >= 26
 #include <android/hardware_buffer.h>
 #endif // __ANDROID_API__ >= 26
diff --git a/src/platform.h.in b/src/platform.h.in
index 0ae8f708817..be1dd508388 100644
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -20,6 +20,7 @@
 #cmakedefine01 NCNN_SIMPLEOCV
 #cmakedefine01 NCNN_SIMPLEOMP
 #cmakedefine01 NCNN_SIMPLESTL
+#cmakedefine01 NCNN_SIMPLEMATH
 #cmakedefine01 NCNN_THREADS
 #cmakedefine01 NCNN_BENCHMARK
 #cmakedefine01 NCNN_C_API
@@ -245,6 +246,14 @@ private:
 #include <string>
 #endif
 
+// simplemath
+#if NCNN_SIMPLEMATH
+#include "simplemath.h"
+#else
+#include <math.h>
+#include <fenv.h>
+#endif
+
 #endif // __cplusplus
 
 #if NCNN_STDIO
diff --git a/src/simplemath.cpp b/src/simplemath.cpp
new file mode 100644
index 00000000000..d48d23e3c20
--- /dev/null
+++ b/src/simplemath.cpp
@@ -0,0 +1,622 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "platform.h"
+
+#if NCNN_SIMPLEMATH
+
+#include "simplemath.h"
+#define __HI(X)       (*(1 + (short*)&(X))) /* second 16-bit half of float X; the high half (sign, exponent, top mantissa bits) on little-endian targets */
+#define __LO(X)       (*(short*)&(X))       /* first 16-bit half of float X; the low mantissa bits on little-endian targets */
+#define INFINITY      (1.0 / 0)             /* positive infinity produced by a floating-point division by zero */
+#define FE_TONEAREST  0                     /* rounding-mode codes understood by fesetround()/nearbyintf() below */
+#define FE_DOWNWARD   1024
+#define FE_UPWARD     2048
+#define FE_TOWARDZERO 3072
+
+/*
+* ====================================================
+* some useful constants
+* ====================================================
+*/
+static const float PI = 3.14159265358979323846;   /* pi; the double literal is rounded to float on initialisation */
+static const float PI_2 = 1.57079632679489661923; /* PI/2 */
+static const float E = 2.71828182845904523536;    /* Euler's number e */
+
+/* re-interpret the bit pattern of a uint32 as an IEEE-754 float */
+static float uint32_as_float(uint32_t a)
+{
+    // Reading a uint32_t object through a float* breaks the strict-aliasing
+    // rule and may be miscompiled at -O2; a union is the punning form that
+    // GCC, Clang and MSVC all document as supported.
+    union { uint32_t u; float f; } bits;
+
+    bits.u = a;
+    return bits.f;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+* ====================================================
+* Discontinuous function
+* ====================================================
+*/
+float fabs(float x)
+{
+    return x > 0 ? x : 0.0f - x; // 0 - x rather than -x, so that +0 and -0 both map to +0
+}
+
+float fabsf(float x)
+{
+    return fabs(x); // this file's fabs already takes and returns float
+}
+
+float fmod(float numer, float denom)
+{
+    // compare magnitudes so the shortcut below is also right for negative operands
+    const float abs_numer = numer < 0 ? -numer : numer;
+    const float abs_denom = denom < 0 ? -denom : denom;
+    // a zero divisor returns numer unchanged (as before); |numer| < |denom| needs no reduction
+    if (denom == 0.0 || abs_numer < abs_denom)
+    {
+        return numer;
+    }
+    // truncating the quotient toward zero gives the result the sign of numer, as C fmod does
+    int quotient = static_cast<int>(numer / denom);
+    return numer - quotient * denom;
+}
+
+float floor(float x)
+{
+    // |x| >= 2^23, inf and NaN are returned as-is: they have no fraction, and the int cast below would overflow
+    if (!(x > -8388608.f && x < 8388608.f)) return x;
+    int intValue = static_cast<int>(x);
+    // the cast truncates toward zero, so negative non-integers need one more step down
+    if (x < 0 && x != intValue) intValue -= 1;
+    return intValue;
+}
+
+float floorf(float x)
+{
+    return floor(x); // this file's floor already takes and returns float
+}
+
+float round(float x)
+{
+    if (x > 0) return floor(x + 0.5); // positive input: add a half, then drop the fraction
+    return ceil(x - 0.5);             // every other input: the mirrored operation
+}
+
+float roundf(float x)
+{
+    return round(x); // this file's round already takes and returns float
+}
+
+float ceilf(float x)
+{
+    return ceil(x); // this file's ceil already takes and returns float
+}
+
+float ceil(float x)
+{
+    // |x| >= 2^23, inf and NaN are returned as-is: they have no fraction, and the int cast below would overflow
+    if (!(x > -8388608.f && x < 8388608.f)) return x;
+    int intValue = static_cast<int>(x);
+    // the cast truncates toward zero, so positive non-integers need one step up (x + 1 could round up a whole unit)
+    if (x > 0 && x != intValue) intValue += 1;
+    return intValue;
+}
+
+float fmaxf(float x, float y)
+{
+    return y < x ? x : y; // y is returned whenever the comparison is false, including ties
+}
+
+float truncf(float x)
+{
+    // |x| >= 2^23, inf and NaN are returned as-is: they have no fraction, and the int cast below would overflow
+    if (!(x > -8388608.f && x < 8388608.f)) return x;
+    return static_cast<float>(static_cast<int>(x)); // the int conversion discards the fraction toward zero
+}
+
+float frac(float x)
+{
+    return x - floor(x); // distance above floor(x), so the result is non-negative for negative x as well
+}
+
+/*
+* ====================================================
+* trigonometric functions
+* ====================================================
+*/
+
+/*
+    modified from https://developer.download.nvidia.cn/cg/sin.html
+*/
+float sinf(float a)
+{
+    const int x = 0; // array slots standing in for the Cg .x/.y/.z/.w vector components
+    const int y = 1;
+    const int z = 2;
+    const int w = 3;
+
+    float c0[4] = {0.0, 0.5, 1.0, 0.0};
+    float c1[4] = {0.25, -9.0, 0.75, 0.159154943091}; // c1.w is 1 / (2 * pi)
+    float c2[4] = {24.9808039603, -24.9808039603, -60.1458091736, 60.1458091736};
+    float c3[4] = {85.4537887573, -85.4537887573, -64.9393539429, 64.9393539429};
+    float c4[4] = {19.7392082214, -19.7392082214, -1.0, 1.0};
+    float r0[3], r1[3], r2[3];
+
+    // r1.x = c1.w * a - c1.x   (a measured in full periods, minus a quarter period)
+    r1[x] = c1[w] * a - c1[x];
+    // r1.y  = frac( r1.x );   (position inside the current period)
+    r1[y] = frac(r1[x]);
+    // r2.x  = (float) ( r1.y < c1.x );   (1.0 when below 0.25, else 0.0)
+    r2[x] = (float)(r1[y] < c1[x]);
+    // r2.yz = (float2) ( r1.yy >= c1.yz );   (compared against -9.0 and 0.75)
+    r2[y] = (float)(r1[y] >= c1[y]);
+    r2[z] = (float)(r1[y] >= c1[z]);
+    // r2.y  = dot( r2, c4.zwz );   (= r2.y - r2.x - r2.z, i.e. 1.0 only when neither outer flag is set)
+    r2[y] = r2[x] * c4[z] + r2[y] * c4[w] + r2[z] * c4[z];
+
+    // r0 = c0.xyz - r1.yyy   (offsets from 0.0, 0.5 and 1.0)
+    r0[x] = c0[x] - r1[y];
+    r0[y] = c0[y] - r1[y];
+    r0[z] = c0[z] - r1[y];
+
+    // r0 = r0 * r0   (the polynomials below are evaluated in the squared offsets)
+    r0[x] = r0[x] * r0[x];
+    r0[y] = r0[y] * r0[y];
+    r0[z] = r0[z] * r0[z];
+
+    // r1 = c2.xyx * r0 + c2.zwz   (first Horner step; the .y lane uses the negated coefficients)
+    r1[x] = c2[x] * r0[x] + c2[z];
+    r1[y] = c2[y] * r0[y] + c2[w];
+    r1[z] = c2[x] * r0[z] + c2[z];
+
+    // r1 = r1 * r0 + c3.xyx
+    r1[x] = r1[x] * r0[x] + c3[x];
+    r1[y] = r1[y] * r0[y] + c3[y];
+    r1[z] = r1[z] * r0[z] + c3[x];
+
+    // r1 = r1 * r0 + c3.zwz
+    r1[x] = r1[x] * r0[x] + c3[z];
+    r1[y] = r1[y] * r0[y] + c3[w];
+    r1[z] = r1[z] * r0[z] + c3[z];
+
+    // r1 = r1 * r0 + c4.xyx
+    r1[x] = r1[x] * r0[x] + c4[x];
+    r1[y] = r1[y] * r0[y] + c4[y];
+    r1[z] = r1[z] * r0[z] + c4[x];
+
+    // r1 = r1 * r0 + c4.zwz
+    r1[x] = r1[x] * r0[x] + c4[z];
+    r1[y] = r1[y] * r0[y] + c4[w];
+    r1[z] = r1[z] * r0[z] + c4[z];
+
+    // r0.x = dot(r1, -r2)   (r2 holds the 0/1 range flags, so it picks which lane contributes)
+    r0[x] = -(r1[x] * r2[x] + r1[y] * r2[y] + r1[z] * r2[z]);
+
+    return r0[x];
+}
+
+float cosf(float x)
+{
+    return sinf(PI_2 + x); // cos(x) = sin(x + pi/2)
+}
+
+float tanf(float x)
+{
+    return sinf(x) / cosf(x); // tan = sin / cos; there is no guard for cosf(x) == 0
+}
+
+/* copy from https://developer.download.nvidia.cn/cg/asin.html */
+float asinf(float x)
+{
+    float negate = float(x < 0); // 1.0 for negative input, 0.0 otherwise
+    x = fabs(x);                 // the approximation itself is evaluated on |x|
+    float ret = -0.0187293;      // cubic in |x|, built up step by step (Horner form)
+    ret *= x;
+    ret += 0.0742610;
+    ret *= x;
+    ret -= 0.2121144;
+    ret *= x;
+    ret += 1.5707288;
+    ret = PI * 0.5 - sqrt(1.0 - x) * ret; // pi/2 minus sqrt(1 - |x|) * polynomial
+    return ret - 2 * negate * ret;        // ret for x >= 0, -ret for x < 0
+}
+
+/* copy from https://developer.download.nvidia.cn/cg/acos.html */
+float acosf(float x)
+{
+    float negate = float(x < 0); // 1.0 for negative input, 0.0 otherwise
+    x = fabs(x);                 // the approximation itself is evaluated on |x|
+    float ret = -0.0187293;      // cubic in |x|, built up step by step (Horner form)
+    ret = ret * x;
+    ret = ret + 0.0742610;
+    ret = ret * x;
+    ret = ret - 0.2121144;
+    ret = ret * x;
+    ret = ret + 1.5707288;
+    ret = ret * sqrt(1.0 - x);    // sqrt(1 - |x|) * polynomial
+    ret = ret - 2 * negate * ret; // sign flipped when the input was negative
+    return negate * PI + ret;     // so negative input yields PI minus the value for |x|
+}
+
+/* copy from https://developer.download.nvidia.cn/cg/atan.html */
+float atanf(float a)
+{
+    if (a < 0) // odd symmetry: atan(-a) = -atan(a)
+    {
+        return -atanf(-a);
+    }
+    if (a > 1) // fold large arguments into [0, 1]: atan(a) = pi/2 - atan(1/a)
+    {
+        return PI_2 - atanf(1 / a);
+    }
+    float s = a * a; // the polynomial below is evaluated in a^2
+    float r = 0.0027856871020048857;
+    // Horner steps; only three literals carry an f suffix, the other steps are computed in double and narrowed
+    r = r * s - 0.015866000205278397;
+    r = r * s + 0.042472220957279205;
+    r = r * s - 0.07497530430555344f;
+    r = r * s + 0.10644879937171936;
+    r = r * s - 0.14207030832767487;
+    r = r * s + 0.19993454217910767f;
+    r = r * s - 0.33333146572113037f;
+    r = r * s;
+    return r * a + a; // a * (1 + s * polynomial(s))
+}
+
+float atan2f(float y, float x)
+{
+    // y == 0: the point lies on the x axis. The undefined origin case is
+    // reported as 0 instead of an error, like any other non-negative x.
+    if (y == 0)
+    {
+        return x >= 0 ? 0 : PI;
+    }
+
+    // x == 0: the point lies on the y axis, a quarter turn signed like y.
+    if (x == 0)
+    {
+        return copysignf(PI_2, y);
+    }
+
+    // Off-axis points: one branch per quadrant, each calling atanf on the
+    // ratio of magnitudes and then mirroring/shifting the angle.
+    const bool right = x > 0;
+    const bool upper = y > 0;
+    if (right && upper)
+    {
+        return atanf(y / x);
+    }
+    if (upper && x < 0)
+    {
+        return PI - atanf(y / -x);
+    }
+    if (right && y < 0)
+    {
+        return -atanf(-y / x);
+    }
+    return -PI + atanf(-y / -x);
+}
+
+float tanhf(float v)
+{
+    // outside (-8, 8) the result is taken as exactly +-1
+    if (v <= -8 || v >= 8) return copysignf(1, v);
+
+    // tanh(v) = (e^(2v) - 1) / (e^(2v) + 1)
+    const float e2v = expf(2 * v);
+    return (e2v - 1) / (e2v + 1);
+}
+
+/*
+* ====================================================
+* power functions
+* ====================================================
+*/
+
+float sqrtf(float x)
+{
+    return powf(x, 0.5); // sqrt(x) expressed as x^0.5 through this file's powf
+}
+
+float sqrt(float x)
+{
+    return sqrtf(x); // this file's sqrt takes and returns float, so it simply forwards
+}
+
+float powf(float x, float y)
+{
+    return y == 0 ? 1.0f : expf(y * logf(x)); // x^0 is 1 for every x (0 * logf(0) would give NaN); otherwise x^y = e^(y * ln x)
+}
+
+/*
+* ====================================================
+* exponential and logarithm functions
+* ====================================================
+*/
+
+/* copied and modified from https://zhuanlan.zhihu.com/p/541466411 */
+float logf(float x)
+{
+    static const float
+    ln2_hi
+    = 6.93147180369123816490e-01,        /* 3fe62e42 fee00000 (hex = bits of the double literal; the float constant is a rounded copy) */
+    ln2_lo = 1.90821492927058770002e-10, /* 3dea39ef 35793c76 (double bits) */
+    two25 = 3.3554432e+07,               /* 2**25 */
+    Lg1 = 6.666666666666735130e-01, /* 3FE55555 55555593 (double bits, as for all Lg* below) */
+    Lg2 = 3.999999999940941908e-01, /* 3FD99999 9997FA04 */
+    Lg3 = 2.857142874366239149e-01, /* 3FD24924 94229359 */
+    Lg4 = 2.222219843214978396e-01, /* 3FCC71C5 1D8E78AF */
+    Lg5 = 1.818357216161805012e-01, /* 3FC74664 96CB03DE */
+    Lg6 = 1.531383769920937332e-01, /* 3FC39A09 D078C69F */
+    Lg7 = 1.479819860511658591e-01; /* 3FC2F112 DF3E5244 */
+
+    static float zero = 0.0; /* divisor used to produce -inf and NaN below */
+    float f, s, z, R, w, t1, t2, dk;
+    short k, hx, i;
+    unsigned short lx;
+
+    hx = __HI(x); /* high 16 bits of x */
+    lx = __LO(x); /* low  16 bits of x */
+
+    k = 0;
+    if (hx < 0x0080)
+    {   /* x < 2**-126: zero, subnormal or negative */
+        if (((hx & 0x7fff) | lx) == 0)
+            return -two25 / zero;          /* log(+-0)=-inf */
+        if (hx < 0) return (x - x) / zero; /* log(-#) = NaN */
+        k -= 25;
+        x *= two25;   /* subnormal number, scale up x */
+        hx = __HI(x); /* high 16 bits of the scaled x */
+    }
+
+    if (hx >= 0x7f80) return x + x; /* exponent field all ones: inf or NaN */
+    k += (hx >> 7) - 127;           /* unbiased exponent */
+    hx &= 0x007f;                   /* keep only the mantissa bits held in the high half */
+    i = (hx + 0x4b) & 0x0080;
+    __HI(x) = hx | (i ^ 0x3f80); /* normalize x or x/2 */
+    k += (i >> 7);
+    f = x - 1.0f;
+
+    s = f / (2.0f + f);
+    dk = (float)k;
+    z = s * s;
+    w = z * z;
+    t1 = w * (Lg2 + w * (Lg4 + w * Lg6));             /* even-indexed Lg terms */
+    t2 = z * (Lg1 + w * (Lg3 + w * (Lg5 + w * Lg7))); /* odd-indexed Lg terms */
+    R = t2 + t1;
+    if (k == 0)
+        return f - s * (f - R);
+    else
+        return dk * ln2_hi - ((s * (f - R) - dk * ln2_lo) - f); /* adds k * ln2, split into hi and lo parts */
+}
+
+/* copy from https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff */
+float expf(float a)
+{
+    if (a < 0)
+    {
+        float tmp = expf(-a);
+        // negative arguments are evaluated as 1 / exp(-a), so the code below only sees a >= 0
+        float ret = 1 / tmp;
+
+        return ret;
+    }
+    float f, r, j;
+    int i;
+
+    // exp(a) = 2**i * exp(f); i = rintf (a / log(2))
+    j = 1.442695f * a;
+    j = round(j) + 12582912.f; // There is a bug, and the program lives on it. NOTE(review): the note does not say which bug - confirm before touching
+    j = j - 12582912.f;
+    // upstream form, kept for reference: j = fmaf(1.442695f, a, 12582912.f) - 12582912.f; // 0x1.715476p0, 0x1.8p23
+    f = fmaf(j, -6.93145752e-1f, a); // -0x1.62e400p-1  // log_2_hi
+    f = fmaf(j, -1.42860677e-6f, f); // -0x1.7f7d1cp-20 // log_2_lo
+    i = (int)j;
+    // approximate r = exp(f) on interval [-log(2)/2, +log(2)/2]
+    r = 1.37805939e-3f;             // 0x1.694000p-10
+    r = fmaf(r, f, 8.37312452e-3f); // 0x1.125edcp-7
+    r = fmaf(r, f, 4.16695364e-2f); // 0x1.555b5ap-5
+    r = fmaf(r, f, 1.66664720e-1f); // 0x1.555450p-3
+    r = fmaf(r, f, 4.99999851e-1f); // 0x1.fffff6p-2
+    r = fmaf(r, f, 1.00000000e+0f); // 0x1.000000p+0
+    r = fmaf(r, f, 1.00000000e+0f); // 0x1.000000p+0
+
+    float s, t;
+    uint32_t ia;
+    // exp(a) = 2**i * r, applied as two multiplications by floats built directly from bit patterns
+    ia = (i > 0) ? 0 : 0x83000000u;
+    s = uint32_as_float(0x7f000000u + ia);
+    t = uint32_as_float(((uint32_t)i << 23) - ia);
+    r = r * s;
+    r = r * t;
+
+    // handle special cases: severe overflow / underflow
+    if (fabsf(a) >= 104.0f) r = (a > 0) ? INFINITY : 0.0f;
+
+    return r;
+}
+
+float frexp(float x, int* y)
+{
+    *y = 0;
+    if (x == 0) return x; // zero has no normalised form: C frexp yields 0 with exponent 0
+
+    const int hx = __HI(x);                   // sign, exponent and top 7 mantissa bits
+    const int k = ((hx >> 7) & 0x00ff) - 127; // unbiased binary exponent
+    __HI(x) = (hx & 0x807f) | 0x3f80;         // keep sign and mantissa, force the exponent field to 127 => |x| in [1, 2)
+
+    *y = k + 1; // halving x below moves the mantissa from [1, 2) into [1/2, 1)
+    return x / 2;
+}
+
+float log(float x)
+{
+    return logf(x); // this file's log takes and returns float, so it simply forwards
+}
+
+float log10f(float x)
+{
+    static const float ln10 = 2.3025850929940456840179914546844; // natural logarithm of 10
+    return logf(x) / ln10;                                       // change of base: log10(x) = ln(x) / ln(10)
+}
+
+/*
+* ====================================================
+* probability functions
+* ====================================================
+*/
+
+/* copy from https://stackoverflow.com/questions/35148198/efficient-faithfully-rounded-implementation-of-error-function-erff */
+float erf(float a)
+{
+    float r, s, t, u;
+
+    t = fabsf(a); // |a|
+    s = a * a;    // a^2
+    if (t > 0.927734375f)
+    {   // 475/512
+        // maximum error 0.99527 ulp (figure quoted from the source; NOTE(review): it presumably assumes a fused fmaf - confirm)
+        r = fmaf(-1.72853470e-5f, t, 3.83197126e-4f); // -0x1.220000p-16,0x1.91cfb2p-12
+        u = fmaf(-3.88396438e-3f, t, 2.42546219e-2f); // -0x1.fd1438p-9, 0x1.8d6342p-6
+        r = fmaf(r, s, u);
+        r = fmaf(r, t, -1.06777877e-1f); // -0x1.b55cb8p-4
+        r = fmaf(r, t, -6.34846687e-1f); // -0x1.450aa0p-1
+        r = fmaf(r, t, -1.28717512e-1f); // -0x1.079d0cp-3
+        r = fmaf(r, t, -t);
+        r = 1.0f - expf(r);  // result for |a|
+        r = copysignf(r, a); // restore the sign of the input
+    }
+    else
+    {
+        // maximum error 0.98929 ulp (same caveat as the figure in the branch above)
+        r = -5.96761703e-4f;             // -0x1.38e000p-11
+        r = fmaf(r, s, 4.99119423e-3f);  //  0x1.471a58p-8
+        r = fmaf(r, s, -2.67681349e-2f); // -0x1.b691b2p-6
+        r = fmaf(r, s, 1.12819925e-1f);  //  0x1.ce1c44p-4
+        r = fmaf(r, s, -3.76125336e-1f); // -0x1.812700p-2
+        r = fmaf(r, s, 1.28379166e-1f);  //  0x1.06eba8p-3
+        r = fmaf(r, a, a);               // a * (1 + polynomial in a^2)
+    }
+    return r;
+}
+
+float erfcf(float x)
+{
+    return 1.0 - erf(x); // complementary error function, taken directly as 1 - erf(x)
+}
+
+/*
+* ====================================================
+* other functions
+* ====================================================
+*/
+
+int msb(unsigned int v)
+{
+    // De Bruijn lookup: maps a power of two to the index of its single set bit (msb(0) yields 0)
+    static const int pos[32] = {0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+                                31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
+    v |= v >> 1; // smear the highest set bit into every lower position ...
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v = (v >> 1) + 1; // ... then keep only that highest bit
+    // the multiplier is U, not UL: the product has to wrap at 32 bits, otherwise a 64-bit long pushes the index past pos[31]
+    return pos[(v * 0x077CB531U) >> 27];
+}
+
+float fmaf(float x, float y, float z)
+{
+    float tmp = x * y;   // written as a separate multiply, then an add: x * y + z in two steps
+    float ret = tmp + z; // NOTE(review): unlike C fmaf this is not written as one fused operation - confirm callers tolerate that
+    return ret;
+}
+
+float copysignf(float x, float y)
+{
+    return fabsf(x) * (y < 0 ? -1 : 1); // |x| with the sign of y; only y < 0 flips it, so y == +0 now gives a positive result
+}
+
+int round_mode = 0; // rounding mode read by nearbyintf(); 0 is FE_TONEAREST
+void fesetround(int mode)
+{
+    round_mode = mode; // stored as given, without validation
+}
+
+int fegetround()
+{
+    return round_mode; // the value last passed to fesetround(), or 0 if it was never called
+}
+
+float nearbyintf(float x)
+{
+    // |x| >= 2^23, inf and NaN are returned as-is: nothing to round, and the int cast below would overflow
+    if (!(x > -8388608.f && x < 8388608.f)) return x;
+    int intPart = static_cast<int>(x);   // x truncated toward zero
+    float floatPart = fabs(x - intPart); // magnitude of the discarded fraction
+    if (floatPart == 0) return x;        // already integral
+
+    if (x > 0)
+    {
+        if (round_mode == FE_DOWNWARD || round_mode == FE_TOWARDZERO)
+        {
+            return static_cast<float>(intPart);
+        }
+        if (round_mode == FE_UPWARD)
+        {
+            return static_cast<float>(intPart) + 1.0;
+        }
+        if (round_mode == FE_TONEAREST)
+        {
+            if (floatPart == 0.5) // exact tie: choose the even neighbour
+            {
+                return intPart % 2 == 0 ? static_cast<float>(intPart) : static_cast<float>(intPart) + 1;
+            }
+            return round(x);
+        }
+    }
+    if (x < 0)
+    {
+        if (round_mode == FE_UPWARD || round_mode == FE_TOWARDZERO)
+        {
+            return static_cast<float>(intPart);
+        }
+        if (round_mode == FE_DOWNWARD)
+        {
+            return static_cast<float>(intPart) - 1.0;
+        }
+        if (round_mode == FE_TONEAREST)
+        {
+            if (floatPart == 0.5) // exact tie: choose the even neighbour
+            {
+                return intPart % 2 == 0 ? static_cast<float>(intPart) : static_cast<float>(intPart) - 1;
+            }
+            return round(x);
+        }
+    }
+    return round(x); // unrecognised rounding mode: return a defined value instead of falling off the end
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // NCNN_SIMPLEMATH
diff --git a/src/simplemath.h b/src/simplemath.h
new file mode 100644
index 00000000000..fd7fa6964eb
--- /dev/null
+++ b/src/simplemath.h
@@ -0,0 +1,102 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef NCNN_SIMPLEMATH_H
+#define NCNN_SIMPLEMATH_H
+
+#include "platform.h"
+
+#if NCNN_SIMPLEMATH
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// Math routines declared when NCNN_SIMPLEMATH is enabled.
+// The floating-point routines are all single precision: even the unsuffixed
+// names (fabs, fmod, floor, round, ceil, sqrt, frexp, log, erf) are declared
+// here with float parameters and a float result.
+/*
+* ====================================================
+* discrete functions
+* ====================================================
+*/
+NCNN_EXPORT float fabs(float);
+NCNN_EXPORT float fabsf(float);
+NCNN_EXPORT float fmod(float, float);
+NCNN_EXPORT float floor(float);
+NCNN_EXPORT float floorf(float);
+NCNN_EXPORT float round(float);
+NCNN_EXPORT float roundf(float);
+NCNN_EXPORT float ceil(float);
+NCNN_EXPORT float ceilf(float);
+NCNN_EXPORT float fmaxf(float, float);
+NCNN_EXPORT float truncf(float);
+// NOTE(review): frac is not a standard C math name; presumably it returns the
+// fractional part of its argument - confirm against the definition.
+NCNN_EXPORT float frac(float);
+/*
+* ====================================================
+* trigonometric functions
+* ====================================================
+*/
+NCNN_EXPORT float sinf(float);
+NCNN_EXPORT float cosf(float);
+NCNN_EXPORT float tanf(float);
+NCNN_EXPORT float asinf(float);
+NCNN_EXPORT float acosf(float);
+NCNN_EXPORT float atanf(float);
+NCNN_EXPORT float atan2f(float, float);
+NCNN_EXPORT float tanhf(float);
+
+/*
+* ====================================================
+* power functions
+* ====================================================
+*/
+NCNN_EXPORT float sqrtf(float);
+NCNN_EXPORT float sqrt(float);
+NCNN_EXPORT float powf(float, float);
+
+/*
+* ====================================================
+* exponential and logarithm functions
+* ====================================================
+*/
+NCNN_EXPORT float expf(float);
+NCNN_EXPORT float frexp(float, int*);
+NCNN_EXPORT float logf(float);
+NCNN_EXPORT float log(float);
+NCNN_EXPORT float log10f(float);
+
+/*
+* ====================================================
+* probability functions
+* ====================================================
+*/
+NCNN_EXPORT float erf(float);
+NCNN_EXPORT float erfcf(float);
+
+/*
+* ====================================================
+* other functions
+* ====================================================
+*/
+// NOTE(review): msb is not a standard C name; presumably the position of the
+// most significant set bit - confirm against the definition.
+NCNN_EXPORT int msb(unsigned int);
+NCNN_EXPORT float fmaf(float, float, float);
+NCNN_EXPORT float copysignf(float, float);
+// fesetround is declared without a return value here, so callers receive no
+// status; fegetround reports the stored mode and nearbyintf rounds with it.
+NCNN_EXPORT void fesetround(int);
+NCNN_EXPORT int fegetround();
+NCNN_EXPORT float nearbyintf(float);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // NCNN_SIMPLEMATH
+
+#endif // NCNN_SIMPLEMATH_H
\ No newline at end of file
diff --git a/src/stb_image.h b/src/stb_image.h
index 8d9fc9c581f..1b4b337328e 100644
--- a/src/stb_image.h
+++ b/src/stb_image.h
@@ -589,7 +589,7 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
 #include <limits.h>
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h>  // ldexp, pow
+// <math.h> (ldexp, pow) is no longer included here; the including file must supply the math declarations
 #endif
 
 #ifndef STBI_NO_STDIO
diff --git a/src/stb_image_write.h b/src/stb_image_write.h
index e4b32ed1bc3..aa397c09d53 100644
--- a/src/stb_image_write.h
+++ b/src/stb_image_write.h
@@ -214,7 +214,7 @@ STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
 #include <stdarg.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
+
 
 #if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
 // ok
diff --git a/tests/test_mat_pixel_affine.cpp b/tests/test_mat_pixel_affine.cpp
index 817b0f57a3c..94ea366f9e7 100644
--- a/tests/test_mat_pixel_affine.cpp
+++ b/tests/test_mat_pixel_affine.cpp
@@ -15,7 +15,6 @@
 #include "mat.h"
 #include "prng.h"
 
-#include <math.h>
 #include <string.h>
 
 static struct prng_rand_t g_prng_rand_state;
diff --git a/tests/test_mat_pixel_resize.cpp b/tests/test_mat_pixel_resize.cpp
index 725c30e0bdf..38b8c5ab356 100644
--- a/tests/test_mat_pixel_resize.cpp
+++ b/tests/test_mat_pixel_resize.cpp
@@ -15,7 +15,6 @@
 #include "mat.h"
 #include "prng.h"
 
-#include <math.h>
 #include <string.h>
 
 static struct prng_rand_t g_prng_rand_state;
diff --git a/tests/testutil.h b/tests/testutil.h
index b879fa527fb..0794bdd463d 100644
--- a/tests/testutil.h
+++ b/tests/testutil.h
@@ -20,7 +20,6 @@
 #include "mat.h"
 #include "prng.h"
 
-#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 
diff --git a/toolchains/aarch64-linux-gnu-c.toolchain.cmake b/toolchains/aarch64-linux-gnu-c.toolchain.cmake
index 07b39de87b6..cde92c07070 100644
--- a/toolchains/aarch64-linux-gnu-c.toolchain.cmake
+++ b/toolchains/aarch64-linux-gnu-c.toolchain.cmake
@@ -11,7 +11,7 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
 set(CMAKE_C_FLAGS "-march=armv8-a")
 set(CMAKE_CXX_FLAGS "-march=armv8-a")
 
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -nodefaultlibs -fno-builtin -fno-stack-protector -nostdinc++ -lc")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -nodefaultlibs -fno-builtin -fno-stack-protector -nostdinc++ -mno-outline-atomics -lc")
 
 # cache flags
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")