diff --git a/ffmpeg/JNI/dav1d/.gitignore b/ffmpeg/JNI/dav1d/.gitignore new file mode 100644 index 000000000..2bbd7c48a --- /dev/null +++ b/ffmpeg/JNI/dav1d/.gitignore @@ -0,0 +1,8 @@ +/build* +/Session.vim +[._]*.swp +*~ +tags +.DS_Store +/tests/dav1d-test-data +*.snap diff --git a/ffmpeg/JNI/dav1d/.gitlab-ci.yml b/ffmpeg/JNI/dav1d/.gitlab-ci.yml new file mode 100644 index 000000000..c921b6a12 --- /dev/null +++ b/ffmpeg/JNI/dav1d/.gitlab-ci.yml @@ -0,0 +1,579 @@ +stages: + - style + - build + - test + +.debian-amd64-common: + image: registry.videolan.org/dav1d-debian-unstable:20200602183013 + stage: build + tags: + - docker + - amd64 + +.debian-llvm-mingw-common: + image: registry.videolan.org/vlc-debian-llvm-mingw:20190218133533 + stage: build + tags: + - docker + - amd64 + +.debian-aarch64-common: + image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017 + stage: build + tags: + - docker + - aarch64 + +.debian-armv7-common: + image: registry.videolan.org/dav1d-debian-unstable-armv7:20190202101732 + stage: build + tags: + - docker + - armv7 + +.debian-ppc64le-common: + image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20190606105121 + stage: build + tags: + - docker + - ppc64le + +.ubuntu-common: + image: registry.videolan.org/dav1d-ubuntu-bionic:20200121182340 + stage: build + tags: + - docker + - amd64 + +.android-common: + image: registry.videolan.org/vlc-debian-android:20200323093226 + stage: build + tags: + - docker + - amd64 + + +style-check: + extends: .debian-amd64-common + stage: style + script: + - git grep -I -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && echo "Trailing whitespace" && exit 1 + - git grep -I -n -i -e 'david' --and --not -e 'copyright' -- . ':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && echo "Misspelled dav1d" && exit 1 + - git grep -I -l -z "" -- . 
':(exclude)*/compat/*' | while IFS= read -r -d '' i; do + if [ -n "$(tail -c 1 "$i")" ]; then + echo "No newline at end of $i"; + exit 1; + fi; + done + - git remote rm upstream 2> /dev/null || true + - git remote add upstream https://code.videolan.org/videolan/dav1d.git + - git fetch -q upstream master + - for i in $(git rev-list HEAD ^upstream/master); do + echo "Checking commit message of $i"; + msg="$(git log --format=%B -n 1 $i)"; + if [ -n "$(echo "$msg" | awk "NR==2")" ]; then + echo "Malformed commit message in $i, second line must be empty"; + exit 1; + fi; + if echo "$msg" | head -1 | grep -q '\.$'; then + echo "Malformed commit message in $i, trailing period in subject line"; + exit 1; + fi; + done + + +build-debian: + extends: .debian-amd64-common + tags: + - docker + - avx2 + - amd64 + script: + - meson build --buildtype release + --werror + - ninja -C build + - cd build && meson test -v + artifacts: + paths: + - build/ + expire_in: 1 day + +build-debian-static: + extends: .debian-amd64-common + script: + - meson build --buildtype release + --default-library static + --werror + - ninja -C build + - cd build && meson test -v + - nm -A -g src/libdav1d.a | grep " [ABCDGRST] " | (! 
grep -v " _*dav1d_") + +build-debian32: + extends: .debian-amd64-common + script: + - meson build --buildtype release + --werror + --cross-file package/crossfiles/i686-linux32.meson + - ninja -C build + - cd build && meson test -v + artifacts: + paths: + - build/ + expire_in: 1 day + +build-debian-examples: + extends: .debian-amd64-common + script: + - meson build --buildtype release + --werror + -Denable_examples=true + - ninja -C build + +build-win32: + extends: .debian-amd64-common + script: + - wineserver -p && wine wineboot + - meson build --buildtype release + --werror + --libdir lib + --prefix "$(pwd)/build/dav1d_install" + --cross-file package/crossfiles/i686-w64-mingw32.meson + -Ddefault_library=both + - ninja -C build + - ninja -C build install + - cd build && meson test -v + - i686-w64-mingw32-nm -A -g src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_") + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" + paths: + - build/dav1d_install/ + expire_in: 1 week + +build-win32-unaligned-stack: + extends: .debian-llvm-mingw-common + script: + - wineserver -p && wine wineboot + - meson build --buildtype release + --werror + --cross-file package/crossfiles/i686-w64-mingw32.meson + -Dstack_alignment=4 + - ninja -C build + - cd build && meson test -v + +build-win64: + extends: .debian-amd64-common + script: + - wineserver -p && wine wineboot + - meson build --buildtype release + --werror + --libdir lib + --prefix "$(pwd)/build/dav1d_install" + --cross-file package/crossfiles/x86_64-w64-mingw32.meson + -Ddefault_library=both + - ninja -C build + - ninja -C build install + - cd build && meson test -v + - x86_64-w64-mingw32-nm -A -g src/libdav1d.a | grep " [ABCDGRST] " | (! 
grep -E -v " \.| _*dav1d_") + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" + paths: + - build/dav1d_install/ + expire_in: 1 week + +build-win-arm32: + extends: .debian-llvm-mingw-common + script: + - meson build --buildtype release + --werror + --libdir lib + --prefix "$(pwd)/build/dav1d_install" + --cross-file /opt/crossfiles/armv7-w64-mingw32.meson + -Ddefault_library=both + - ninja -C build + - armv7-w64-mingw32-nm -A -g build/src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_") + +build-win-arm64: + extends: .debian-llvm-mingw-common + script: + - meson build --buildtype release + --werror + --libdir lib + --prefix "$(pwd)/build/dav1d_install" + --cross-file /opt/crossfiles/aarch64-w64-mingw32.meson + -Ddefault_library=both + - ninja -C build + - ninja -C build install + - aarch64-w64-mingw32-nm -A -g build/src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_") + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" + paths: + - build/dav1d_install/ + expire_in: 1 week + +.build-android-common: + extends: .android-common + script: + - meson build --buildtype release + --werror + --libdir lib + --prefix "$(pwd)/build/dav1d_install" + --cross-file $CROSSFILE + -Ddefault_library=both + - ninja -C build + - ninja -C build install + +build-android-armv7: + extends: .build-android-common + variables: + CROSSFILE: package/crossfiles/arm-android.meson + except: + - tags + +build-android-aarch64: + extends: .build-android-common + variables: + CROSSFILE: package/crossfiles/aarch64-android.meson + except: + - tags + +build-android-armv7-release: + extends: build-android-armv7 + except: + only: + refs: + - tags@videolan/dav1d + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" + paths: + - build/dav1d_install/ + expire_in: 1 week + +build-android-aarch64-release: + extends: build-android-aarch64 + except: + only: + refs: + - tags@videolan/dav1d + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" + paths: + - 
build/dav1d_install/ + expire_in: 1 week + +build-debian-aarch64: + extends: .debian-aarch64-common + script: + - meson build --buildtype debugoptimized + --werror + - ninja -C build + - cd build && meson test -v + +build-debian-aarch64-clang-5: + extends: .debian-aarch64-common + variables: + CC: clang-5.0 + CFLAGS: '-integrated-as' + script: + - meson build --buildtype release + - ninja -C build + - cd build && meson test -v + +build-macos: + stage: build + tags: + - macos + script: + - meson build --buildtype release + -Ddefault_library=both + --werror + - ninja -C build + - cd build && meson test -v + +build-debian-werror: + extends: .debian-aarch64-common + variables: + CC: clang-7 + script: + - meson build --buildtype debug + --werror + - ninja -C build + +build-debian-armv7: + extends: .debian-armv7-common + script: + - linux32 meson build --buildtype debugoptimized + --werror + - ninja -C build + - cd build && meson test -v + +build-debian-armv7-clang-5: + extends: .debian-armv7-common + variables: + CC: clang-5.0 + CFLAGS: '-integrated-as' + script: + - linux32 meson build --buildtype release + - ninja -C build + - cd build && meson test -v + +build-ubuntu-snap: + extends: .ubuntu-common + script: + - cd package/snap && snapcraft snap + - | + if [ "$CI_PROJECT_NAMESPACE" = "videolan" ]; then + echo $SNAP_LOGIN | base64 --decode | snapcraft login --with - + snapcraft push dav1d_*.snap --release edge + snapcraft logout + fi + artifacts: + name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG" + paths: + - package/snap/dav1d_*.snap + expire_in: 1 week + allow_failure: true + +build-debian-ppc64le: + extends: .debian-ppc64le-common + script: + - meson build --buildtype release + --werror + - ninja -C build + - cd build && meson test -v + + +.test-common: + stage: test + cache: + key: testdata.git-20190215 + paths: + - cache/dav1d-test-data.git/ + before_script: + - test -d cache || mkdir cache + - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git 
fetch --refmap=refs/heads/master:refs/heads/master origin master + - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git + - git clone cache/dav1d-test-data.git tests/dav1d-test-data + dependencies: [] + +.test-asm-common: + extends: + - .debian-amd64-common + - .test-common + tags: + - docker + - amd64 + - avx2 + script: + - meson configure build -Dtestdata_tests=true + - cd build + - exit_code=0 + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask 0" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse2" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask ssse3" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse41" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask avx2" || exit_code=$((exit_code + $?)) + - if [ $exit_code -ne 0 ]; then exit $exit_code; fi + +test-debian: + extends: + - .debian-amd64-common + - .test-common + needs: ["build-debian"] + script: + - meson build --buildtype release + -Dtestdata_tests=true + -Dlogging=false + -Db_coverage=true + - ninja -C build + - cd build && time meson test -v + - ninja coverage-html + - mv meson-logs/coveragereport ../coverage + - ninja coverage-xml + - grep -Eo 'line-rate="[^"]+"' meson-logs/coverage.xml | head -n 1 | + grep -Eo '[0-9.]+' | awk '{ print "coverage:", $1 * 100 } ' + coverage: '/^coverage: (\d+.\d+)$/' + artifacts: + expose_as: 'Coverage HTML report' + paths: + - coverage/ + reports: + cobertura: build/meson-logs/coverage.xml + +test-debian-asm: + extends: + - .test-asm-common + needs: ["build-debian"] + dependencies: 
["build-debian"] + +test-debian32-asm: + extends: + - .test-asm-common + needs: ["build-debian32"] + dependencies: ["build-debian32"] + +test-debian-mt: + extends: + - .debian-amd64-common + - .test-common + needs: ["build-debian"] + dependencies: ["build-debian"] + script: + - meson configure build -Dtestdata_tests=true + - cd build + - exit_code=0 + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 1 --framethreads 2" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 1" || exit_code=$((exit_code + $?)) + - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 2" || exit_code=$((exit_code + $?)) + - if [ $exit_code -ne 0 ]; then exit $exit_code; fi + +test-debian-unaligned-stack: + extends: + - .debian-amd64-common + - .test-common + needs: ["build-debian"] + tags: + - docker + - avx2 + - amd64 + script: + - meson build --buildtype release + -Dtestdata_tests=true + -Dlogging=false + -Dstack_alignment=16 + - ninja -C build + - cd build && time meson test -v + +test-debian-asan: + extends: + - .debian-amd64-common + - .test-common + needs: ["build-debian"] + variables: + ASAN_OPTIONS: 'detect_leaks=0' + script: + - meson build --buildtype debugoptimized + -Dtestdata_tests=true + -Dlogging=false + -Db_sanitize=address + -Denable_asm=false + - ninja -C build + - cd build && time meson test -v --setup=sanitizer + +test-debian-msan: + extends: + - .debian-amd64-common + - .test-common + needs: ["build-debian"] + variables: + MSAN_OPTIONS: 'exitcode=1' + CC: clang + script: + - meson build --buildtype debugoptimized + -Dtestdata_tests=true + -Dlogging=false + -Db_sanitize=memory + -Db_lundef=false + -Denable_asm=false + - ninja -C build + - cd build && time meson test -v --setup=sanitizer + +test-debian-ubsan: + extends: + - 
.debian-amd64-common + - .test-common + needs: ["build-debian"] + variables: + UBSAN_OPTIONS: 'print_stacktrace=1:halt_on_error=1' + CC: clang + script: + - meson build --buildtype debugoptimized + -Dtestdata_tests=true + -Dlogging=false + -Db_sanitize=undefined + -Db_lundef=false + -Denable_asm=false + - ninja -C build + - cd build && time meson test -v --setup=sanitizer + +test-win64: + extends: + - .debian-amd64-common + - .test-common + needs: ["build-win64"] + tags: + - docker + - avx2 + - amd64 + script: + - wineserver -p && wine wineboot + - meson build --buildtype release + -Dtestdata_tests=true + -Dlogging=false + --cross-file package/crossfiles/x86_64-w64-mingw32.meson + - ninja -C build + - cd build && time meson test -v + +test-debian-aarch64: + extends: + - .debian-aarch64-common + - .test-common + needs: ["build-debian-aarch64"] + script: + - meson build --buildtype release + -Dtestdata_tests=true + -Dlogging=false + - ninja -C build + - cd build && time meson test -v + +test-debian-ppc64le: + extends: + - .debian-ppc64le-common + - .test-common + needs: ["build-debian-ppc64le"] + script: + - meson build --buildtype release + -Dtestdata_tests=true + -Dlogging=false + - ninja -C build + - cd build && time meson test -v + +test-debian-armv7-clang-5: + extends: + - .debian-armv7-common + - .test-common + needs: ["build-debian-armv7-clang-5"] + variables: + CC: clang-5.0 + CFLAGS: '-integrated-as' + script: + - linux32 meson build --buildtype release + -Dtestdata_tests=true + -Dlogging=false + - ninja -C build + - cd build && time meson test -v + + +.pages-common: + extends: .debian-amd64-common + script: + - meson build --buildtype release + --werror + - ninja -C build doc/html + - mv build/doc/html public + artifacts: + paths: + - public + +build-pages: + extends: .pages-common + except: + refs: + - master + +pages: + extends: .pages-common + only: + refs: + - master + changes: + - include/dav1d/* diff --git a/ffmpeg/JNI/dav1d/NEWS 
b/ffmpeg/JNI/dav1d/NEWS index 46695fd7e..1294dc52c 100644 --- a/ffmpeg/JNI/dav1d/NEWS +++ b/ffmpeg/JNI/dav1d/NEWS @@ -1,3 +1,33 @@ +Changes for 0.7.1 'Frigatebird': +------------------------------ + +0.7.1 is a minor update on 0.7.0: + - ARM32 NEON optimizations for itxfm, which can give up to 28% speedup, and MSAC + - SSE2 optimizations for prep_bilin and prep_8tap + - AVX2 optimizations for MC scaled + - Fix a clamping issue in motion vector projection + - Fix an issue on some specific Haswell CPU on ipred_z AVX2 functions + - Improvements on the dav1dplay utility player to support resizing + + +Changes for 0.7.0 'Frigatebird': +------------------------------ + +0.7.0 is a major release for dav1d: + - Faster refmv implementation gaining up to 12% speed while -25% of RAM (Single Thread) + - 10b/12b ARM64 optimizations are mostly complete: + - ipred (paeth, smooth, dc, pal, filter, cfl) + - itxfm (only 10b) + - AVX2/SSSE3 for non-4:2:0 film grain and for mc.resize + - AVX2 for cfl4:4:4 + - AVX-512 CDEF filter + - ARM64 8b improvements for cfl_ac and itxfm + - ARM64 implementation for emu_edge in 8b/10b/12b + - ARM32 implementation for emu_edge in 8b + - Improvements on the dav1dplay utility player to support 10 bit, + non-4:2:0 pixel formats and film grain on the GPU + + Changes for 0.6.0 'Gyrfalcon': ------------------------------ diff --git a/ffmpeg/JNI/dav1d/builddir/.ninja_deps b/ffmpeg/JNI/dav1d/builddir/.ninja_deps deleted file mode 100644 index d6da49ae7..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/.ninja_deps and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/.ninja_log b/ffmpeg/JNI/dav1d/builddir/.ninja_log deleted file mode 100644 index 5ca13a284..000000000 --- a/ffmpeg/JNI/dav1d/builddir/.ninja_log +++ /dev/null @@ -1,107 +0,0 @@ -# ninja log v5 -1 134 1584948901715376510 src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o 63af13494d15f9e -1 199 1584948901779084976 src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o 57e06d2e8b8a986d -1 
212 1584948901773226508 include/vcs_version.h 720dab2031fdd723 -1 234 1584948901815039592 src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o 4ac5d155afbcb60a -135 244 1584948901825083166 src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o 54afce5a6ebd5420 -234 307 1584948901888586245 src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o 2a411c45413fefeb -244 320 1584948901901022470 src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o 6692e2267b8bbf1e -212 379 1584948901960473837 src/25a6634@@dav1d_entrypoint@sta/lib.c.o 199037861e6ad90f -199 493 1584948902074129009 src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o fa6108b1cab13f0d -379 498 1584948902078574214 src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o c6905b7df8c0b378 -307 596 1584948902176437182 src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o 527c79441fca8395 -498 606 1584948902186764947 src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o 6e5791172d4a6d12 -493 703 1584948902284140586 src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o 506378dcde1797c7 -703 758 1584948902340028880 src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o 45b1bd1750ea59dc -758 797 1584948902378937784 src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o 54bfa84920242e6d -797 838 1584948902419225754 src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o cb71905026629629 -838 874 1584948902456382121 src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o fc7f31cdf6ac6866 -874 932 1584948902513985277 src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o 97f89106ddf15b2e -932 968 1584948902550096862 src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o a78d4f76c4cd24bf -320 1068 1584948902647783370 src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o 8e06c30e33d45a74 -596 1161 1584948902741902452 src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o c2e88fb1780dcb1b -1161 1261 1584948902842725034 src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o 52071763b5fb7c8c -606 1263 1584948902843761879 
src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o 8620471446f1a24c -1263 1341 1584948902922273122 src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o c8ce7a0514dd6f4a -1068 1367 1584948902948149672 src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o dd63d26939d67b49 -1367 1472 1584948903053372842 src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o 8ea862ebcecc14a2 -1261 1485 1584948903064012494 src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o 2884ac8fec1268dc -968 1584 1584948903163358420 src/25a6634@@dav1d@sta/cdf.c.o 9fa40041e89c2397 -1472 1650 1584948903231730373 src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o 741537d11aadd8f6 -1341 1669 1584948903248710277 src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o 41382fed1ab53767 -1584 1705 1584948903286615965 src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o 1f7e42678bb55bf0 -1705 1822 1584948903392510393 src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o b74a0dcb55ddba0a -1650 1859 1584948903440845056 src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o f03ad3a8e9e5d87a -1859 1894 1584948903476364274 src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o b41dcd9bd9e1e2f5 -1894 1927 1584948903508419851 src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o 25a23a02ca5267eb -1927 1960 1584948903542059401 src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o 6246bd8d9842a39d -1960 1994 1584948903576063730 src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o 9ce4054292d62405 -1994 2029 1584948903610462476 src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o 23904c1926deefb4 -2029 2064 1584948903645956346 src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o 864659b4941d840d -2064 2119 1584948903699972440 src/25a6634@@dav1d@sta/data.c.o c97c3991b5c52447 -2119 2153 1584948903734869534 src/25a6634@@dav1d@sta/cpu.c.o c319040c90560143 -2153 2184 1584948903766132440 src/25a6634@@dav1d@sta/dequant_tables.c.o 98b7f39779894c3f -1485 2198 1584948903777209672 
src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o a49e54bdd88ad78f -2198 2252 1584948903833839133 src/25a6634@@dav1d@sta/intra_edge.c.o 921abe41ade11d4a -2252 2334 1584948903915674470 src/25a6634@@dav1d@sta/getbits.c.o 7edd9c77eb0463eb -1669 2336 1584948903915988522 src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o d1b9576ced0ee792 -2334 2393 1584948903973850148 src/25a6634@@dav1d@sta/log.c.o ed43381c58dc7191 -1822 2432 1584948904012019086 src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o d5764b68221e0015 -2393 2502 1584948904083057236 src/25a6634@@dav1d@sta/msac.c.o 683ef2041609e4f7 -2336 2544 1584948904125387224 src/25a6634@@dav1d@sta/lf_mask.c.o 6326820028ef07d9 -2502 2593 1584948904174567562 src/25a6634@@dav1d@sta/picture.c.o 8073857895928851 -2593 2638 1584948904220475362 src/25a6634@@dav1d@sta/ref.c.o 1357cb8fe319538d -2544 2690 1584948904271216502 src/25a6634@@dav1d@sta/qm.c.o 3f6359d4c064cc1b -2690 2726 1584948904307960763 src/25a6634@@dav1d@sta/scan.c.o 9dd2404340150d4d -2726 2766 1584948904348759953 src/25a6634@@dav1d@sta/tables.c.o 5e5f4d508b28c29b -2766 2841 1584948904422572982 src/25a6634@@dav1d@sta/warpmv.c.o 8fbe369b6cc02b72 -2432 2855 1584948904433868019 src/25a6634@@dav1d@sta/obu.c.o 532939c420dedc6e -2841 2872 1584948904454410064 src/25a6634@@dav1d@sta/arm_cpu.c.o 1dda6294bf784f6a -2872 2908 1584948904491245148 src/25a6634@@dav1d@sta/arm_64_ipred.S.o 1bb8ad968f2aa0b1 -2908 2946 1584948904529304426 src/25a6634@@dav1d@sta/arm_64_cdef.S.o bc4f68ed4ef1fa4a -2855 2955 1584948904536297638 src/25a6634@@dav1d@sta/wedge.c.o f558f98ac1f399b5 -2946 2990 1584948904572818358 src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o 3a0ad77f72e591f -2990 3036 1584948904618300703 src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o 6aeb6615068e3bbb -2955 3048 1584948904630063973 src/25a6634@@dav1d@sta/arm_64_itx.S.o 21e1d3bc160da695 -3048 3083 1584948904665979461 src/25a6634@@dav1d@sta/arm_64_msac.S.o 353617e90a5feb52 -3036 3109 1584948904691642850 
src/25a6634@@dav1d@sta/arm_64_mc.S.o 12bdcabecee75a2a -3083 3138 1584948904719926233 tools/f9d35d4@@dav1d_output@sta/output_output.c.o f24d74ba2e61804c -3109 3160 1584948904742150171 tools/f9d35d4@@dav1d_input@sta/input_input.c.o 63fef28132c33045 -3160 3216 1584948904798285359 tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o c961479845bef020 -3138 3223 1584948904804592066 tools/f9d35d4@@dav1d@exe/dav1d.c.o e663372975514dc9 -2638 3243 1584948904823108461 src/25a6634@@dav1d@sta/ref_mvs.c.o 7cf9413db12b89e2 -3216 3273 1584948904854510220 tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o 17146dd873f61bd2 -3243 3281 1584948904863250787 tools/f9d35d4@@dav1d_output@sta/output_null.c.o 799e46c7044fbba7 -3273 3285 1584948904869336372 tools/libdav1d_input.a d1d19e74400c63af -3223 3321 1584948904902946406 tools/f9d35d4@@dav1d_output@sta/output_md5.c.o 78839ae8fbbb6080 -3285 3335 1584948904916510195 tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o 18111b437e7df2e3 -3281 3336 1584948904918189031 tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o 437fb9d279e9395d -3336 3349 1584948904932959283 tools/libdav1d_output.a bfd1e8ba347863dc -3321 3408 1584948904990211551 tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o c14bf3f2fed757f3 -3335 3424 1584948905005005629 tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o a6b5260fbee7f276 -3424 3539 1584948905120383885 tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o 74622b2bd3d21aa1 -3349 3559 1584948905139162384 tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o 5f77c640558b74f7 -3409 3571 1584948905151863556 tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o bd69fbe58c0cfb15 -3571 3659 1584948905240602812 tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o c9faeb5401644bda -3559 3672 1584948905252122144 tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o 84b5a3753976f436 -3539 3712 1584948905293499168 tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o aa739c03a8364c81 -3659 3747 
1584948905328092322 tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o d48f538430ebe5c6 -3712 3865 1584948905445168732 tests/59830eb@@checkasm@exe/checkasm_msac.c.o c3b04750215fba44 -3672 3874 1584948905453727244 tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o 43e69bc880cd888e -3747 3903 1584948905483961705 tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o b7e1b02b93437599 -2184 3912 1584948905490730646 src/25a6634@@dav1d@sta/decode.c.o 7c51ea4d4642b07e -3912 3937 1584948905520897311 src/libdav1d.a 70adcc6d3ba339f8 -3937 3969 1584948905552837065 tools/dav1d 840267a2e6502530 -3865 3987 1584948905568148256 tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o 38c12806557f32a5 -3903 4027 1584948905608624537 tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o b6e2f5cdd75acbc2 -3874 4059 1584948905639930259 tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o 883b1b14b9f32683 -3969 4068 1584948905649021650 tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o a3100295f88374d3 -4027 4080 1584948905661374067 tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o 5e90bb020839ec94 -4059 4093 1584948905676363368 tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o 21398b174c915c0e -4068 4112 1584948905694249629 tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o ef706d11756679b -4080 4130 1584948905711752152 tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o 53bf4f318fdf96c6 -4093 4136 1584948905718431021 tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o 4338f5f1f147ab3f -4130 4162 1584948905745600609 tests/dav1d_fuzzer_mt e068c7811100cda3 -4136 4168 1584948905751526156 tests/dav1d_fuzzer 8bf38da49ef16c7c -3987 4202 1584948905783451792 tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o 5d4ac59be93c6b36 -4202 4245 1584948905828566274 tests/checkasm b72edfee09127526 diff --git a/ffmpeg/JNI/dav1d/builddir/build.ninja b/ffmpeg/JNI/dav1d/builddir/build.ninja deleted file mode 100644 index 
4fb0b2e19..000000000 --- a/ffmpeg/JNI/dav1d/builddir/build.ninja +++ /dev/null @@ -1,575 +0,0 @@ -# This is the build file for project "dav1d" -# It is autogenerated by the Meson build system. -# Do not edit by hand. - -ninja_required_version = 1.5.1 - -# Rules for compiling. - -rule c_COMPILER - command = /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang $ARGS -MD -MQ $out -MF '$DEPFILE' -o $out -c $in - deps = gcc - depfile = $DEPFILE - description = Compiling C object $out. - -# Rules for linking. - -rule STATIC_LINKER - command = rm -f $out && /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-ar $LINK_ARGS $out $in - description = Linking static target $out. - -rule c_LINKER - command = /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang $ARGS -o $out $in $LINK_ARGS - description = Linking target $out. - -# Other rules - -rule CUSTOM_COMMAND - command = $COMMAND - description = $DESC - restat = 1 - -rule REGENERATE_BUILD - command = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal regenerate /Users/zlin/workspace/mxcore/media_player/jni/dav1d /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir --backend ninja - description = Regenerating build files. 
- generator = 1 - -# Phony build target, always out of date - -build PHONY: phony - -# Build rules for targets - -build include/vcs_version.h: CUSTOM_COMMAND ../include/vcs_version.h.in | PHONY - COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal vcstagger ../include/vcs_version.h.in include/vcs_version.h 0.4.0 /Users/zlin/workspace/mxcore/media_player/jni/dav1d/include @VCS_TAG@ '(.*)' /Library/Developer/CommandLineTools/usr/bin/git --git-dir /Users/zlin/workspace/mxcore/media_player/jni/dav1d/.git describe --tags --long --match '?.*.*' --always - description = Generating$ vcs_version.h$ with$ a$ custom$ command. - -build src/25a6634@@dav1d_entrypoint@sta/lib.c.o: c_COMPILER ../src/lib.c || include/vcs_version.h - DEPFILE = src/25a6634@@dav1d_entrypoint@sta/lib.c.o.d - ARGS = -Isrc/25a6634@@dav1d_entrypoint@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC - -build src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o: c_COMPILER ../src/thread_task.c || include/vcs_version.h - DEPFILE = src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o.d - ARGS = -Isrc/25a6634@@dav1d_entrypoint@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC - -build src/libdav1d_entrypoint.a: STATIC_LINKER src/25a6634@@dav1d_entrypoint@sta/lib.c.o src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o - LINK_ARGS = csrD - -build src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o: c_COMPILER ../src/cdef_apply_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o: c_COMPILER ../src/cdef_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o: c_COMPILER ../src/fg_apply_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o: c_COMPILER ../src/film_grain_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o: c_COMPILER ../src/ipred_prepare_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o: c_COMPILER ../src/ipred_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o: c_COMPILER ../src/itx_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o: c_COMPILER ../src/lf_apply_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o: c_COMPILER ../src/loopfilter_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o: c_COMPILER ../src/looprestoration_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o: c_COMPILER ../src/lr_apply_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o: c_COMPILER ../src/mc_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o: c_COMPILER ../src/recon_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o: c_COMPILER ../src/arm/cdef_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o: c_COMPILER ../src/arm/ipred_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o: c_COMPILER ../src/arm/itx_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o: c_COMPILER ../src/arm/loopfilter_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o: c_COMPILER ../src/arm/looprestoration_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o: c_COMPILER ../src/arm/mc_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build src/libdav1d_bitdepth_8.a: STATIC_LINKER src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o 
src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o - LINK_ARGS = csrD - -build src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o: c_COMPILER ../src/cdef_apply_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o: c_COMPILER ../src/cdef_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o: c_COMPILER ../src/fg_apply_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o: c_COMPILER ../src/film_grain_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o: c_COMPILER ../src/ipred_prepare_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o: c_COMPILER ../src/ipred_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o: c_COMPILER ../src/itx_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o: c_COMPILER ../src/lf_apply_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o: c_COMPILER ../src/loopfilter_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o: c_COMPILER ../src/looprestoration_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o: c_COMPILER ../src/lr_apply_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o: c_COMPILER ../src/mc_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o: c_COMPILER ../src/recon_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o: c_COMPILER ../src/arm/cdef_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o: c_COMPILER ../src/arm/ipred_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o: c_COMPILER ../src/arm/itx_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o: c_COMPILER ../src/arm/loopfilter_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o: c_COMPILER ../src/arm/looprestoration_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o: c_COMPILER ../src/arm/mc_init_tmpl.c - DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o.d - ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build src/libdav1d_bitdepth_16.a: STATIC_LINKER src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o 
src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o - LINK_ARGS = csrD - -build src/libdav1d_arch_bitdepth_8.a: STATIC_LINKER - LINK_ARGS = csrD - -build src/libdav1d_arch_bitdepth_16.a: STATIC_LINKER - LINK_ARGS = csrD - -build src/25a6634@@dav1d@sta/cdf.c.o: c_COMPILER ../src/cdf.c - DEPFILE = src/25a6634@@dav1d@sta/cdf.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/cpu.c.o: c_COMPILER ../src/cpu.c - DEPFILE = src/25a6634@@dav1d@sta/cpu.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/data.c.o: c_COMPILER ../src/data.c - DEPFILE = src/25a6634@@dav1d@sta/data.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/decode.c.o: c_COMPILER ../src/decode.c - DEPFILE = src/25a6634@@dav1d@sta/decode.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/dequant_tables.c.o: c_COMPILER ../src/dequant_tables.c - DEPFILE = src/25a6634@@dav1d@sta/dequant_tables.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/getbits.c.o: c_COMPILER ../src/getbits.c - DEPFILE = src/25a6634@@dav1d@sta/getbits.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/intra_edge.c.o: c_COMPILER ../src/intra_edge.c - DEPFILE = src/25a6634@@dav1d@sta/intra_edge.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/lf_mask.c.o: c_COMPILER ../src/lf_mask.c - DEPFILE = src/25a6634@@dav1d@sta/lf_mask.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/log.c.o: c_COMPILER ../src/log.c - DEPFILE = src/25a6634@@dav1d@sta/log.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/msac.c.o: c_COMPILER ../src/msac.c - DEPFILE = src/25a6634@@dav1d@sta/msac.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/obu.c.o: c_COMPILER ../src/obu.c - DEPFILE = src/25a6634@@dav1d@sta/obu.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/picture.c.o: c_COMPILER ../src/picture.c - DEPFILE = src/25a6634@@dav1d@sta/picture.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/qm.c.o: c_COMPILER ../src/qm.c - DEPFILE = src/25a6634@@dav1d@sta/qm.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/ref.c.o: c_COMPILER ../src/ref.c - DEPFILE = src/25a6634@@dav1d@sta/ref.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/ref_mvs.c.o: c_COMPILER ../src/ref_mvs.c - DEPFILE = src/25a6634@@dav1d@sta/ref_mvs.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/scan.c.o: c_COMPILER ../src/scan.c - DEPFILE = src/25a6634@@dav1d@sta/scan.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/tables.c.o: c_COMPILER ../src/tables.c - DEPFILE = src/25a6634@@dav1d@sta/tables.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/warpmv.c.o: c_COMPILER ../src/warpmv.c - DEPFILE = src/25a6634@@dav1d@sta/warpmv.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/wedge.c.o: c_COMPILER ../src/wedge.c - DEPFILE = src/25a6634@@dav1d@sta/wedge.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/arm_cpu.c.o: c_COMPILER ../src/arm/cpu.c - DEPFILE = src/25a6634@@dav1d@sta/arm_cpu.c.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/arm_64_cdef.S.o: c_COMPILER ../src/arm/64/cdef.S - DEPFILE = src/25a6634@@dav1d@sta/arm_64_cdef.S.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/arm_64_ipred.S.o: c_COMPILER ../src/arm/64/ipred.S - DEPFILE = src/25a6634@@dav1d@sta/arm_64_ipred.S.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/arm_64_itx.S.o: c_COMPILER ../src/arm/64/itx.S - DEPFILE = src/25a6634@@dav1d@sta/arm_64_itx.S.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o: c_COMPILER ../src/arm/64/loopfilter.S - DEPFILE = src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o: c_COMPILER ../src/arm/64/looprestoration.S - DEPFILE = src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/arm_64_mc.S.o: c_COMPILER ../src/arm/64/mc.S - DEPFILE = src/25a6634@@dav1d@sta/arm_64_mc.S.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/25a6634@@dav1d@sta/arm_64_msac.S.o: c_COMPILER ../src/arm/64/msac.S - DEPFILE = src/25a6634@@dav1d@sta/arm_64_msac.S.o.d - ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread - -build src/libdav1d.a: STATIC_LINKER src/25a6634@@dav1d@sta/cdf.c.o src/25a6634@@dav1d@sta/cpu.c.o src/25a6634@@dav1d@sta/data.c.o src/25a6634@@dav1d@sta/decode.c.o src/25a6634@@dav1d@sta/dequant_tables.c.o src/25a6634@@dav1d@sta/getbits.c.o src/25a6634@@dav1d@sta/intra_edge.c.o src/25a6634@@dav1d@sta/lf_mask.c.o src/25a6634@@dav1d@sta/log.c.o src/25a6634@@dav1d@sta/msac.c.o src/25a6634@@dav1d@sta/obu.c.o src/25a6634@@dav1d@sta/picture.c.o src/25a6634@@dav1d@sta/qm.c.o src/25a6634@@dav1d@sta/ref.c.o src/25a6634@@dav1d@sta/ref_mvs.c.o src/25a6634@@dav1d@sta/scan.c.o src/25a6634@@dav1d@sta/tables.c.o src/25a6634@@dav1d@sta/warpmv.c.o src/25a6634@@dav1d@sta/wedge.c.o src/25a6634@@dav1d@sta/arm_cpu.c.o src/25a6634@@dav1d@sta/arm_64_cdef.S.o src/25a6634@@dav1d@sta/arm_64_ipred.S.o src/25a6634@@dav1d@sta/arm_64_itx.S.o 
src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o src/25a6634@@dav1d@sta/arm_64_mc.S.o src/25a6634@@dav1d@sta/arm_64_msac.S.o src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o 
src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o src/25a6634@@dav1d_entrypoint@sta/lib.c.o src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o - LINK_ARGS = csrD - -build tools/f9d35d4@@dav1d_input@sta/input_input.c.o: c_COMPILER ../tools/input/input.c - DEPFILE = tools/f9d35d4@@dav1d_input@sta/input_input.c.o.d - ARGS = -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC - -build tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o: c_COMPILER ../tools/input/annexb.c - DEPFILE = tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o.d - ARGS = -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC - -build tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o: c_COMPILER ../tools/input/ivf.c - DEPFILE = tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o.d - ARGS = -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC - -build tools/libdav1d_input.a: STATIC_LINKER tools/f9d35d4@@dav1d_input@sta/input_input.c.o tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o - LINK_ARGS = csrD - -build tools/f9d35d4@@dav1d_output@sta/output_md5.c.o: c_COMPILER ../tools/output/md5.c - DEPFILE = tools/f9d35d4@@dav1d_output@sta/output_md5.c.o.d - ARGS = -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC - -build tools/f9d35d4@@dav1d_output@sta/output_null.c.o: c_COMPILER ../tools/output/null.c - DEPFILE = tools/f9d35d4@@dav1d_output@sta/output_null.c.o.d - ARGS = -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC - -build tools/f9d35d4@@dav1d_output@sta/output_output.c.o: c_COMPILER ../tools/output/output.c - DEPFILE = tools/f9d35d4@@dav1d_output@sta/output_output.c.o.d - ARGS = -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC - -build tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o: c_COMPILER ../tools/output/y4m2.c - DEPFILE = tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o.d - ARGS = -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC - -build tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o: c_COMPILER ../tools/output/yuv.c - DEPFILE = tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o.d - ARGS = -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC - -build tools/libdav1d_output.a: STATIC_LINKER tools/f9d35d4@@dav1d_output@sta/output_md5.c.o tools/f9d35d4@@dav1d_output@sta/output_null.c.o tools/f9d35d4@@dav1d_output@sta/output_output.c.o tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o - LINK_ARGS = csrD - -build tools/f9d35d4@@dav1d@exe/dav1d.c.o: c_COMPILER ../tools/dav1d.c || include/vcs_version.h - DEPFILE = tools/f9d35d4@@dav1d@exe/dav1d.c.o.d - ARGS = -Itools/f9d35d4@@dav1d@exe -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread - -build tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o: c_COMPILER ../tools/dav1d_cli_parse.c || include/vcs_version.h - DEPFILE = tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o.d - ARGS = -Itools/f9d35d4@@dav1d@exe -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread - -build tools/dav1d: c_LINKER tools/f9d35d4@@dav1d@exe/dav1d.c.o tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o | src/libdav1d.a tools/libdav1d_input.a tools/libdav1d_output.a - LINK_ARGS = -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -pie -O2 -march=armv8-a -Wl,--start-group src/libdav1d.a tools/libdav1d_input.a tools/libdav1d_output.a -Wl,--end-group -pthread '-Wl,-rpath,$$ORIGIN/../src:$$ORIGIN/' -Wl,-rpath-link,/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src -Wl,-rpath-link,/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools - -build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o: c_COMPILER ../tests/checkasm/cdef.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o: c_COMPILER ../tests/checkasm/filmgrain.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o: c_COMPILER ../tests/checkasm/ipred.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o: c_COMPILER ../tests/checkasm/itx.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o: c_COMPILER ../tests/checkasm/loopfilter.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o: c_COMPILER ../tests/checkasm/looprestoration.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o: c_COMPILER ../tests/checkasm/mc.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 - -build tests/libcheckasm_bitdepth_8.a: STATIC_LINKER tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o - LINK_ARGS = csrD - -build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o: c_COMPILER ../tests/checkasm/cdef.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o: c_COMPILER ../tests/checkasm/filmgrain.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o: c_COMPILER ../tests/checkasm/ipred.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o: c_COMPILER ../tests/checkasm/itx.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o: c_COMPILER ../tests/checkasm/loopfilter.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o: c_COMPILER ../tests/checkasm/looprestoration.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o: c_COMPILER ../tests/checkasm/mc.c - DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o.d - ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 - -build tests/libcheckasm_bitdepth_16.a: STATIC_LINKER tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o - LINK_ARGS = csrD - -build tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o: c_COMPILER ../tests/checkasm/checkasm.c - DEPFILE = tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o.d - ARGS = -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread - -build tests/59830eb@@checkasm@exe/checkasm_msac.c.o: c_COMPILER ../tests/checkasm/msac.c - DEPFILE = tests/59830eb@@checkasm@exe/checkasm_msac.c.o.d - ARGS = -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread - -build tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o: c_COMPILER ../tests/checkasm/arm/checkasm_64.S - DEPFILE = tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o.d - ARGS = -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread - -build tests/checkasm: c_LINKER tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o tests/59830eb@@checkasm@exe/checkasm_msac.c.o tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o 
tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o 
src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o src/25a6634@@dav1d_entrypoint@sta/lib.c.o src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o src/25a6634@@dav1d@sta/cdf.c.o src/25a6634@@dav1d@sta/cpu.c.o src/25a6634@@dav1d@sta/data.c.o src/25a6634@@dav1d@sta/decode.c.o src/25a6634@@dav1d@sta/dequant_tables.c.o src/25a6634@@dav1d@sta/getbits.c.o src/25a6634@@dav1d@sta/intra_edge.c.o src/25a6634@@dav1d@sta/lf_mask.c.o src/25a6634@@dav1d@sta/log.c.o src/25a6634@@dav1d@sta/msac.c.o src/25a6634@@dav1d@sta/obu.c.o src/25a6634@@dav1d@sta/picture.c.o src/25a6634@@dav1d@sta/qm.c.o src/25a6634@@dav1d@sta/ref.c.o src/25a6634@@dav1d@sta/ref_mvs.c.o src/25a6634@@dav1d@sta/scan.c.o src/25a6634@@dav1d@sta/tables.c.o src/25a6634@@dav1d@sta/warpmv.c.o src/25a6634@@dav1d@sta/wedge.c.o src/25a6634@@dav1d@sta/arm_cpu.c.o src/25a6634@@dav1d@sta/arm_64_cdef.S.o src/25a6634@@dav1d@sta/arm_64_ipred.S.o src/25a6634@@dav1d@sta/arm_64_itx.S.o src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o src/25a6634@@dav1d@sta/arm_64_mc.S.o src/25a6634@@dav1d@sta/arm_64_msac.S.o | /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/sysroot/usr/lib/aarch64-linux-android/21/libm.a /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/sysroot/usr/lib/aarch64-linux-android/21/libm.so - LINK_ARGS = -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -pie -O2 -march=armv8-a -pthread -Wl,--start-group -lm -Wl,--end-group - -build tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o: c_COMPILER ../tests/libfuzzer/dav1d_fuzzer.c - DEPFILE = 
tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o.d - ARGS = -Itests/59830eb@@dav1d_fuzzer@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread - -build tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o: c_COMPILER ../tests/libfuzzer/main.c - DEPFILE = tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o.d - ARGS = -Itests/59830eb@@dav1d_fuzzer@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread - -build tests/dav1d_fuzzer: c_LINKER tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o | src/libdav1d.a - LINK_ARGS = -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -pie -O2 -march=armv8-a -Wl,--start-group src/libdav1d.a -Wl,--end-group -pthread '-Wl,-rpath,$$ORIGIN/../src' -Wl,-rpath-link,/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src - -build tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o: c_COMPILER ../tests/libfuzzer/dav1d_fuzzer.c - DEPFILE = tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o.d - ARGS = -Itests/59830eb@@dav1d_fuzzer_mt@exe 
-Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_MT_FUZZING - -build tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o: c_COMPILER ../tests/libfuzzer/main.c - DEPFILE = tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o.d - ARGS = -Itests/59830eb@@dav1d_fuzzer_mt@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_MT_FUZZING - -build tests/dav1d_fuzzer_mt: c_LINKER tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o | src/libdav1d.a - LINK_ARGS = -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -pie -O2 -march=armv8-a -Wl,--start-group src/libdav1d.a -Wl,--end-group -pthread '-Wl,-rpath,$$ORIGIN/../src' -Wl,-rpath-link,/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src - -build tests/libdav1d_af.a: CUSTOM_COMMAND src/libdav1d.a | /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-objcopy src/libdav1d.a - COMMAND = 
/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-objcopy --redefine-sym malloc=__wrap_malloc --redefine-sym posix_memalign=__wrap_posix_memalign --redefine-sym pthread_create=__wrap_pthread_create --redefine-sym pthread_cond_init=__wrap_pthread_cond_init --redefine-sym pthread_mutex_init=__wrap_pthread_mutex_init src/libdav1d.a tests/libdav1d_af.a - description = Generating$ libdav1d_af$ with$ a$ custom$ command. - -build tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o: c_COMPILER ../tests/libfuzzer/dav1d_fuzzer.c - DEPFILE = tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o.d - ARGS = -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL - -build tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o: c_COMPILER ../tests/libfuzzer/main.c - DEPFILE = tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o.d - ARGS = -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL - -build tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o: c_COMPILER ../tests/libfuzzer/alloc_fail.c - DEPFILE = tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o.d - ARGS = -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL - -build tests/dav1d_fuzzer_mem: c_LINKER tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o | tests/libdav1d_af.a - LINK_ARGS = -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -pie -O2 -march=armv8-a -Wl,--start-group /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/libdav1d_af.a -Wl,--end-group -pthread - -# Test rules - -build meson-test: CUSTOM_COMMAND all PHONY - COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson test --no-rebuild --print-errorlogs - DESC = Running$ all$ tests. 
- pool = console - -build test: phony meson-test - -build meson-benchmark: CUSTOM_COMMAND all PHONY - COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson test --benchmark --logbase benchmarklog --num-processes=1 --no-rebuild - DESC = Running$ benchmark$ suite. - pool = console - -build benchmark: phony meson-benchmark - -# Install rules - -build meson-install: CUSTOM_COMMAND PHONY | all - DESC = Installing$ files. - COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson install --no-rebuild - pool = console - -build install: phony meson-install - -build meson-dist: CUSTOM_COMMAND PHONY - DESC = Creating$ source$ packages - COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson dist - pool = console - -build dist: phony meson-dist - -# Suffix - -build meson-TAGS: CUSTOM_COMMAND PHONY - COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal tags etags /Users/zlin/workspace/mxcore/media_player/jni/dav1d - pool = console - -build TAGS: phony meson-TAGS - -build meson-ctags: CUSTOM_COMMAND PHONY - COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal tags ctags /Users/zlin/workspace/mxcore/media_player/jni/dav1d - pool = console - -build ctags: phony meson-ctags - -build meson-uninstall: CUSTOM_COMMAND PHONY - COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal uninstall - pool = console - -build uninstall: phony meson-uninstall - -build all: phony include/vcs_version.h src/libdav1d.a tools/dav1d tests/dav1d_fuzzer tests/dav1d_fuzzer_mt tests/checkasm - -build clean: phony meson-clean - -build meson-clean-ctlist: CUSTOM_COMMAND PHONY - COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal cleantrees /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/cleantrees.dat - description = Cleaning$ custom$ target$ directories. 
- -build clean-ctlist: phony meson-clean-ctlist - -build meson-clean: CUSTOM_COMMAND PHONY | clean-ctlist - COMMAND = /Users/zlin/workspace/ninja/ninja -t clean - description = Cleaning. - -build build.ninja: REGENERATE_BUILD ../meson.build ../include/meson.build ../include/dav1d/meson.build ../include/dav1d/version.h.in ../doc/meson.build ../src/meson.build ../tools/meson.build ../examples/meson.build ../tests/meson.build /Users/zlin/workspace/mxcore/media_player/jni/dav1d/cross_file.txt meson-private/coredata.dat ../meson_options.txt - pool = console - -build reconfigure: REGENERATE_BUILD PHONY - pool = console - -build ../meson.build ../include/meson.build ../include/dav1d/meson.build ../include/dav1d/version.h.in ../doc/meson.build ../src/meson.build ../tools/meson.build ../examples/meson.build ../tests/meson.build /Users/zlin/workspace/mxcore/media_player/jni/dav1d/cross_file.txt meson-private/coredata.dat ../meson_options.txt: phony - -default all - diff --git a/ffmpeg/JNI/dav1d/builddir/compile_commands.json b/ffmpeg/JNI/dav1d/builddir/compile_commands.json deleted file mode 100644 index 92b2b9bf8..000000000 --- a/ffmpeg/JNI/dav1d/builddir/compile_commands.json +++ /dev/null @@ -1,608 +0,0 @@ -[ - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_entrypoint@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'src/25a6634@@dav1d_entrypoint@sta/lib.c.o' -MF 'src/25a6634@@dav1d_entrypoint@sta/lib.c.o.d' -o 'src/25a6634@@dav1d_entrypoint@sta/lib.c.o' -c ../src/lib.c", - "file": "../src/lib.c", - "output": "src/25a6634@@dav1d_entrypoint@sta/lib.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_entrypoint@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o' -MF 'src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o.d' -o 'src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o' -c ../src/thread_task.c", - "file": "../src/thread_task.c", - "output": "src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang 
-Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o' -c ../src/cdef_apply_tmpl.c", - "file": "../src/cdef_apply_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o' -c ../src/cdef_tmpl.c", - "file": "../src/cdef_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o' -c ../src/fg_apply_tmpl.c", - "file": "../src/fg_apply_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o' -c ../src/film_grain_tmpl.c", - "file": "../src/film_grain_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o' -c ../src/ipred_prepare_tmpl.c", - "file": "../src/ipred_prepare_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o' -c ../src/ipred_tmpl.c", - "file": "../src/ipred_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o' -c ../src/itx_tmpl.c", - "file": "../src/itx_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o' -c ../src/lf_apply_tmpl.c", - "file": "../src/lf_apply_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o' -c ../src/loopfilter_tmpl.c", - "file": "../src/loopfilter_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o' -c ../src/looprestoration_tmpl.c", - "file": "../src/looprestoration_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o' -c ../src/lr_apply_tmpl.c", - "file": "../src/lr_apply_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o' -c ../src/mc_tmpl.c", - "file": "../src/mc_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o' -c ../src/recon_tmpl.c", - "file": "../src/recon_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o' -c ../src/arm/cdef_init_tmpl.c", - "file": "../src/arm/cdef_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o' -c ../src/arm/ipred_init_tmpl.c", - "file": "../src/arm/ipred_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o' -c ../src/arm/itx_init_tmpl.c", - "file": "../src/arm/itx_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o' -c ../src/arm/loopfilter_init_tmpl.c", - "file": "../src/arm/loopfilter_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o' -c ../src/arm/looprestoration_init_tmpl.c", - "file": "../src/arm/looprestoration_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o' -c ../src/arm/mc_init_tmpl.c", - "file": "../src/arm/mc_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o' -c ../src/cdef_apply_tmpl.c", - "file": "../src/cdef_apply_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o' -c ../src/cdef_tmpl.c", - "file": "../src/cdef_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o' -c ../src/fg_apply_tmpl.c", - "file": "../src/fg_apply_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o' -c ../src/film_grain_tmpl.c", - "file": "../src/film_grain_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o' -c ../src/ipred_prepare_tmpl.c", - "file": "../src/ipred_prepare_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o' -c ../src/ipred_tmpl.c", - "file": "../src/ipred_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o' -c ../src/itx_tmpl.c", - "file": "../src/itx_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o' -c ../src/lf_apply_tmpl.c", - "file": "../src/lf_apply_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o' -c ../src/loopfilter_tmpl.c", - "file": "../src/loopfilter_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o' -c ../src/looprestoration_tmpl.c", - "file": "../src/looprestoration_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o' -c ../src/lr_apply_tmpl.c", - "file": "../src/lr_apply_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o' -c ../src/mc_tmpl.c", - "file": "../src/mc_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o' -c ../src/recon_tmpl.c", - "file": "../src/recon_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o' -c ../src/arm/cdef_init_tmpl.c", - "file": "../src/arm/cdef_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o' -c ../src/arm/ipred_init_tmpl.c", - "file": "../src/arm/ipred_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o' -c ../src/arm/itx_init_tmpl.c", - "file": "../src/arm/itx_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o' -c ../src/arm/loopfilter_init_tmpl.c", - "file": "../src/arm/loopfilter_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o' -c ../src/arm/looprestoration_init_tmpl.c", - "file": "../src/arm/looprestoration_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o' -c ../src/arm/mc_init_tmpl.c", - "file": "../src/arm/mc_init_tmpl.c", - "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/cdf.c.o' -MF 'src/25a6634@@dav1d@sta/cdf.c.o.d' -o 'src/25a6634@@dav1d@sta/cdf.c.o' -c ../src/cdf.c", - "file": "../src/cdf.c", - "output": "src/25a6634@@dav1d@sta/cdf.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/cpu.c.o' -MF 'src/25a6634@@dav1d@sta/cpu.c.o.d' -o 'src/25a6634@@dav1d@sta/cpu.c.o' -c ../src/cpu.c", - "file": "../src/cpu.c", - "output": "src/25a6634@@dav1d@sta/cpu.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/data.c.o' -MF 'src/25a6634@@dav1d@sta/data.c.o.d' -o 'src/25a6634@@dav1d@sta/data.c.o' -c ../src/data.c", - "file": "../src/data.c", - "output": "src/25a6634@@dav1d@sta/data.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/decode.c.o' -MF 'src/25a6634@@dav1d@sta/decode.c.o.d' -o 'src/25a6634@@dav1d@sta/decode.c.o' -c ../src/decode.c", - "file": "../src/decode.c", - "output": "src/25a6634@@dav1d@sta/decode.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/dequant_tables.c.o' -MF 'src/25a6634@@dav1d@sta/dequant_tables.c.o.d' -o 'src/25a6634@@dav1d@sta/dequant_tables.c.o' -c ../src/dequant_tables.c", - "file": "../src/dequant_tables.c", - "output": "src/25a6634@@dav1d@sta/dequant_tables.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/getbits.c.o' -MF 'src/25a6634@@dav1d@sta/getbits.c.o.d' -o 'src/25a6634@@dav1d@sta/getbits.c.o' -c ../src/getbits.c", - "file": "../src/getbits.c", - "output": "src/25a6634@@dav1d@sta/getbits.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/intra_edge.c.o' -MF 'src/25a6634@@dav1d@sta/intra_edge.c.o.d' -o 'src/25a6634@@dav1d@sta/intra_edge.c.o' -c ../src/intra_edge.c", - "file": "../src/intra_edge.c", - "output": "src/25a6634@@dav1d@sta/intra_edge.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/lf_mask.c.o' -MF 'src/25a6634@@dav1d@sta/lf_mask.c.o.d' -o 'src/25a6634@@dav1d@sta/lf_mask.c.o' -c ../src/lf_mask.c", - "file": "../src/lf_mask.c", - "output": "src/25a6634@@dav1d@sta/lf_mask.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/log.c.o' -MF 'src/25a6634@@dav1d@sta/log.c.o.d' -o 'src/25a6634@@dav1d@sta/log.c.o' -c ../src/log.c", - "file": "../src/log.c", - "output": "src/25a6634@@dav1d@sta/log.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/msac.c.o' -MF 'src/25a6634@@dav1d@sta/msac.c.o.d' -o 'src/25a6634@@dav1d@sta/msac.c.o' -c ../src/msac.c", - "file": "../src/msac.c", - "output": "src/25a6634@@dav1d@sta/msac.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/obu.c.o' -MF 'src/25a6634@@dav1d@sta/obu.c.o.d' -o 'src/25a6634@@dav1d@sta/obu.c.o' -c ../src/obu.c", - "file": "../src/obu.c", - "output": "src/25a6634@@dav1d@sta/obu.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/picture.c.o' -MF 'src/25a6634@@dav1d@sta/picture.c.o.d' -o 'src/25a6634@@dav1d@sta/picture.c.o' -c ../src/picture.c", - "file": "../src/picture.c", - "output": "src/25a6634@@dav1d@sta/picture.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/qm.c.o' -MF 'src/25a6634@@dav1d@sta/qm.c.o.d' -o 'src/25a6634@@dav1d@sta/qm.c.o' -c ../src/qm.c", - "file": "../src/qm.c", - "output": "src/25a6634@@dav1d@sta/qm.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/ref.c.o' -MF 'src/25a6634@@dav1d@sta/ref.c.o.d' -o 'src/25a6634@@dav1d@sta/ref.c.o' -c ../src/ref.c", - "file": "../src/ref.c", - "output": "src/25a6634@@dav1d@sta/ref.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/ref_mvs.c.o' -MF 'src/25a6634@@dav1d@sta/ref_mvs.c.o.d' -o 'src/25a6634@@dav1d@sta/ref_mvs.c.o' -c ../src/ref_mvs.c", - "file": "../src/ref_mvs.c", - "output": "src/25a6634@@dav1d@sta/ref_mvs.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/scan.c.o' -MF 'src/25a6634@@dav1d@sta/scan.c.o.d' -o 'src/25a6634@@dav1d@sta/scan.c.o' -c ../src/scan.c", - "file": "../src/scan.c", - "output": "src/25a6634@@dav1d@sta/scan.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/tables.c.o' -MF 'src/25a6634@@dav1d@sta/tables.c.o.d' -o 'src/25a6634@@dav1d@sta/tables.c.o' -c ../src/tables.c", - "file": "../src/tables.c", - "output": "src/25a6634@@dav1d@sta/tables.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/warpmv.c.o' -MF 'src/25a6634@@dav1d@sta/warpmv.c.o.d' -o 'src/25a6634@@dav1d@sta/warpmv.c.o' -c ../src/warpmv.c", - "file": "../src/warpmv.c", - "output": "src/25a6634@@dav1d@sta/warpmv.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/wedge.c.o' -MF 'src/25a6634@@dav1d@sta/wedge.c.o.d' -o 'src/25a6634@@dav1d@sta/wedge.c.o' -c ../src/wedge.c", - "file": "../src/wedge.c", - "output": "src/25a6634@@dav1d@sta/wedge.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_cpu.c.o' -MF 'src/25a6634@@dav1d@sta/arm_cpu.c.o.d' -o 'src/25a6634@@dav1d@sta/arm_cpu.c.o' -c ../src/arm/cpu.c", - "file": "../src/arm/cpu.c", - "output": "src/25a6634@@dav1d@sta/arm_cpu.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_cdef.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_cdef.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_cdef.S.o' -c ../src/arm/64/cdef.S", - "file": "../src/arm/64/cdef.S", - "output": "src/25a6634@@dav1d@sta/arm_64_cdef.S.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_ipred.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_ipred.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_ipred.S.o' -c ../src/arm/64/ipred.S", - "file": "../src/arm/64/ipred.S", - "output": "src/25a6634@@dav1d@sta/arm_64_ipred.S.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_itx.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_itx.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_itx.S.o' -c ../src/arm/64/itx.S", - "file": "../src/arm/64/itx.S", - "output": "src/25a6634@@dav1d@sta/arm_64_itx.S.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o' -c ../src/arm/64/loopfilter.S", - "file": "../src/arm/64/loopfilter.S", - "output": "src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang 
-Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o' -c ../src/arm/64/looprestoration.S", - "file": "../src/arm/64/looprestoration.S", - "output": "src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_mc.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_mc.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_mc.S.o' -c ../src/arm/64/mc.S", - "file": "../src/arm/64/mc.S", - "output": "src/25a6634@@dav1d@sta/arm_64_mc.S.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_msac.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_msac.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_msac.S.o' -c ../src/arm/64/msac.S", - "file": "../src/arm/64/msac.S", - "output": "src/25a6634@@dav1d@sta/arm_64_msac.S.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools 
-I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_input@sta/input_input.c.o' -MF 'tools/f9d35d4@@dav1d_input@sta/input_input.c.o.d' -o 'tools/f9d35d4@@dav1d_input@sta/input_input.c.o' -c ../tools/input/input.c", - "file": "../tools/input/input.c", - "output": "tools/f9d35d4@@dav1d_input@sta/input_input.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o' -MF 'tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o.d' -o 'tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o' -c ../tools/input/annexb.c", - "file": "../tools/input/annexb.c", - "output": "tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o' -MF 'tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o.d' -o 'tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o' -c ../tools/input/ivf.c", - "file": "../tools/input/ivf.c", - "output": "tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_output@sta/output_md5.c.o' -MF 'tools/f9d35d4@@dav1d_output@sta/output_md5.c.o.d' -o 'tools/f9d35d4@@dav1d_output@sta/output_md5.c.o' -c ../tools/output/md5.c", - "file": "../tools/output/md5.c", - "output": "tools/f9d35d4@@dav1d_output@sta/output_md5.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_output@sta/output_null.c.o' -MF 'tools/f9d35d4@@dav1d_output@sta/output_null.c.o.d' -o 'tools/f9d35d4@@dav1d_output@sta/output_null.c.o' -c ../tools/output/null.c", - "file": "../tools/output/null.c", - "output": "tools/f9d35d4@@dav1d_output@sta/output_null.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_output@sta/output_output.c.o' -MF 'tools/f9d35d4@@dav1d_output@sta/output_output.c.o.d' -o 'tools/f9d35d4@@dav1d_output@sta/output_output.c.o' -c ../tools/output/output.c", - "file": "../tools/output/output.c", - "output": "tools/f9d35d4@@dav1d_output@sta/output_output.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o' -MF 'tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o.d' -o 'tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o' -c ../tools/output/y4m2.c", - "file": "../tools/output/y4m2.c", - "output": "tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o' -MF 'tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o.d' -o 'tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o' -c ../tools/output/yuv.c", - "file": "../tools/output/yuv.c", - "output": "tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d@exe -Itools -I../tools -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tools/f9d35d4@@dav1d@exe/dav1d.c.o' -MF 'tools/f9d35d4@@dav1d@exe/dav1d.c.o.d' -o 'tools/f9d35d4@@dav1d@exe/dav1d.c.o' -c ../tools/dav1d.c", - "file": "../tools/dav1d.c", - "output": "tools/f9d35d4@@dav1d@exe/dav1d.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d@exe -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o' -MF 'tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o.d' -o 'tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o' -c ../tools/dav1d_cli_parse.c", - "file": "../tools/dav1d_cli_parse.c", - "output": "tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang 
-Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o' -c ../tests/checkasm/cdef.c", - "file": "../tests/checkasm/cdef.c", - "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o' -c ../tests/checkasm/filmgrain.c", - "file": "../tests/checkasm/filmgrain.c", - "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o' -c ../tests/checkasm/ipred.c", - "file": "../tests/checkasm/ipred.c", - "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o' -c ../tests/checkasm/itx.c", - "file": "../tests/checkasm/itx.c", - "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o' -c ../tests/checkasm/loopfilter.c", - "file": "../tests/checkasm/loopfilter.c", - "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o' -c ../tests/checkasm/looprestoration.c", - "file": "../tests/checkasm/looprestoration.c", - "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o' -c ../tests/checkasm/mc.c", - "file": "../tests/checkasm/mc.c", - "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o' -c ../tests/checkasm/cdef.c", - "file": "../tests/checkasm/cdef.c", - "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o' -c ../tests/checkasm/filmgrain.c", - "file": "../tests/checkasm/filmgrain.c", - "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o' -c ../tests/checkasm/ipred.c", - "file": "../tests/checkasm/ipred.c", - "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o' -c ../tests/checkasm/itx.c", - "file": "../tests/checkasm/itx.c", - "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o' -c ../tests/checkasm/loopfilter.c", - "file": "../tests/checkasm/loopfilter.c", - "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o' -c ../tests/checkasm/looprestoration.c", - "file": "../tests/checkasm/looprestoration.c", - "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o' -c ../tests/checkasm/mc.c", - "file": "../tests/checkasm/mc.c", - "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o' -MF 'tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o.d' -o 'tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o' -c ../tests/checkasm/checkasm.c", - "file": "../tests/checkasm/checkasm.c", - "output": "tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tests/59830eb@@checkasm@exe/checkasm_msac.c.o' -MF 'tests/59830eb@@checkasm@exe/checkasm_msac.c.o.d' -o 'tests/59830eb@@checkasm@exe/checkasm_msac.c.o' -c ../tests/checkasm/msac.c", - "file": "../tests/checkasm/msac.c", - "output": "tests/59830eb@@checkasm@exe/checkasm_msac.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o' -MF 'tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o.d' -o 'tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o' -c ../tests/checkasm/arm/checkasm_64.S", - "file": "../tests/checkasm/arm/checkasm_64.S", - "output": "tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o' -MF 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o' -c ../tests/libfuzzer/dav1d_fuzzer.c", - "file": "../tests/libfuzzer/dav1d_fuzzer.c", - "output": "tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o' -MF 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o' -c ../tests/libfuzzer/main.c", - "file": "../tests/libfuzzer/main.c", - "output": "tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer_mt@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_MT_FUZZING -MD -MQ 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o' -MF 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o' -c ../tests/libfuzzer/dav1d_fuzzer.c", - "file": "../tests/libfuzzer/dav1d_fuzzer.c", - "output": "tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer_mt@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_MT_FUZZING -MD -MQ 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o' -MF 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o' -c ../tests/libfuzzer/main.c", - "file": "../tests/libfuzzer/main.c", - "output": "tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL -MD -MQ 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o' -MF 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o' -c ../tests/libfuzzer/dav1d_fuzzer.c", - "file": "../tests/libfuzzer/dav1d_fuzzer.c", - "output": "tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL -MD -MQ 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o' -MF 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o' -c ../tests/libfuzzer/main.c", - "file": "../tests/libfuzzer/main.c", - "output": "tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o" - }, - { - "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", - "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. 
-I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL -MD -MQ 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o' -MF 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o' -c ../tests/libfuzzer/alloc_fail.c", - "file": "../tests/libfuzzer/alloc_fail.c", - "output": "tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o" - } -] diff --git a/ffmpeg/JNI/dav1d/builddir/config.h b/ffmpeg/JNI/dav1d/builddir/config.h deleted file mode 100644 index 07f70ca96..000000000 --- a/ffmpeg/JNI/dav1d/builddir/config.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Autogenerated by the Meson build system. - * Do not edit, your changes will be lost. 
- */ - -#pragma once - -#define ARCH_AARCH64 1 - -#define ARCH_ARM 0 - -#define ARCH_PPC64LE 0 - -#define ARCH_X86 0 - -#define ARCH_X86_32 0 - -#define ARCH_X86_64 0 - -#define CONFIG_16BPC 1 - -#define CONFIG_8BPC 1 - -#define CONFIG_LOG 1 - -#define ENDIANNESS_BIG 0 - -#define HAVE_ASM 1 - -#define HAVE_AS_FUNC 0 - -#define HAVE_GETAUXVAL 1 - -#define HAVE_POSIX_MEMALIGN 1 - -#define HAVE_UNISTD_H 1 - -#define PIC 3 - diff --git a/ffmpeg/JNI/dav1d/builddir/include/vcs_version.h b/ffmpeg/JNI/dav1d/builddir/include/vcs_version.h deleted file mode 100644 index cafd1f1a4..000000000 --- a/ffmpeg/JNI/dav1d/builddir/include/vcs_version.h +++ /dev/null @@ -1,2 +0,0 @@ -/* auto-generated, do not edit */ -#define DAV1D_VERSION "0.4.0" diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-benchmarks.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-benchmarks.json deleted file mode 100644 index 0637a088a..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-benchmarks.json +++ /dev/null @@ -1 +0,0 @@ -[] \ No newline at end of file diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildoptions.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildoptions.json deleted file mode 100644 index d4794fd4e..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildoptions.json +++ /dev/null @@ -1 +0,0 @@ -[{"name": "auto_features", "value": "auto", "section": "core", "machine": "any", "choices": ["enabled", "disabled", "auto"], "type": "combo", "description": "Override value of all 'auto' features"}, {"name": "backend", "value": "ninja", "section": "core", "machine": "any", "choices": ["ninja", "vs", "vs2010", "vs2015", "vs2017", "vs2019", "xcode"], "type": "combo", "description": "Backend to use"}, {"name": "buildtype", "value": "release", "section": "core", "machine": "any", "choices": ["plain", "debug", "debugoptimized", "release", "minsize", "custom"], "type": "combo", "description": "Build type to use"}, {"name": "debug", "value": false, "section": 
"core", "machine": "any", "type": "boolean", "description": "Debug"}, {"name": "default_library", "value": "static", "section": "core", "machine": "any", "choices": ["shared", "static", "both"], "type": "combo", "description": "Default library type"}, {"name": "install_umask", "value": 18, "section": "core", "machine": "any", "type": "integer", "description": "Default umask to apply on permissions of installed files"}, {"name": "layout", "value": "mirror", "section": "core", "machine": "any", "choices": ["mirror", "flat"], "type": "combo", "description": "Build directory layout"}, {"name": "optimization", "value": "3", "section": "core", "machine": "any", "choices": ["0", "g", "1", "2", "3", "s"], "type": "combo", "description": "Optimization level"}, {"name": "strip", "value": false, "section": "core", "machine": "any", "type": "boolean", "description": "Strip targets on install"}, {"name": "unity", "value": "off", "section": "core", "machine": "any", "choices": ["on", "off", "subprojects"], "type": "combo", "description": "Unity build"}, {"name": "warning_level", "value": "2", "section": "core", "machine": "any", "choices": ["0", "1", "2", "3"], "type": "combo", "description": "Compiler warning level to use"}, {"name": "werror", "value": false, "section": "core", "machine": "any", "type": "boolean", "description": "Treat warnings as errors"}, {"name": "wrap_mode", "value": "default", "section": "core", "machine": "any", "choices": ["default", "nofallback", "nodownload", "forcefallback"], "type": "combo", "description": "Wrap mode"}, {"name": "cmake_prefix_path", "value": [], "section": "core", "machine": "host", "type": "array", "description": "List of additional prefixes for cmake to search"}, {"name": "pkg_config_path", "value": [], "section": "core", "machine": "host", "type": "array", "description": "List of additional paths for pkg-config to search"}, {"name": "build.cmake_prefix_path", "value": [], "section": "core", "machine": "build", "type": "array", 
"description": "List of additional prefixes for cmake to search"}, {"name": "build.pkg_config_path", "value": [], "section": "core", "machine": "build", "type": "array", "description": "List of additional paths for pkg-config to search"}, {"name": "backend_max_links", "value": 0, "section": "backend", "machine": "any", "type": "integer", "description": "Maximum number of linker processes to run or 0 for no limit"}, {"name": "b_asneeded", "value": true, "section": "base", "machine": "any", "type": "boolean", "description": "Use -Wl,--as-needed when linking"}, {"name": "b_bitcode", "value": false, "section": "base", "machine": "any", "type": "boolean", "description": "Generate and embed bitcode (only macOS/iOS/tvOS)"}, {"name": "b_colorout", "value": "always", "section": "base", "machine": "any", "choices": ["auto", "always", "never"], "type": "combo", "description": "Use colored output"}, {"name": "b_coverage", "value": false, "section": "base", "machine": "any", "type": "boolean", "description": "Enable coverage tracking."}, {"name": "b_lto", "value": false, "section": "base", "machine": "any", "type": "boolean", "description": "Use link time optimization"}, {"name": "b_lundef", "value": true, "section": "base", "machine": "any", "type": "boolean", "description": "Use -Wl,--no-undefined when linking"}, {"name": "b_ndebug", "value": "if-release", "section": "base", "machine": "any", "choices": ["true", "false", "if-release"], "type": "combo", "description": "Disable asserts"}, {"name": "b_pch", "value": true, "section": "base", "machine": "any", "type": "boolean", "description": "Use precompiled headers"}, {"name": "b_pgo", "value": "off", "section": "base", "machine": "any", "choices": ["off", "generate", "use"], "type": "combo", "description": "Use profile guided optimization"}, {"name": "b_pie", "value": false, "section": "base", "machine": "any", "type": "boolean", "description": "Build executables as position independent"}, {"name": "b_sanitize", "value": 
"none", "section": "base", "machine": "any", "choices": ["none", "address", "thread", "undefined", "memory", "address,undefined"], "type": "combo", "description": "Code sanitizer to use"}, {"name": "b_staticpic", "value": true, "section": "base", "machine": "any", "type": "boolean", "description": "Build static libraries as position independent"}, {"name": "c_args", "value": ["-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function"], "section": "compiler", "machine": "host", "type": "array", "description": "Extra arguments passed to the C compiler"}, {"name": "c_link_args", "value": ["-O2", "-march=armv8-a"], "section": "compiler", "machine": "host", "type": "array", "description": "Extra arguments passed to the C linker"}, {"name": "c_std", "value": "c99", "section": "compiler", "machine": "host", "choices": ["none", "c89", "c99", "c11", "c17", "c18", "gnu89", "gnu99", "gnu11", "gnu17", "gnu18"], "type": "combo", "description": "C language standard to use"}, {"name": "build.c_args", "value": [], "section": "compiler", "machine": "build", "type": "array", "description": "Extra arguments passed to the C compiler"}, {"name": "build.c_link_args", "value": [], "section": "compiler", "machine": "build", "type": "array", "description": "Extra arguments passed to the C linker"}, {"name": "build.c_std", "value": "none", "section": "compiler", "machine": "build", "choices": ["none", "c89", "c99", "c11", "c17", "gnu89", "gnu99", "gnu11", "gnu17"], "type": "combo", "description": "C language standard to use"}, {"name": "bindir", "value": "bin", "section": "directory", "machine": "any", "type": "string", "description": "Executable directory"}, {"name": "datadir", "value": "share", "section": "directory", "machine": "any", "type": "string", "description": "Data file directory"}, {"name": "includedir", "value": "include", "section": "directory", "machine": "any", "type": "string", "description": "Header file 
directory"}, {"name": "infodir", "value": "share/info", "section": "directory", "machine": "any", "type": "string", "description": "Info page directory"}, {"name": "libdir", "value": "lib", "section": "directory", "machine": "any", "type": "string", "description": "Library directory"}, {"name": "libexecdir", "value": "libexec", "section": "directory", "machine": "any", "type": "string", "description": "Library executable directory"}, {"name": "localedir", "value": "share/locale", "section": "directory", "machine": "any", "type": "string", "description": "Locale data directory"}, {"name": "localstatedir", "value": "var", "section": "directory", "machine": "any", "type": "string", "description": "Localstate data directory"}, {"name": "mandir", "value": "share/man", "section": "directory", "machine": "any", "type": "string", "description": "Manual page directory"}, {"name": "prefix", "value": "/usr/local", "section": "directory", "machine": "any", "type": "string", "description": "Installation prefix"}, {"name": "sbindir", "value": "sbin", "section": "directory", "machine": "any", "type": "string", "description": "System executable directory"}, {"name": "sharedstatedir", "value": "com", "section": "directory", "machine": "any", "type": "string", "description": "Architecture-independent data directory"}, {"name": "sysconfdir", "value": "etc", "section": "directory", "machine": "any", "type": "string", "description": "Sysconf data directory"}, {"name": "bitdepths", "value": ["8", "16"], "section": "user", "machine": "any", "type": "array", "description": "Enable only specified bitdepths"}, {"name": "enable_asm", "value": true, "section": "user", "machine": "any", "type": "boolean", "description": "Build asm files, if available"}, {"name": "enable_examples", "value": false, "section": "user", "machine": "any", "type": "boolean", "description": "Build dav1d examples"}, {"name": "enable_tests", "value": true, "section": "user", "machine": "any", "type": "boolean", 
"description": "Build dav1d tests"}, {"name": "enable_tools", "value": true, "section": "user", "machine": "any", "type": "boolean", "description": "Build dav1d cli tools"}, {"name": "fuzzer_ldflags", "value": "", "section": "user", "machine": "any", "type": "string", "description": "Extra LDFLAGS used during linking of fuzzing binaries"}, {"name": "fuzzing_engine", "value": "none", "section": "user", "machine": "any", "choices": ["none", "libfuzzer", "oss-fuzz"], "type": "combo", "description": "Select the fuzzing engine"}, {"name": "logging", "value": true, "section": "user", "machine": "any", "type": "boolean", "description": "Print error log messages using the provided callback function"}, {"name": "stack_alignment", "value": 0, "section": "user", "machine": "any", "type": "integer", "description": "stack_alignment"}, {"name": "testdata_tests", "value": false, "section": "user", "machine": "any", "type": "boolean", "description": "Run tests requiring the test data repository"}, {"name": "errorlogs", "value": true, "section": "test", "machine": "any", "type": "boolean", "description": "Whether to print the logs from failing tests"}, {"name": "stdsplit", "value": true, "section": "test", "machine": "any", "type": "boolean", "description": "Split stdout and stderr in test logs"}] \ No newline at end of file diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildsystem_files.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildsystem_files.json deleted file mode 100644 index 88c4bacbb..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildsystem_files.json +++ /dev/null @@ -1 +0,0 @@ -["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/meson_options.txt", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/meson.build", 
"/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/examples/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/doc/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build"] \ No newline at end of file diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-dependencies.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-dependencies.json deleted file mode 100644 index 9c1e6b55a..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-dependencies.json +++ /dev/null @@ -1 +0,0 @@ -[{"name": "threads", "compile_args": ["-pthread"], "link_args": ["-pthread"]}] \ No newline at end of file diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-installed.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-installed.json deleted file mode 100644 index bd1580bb8..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-installed.json +++ /dev/null @@ -1 +0,0 @@ -{"/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d.a": "/usr/local/lib/libdav1d.a", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/dav1d": "/usr/local/bin/dav1d", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/dav1d.pc": "/usr/local/lib/pkgconfig/dav1d.pc", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/common.h": "/usr/local/include/dav1d/common.h", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/data.h": "/usr/local/include/dav1d/data.h", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/dav1d.h": "/usr/local/include/dav1d/dav1d.h", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/headers.h": "/usr/local/include/dav1d/headers.h", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/picture.h": "/usr/local/include/dav1d/picture.h", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d/version.h": 
"/usr/local/include/dav1d/version.h"} \ No newline at end of file diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-projectinfo.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-projectinfo.json deleted file mode 100644 index 77d5b5642..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-projectinfo.json +++ /dev/null @@ -1 +0,0 @@ -{"version": "0.4.0", "descriptive_name": "dav1d", "subproject_dir": "subprojects", "subprojects": []} \ No newline at end of file diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-targets.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-targets.json deleted file mode 100644 index f4efb192a..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-targets.json +++ /dev/null @@ -1 +0,0 @@ -[{"name": "vcs_version.h", "id": "c0cbff0@@vcs_version.h@cus", "type": "custom", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/vcs_version.h"], "build_by_default": true, "target_sources": [{"language": "unknown", "compiler": ["/Library/Frameworks/Python.framework/Versions/3.8/bin/meson", "--internal", "vcstagger", "@INPUT0@", "@OUTPUT0@", "0.4.0", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "@VCS_TAG@", "(.*)", "/Library/Developer/CommandLineTools/usr/bin/git", "--git-dir", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/.git", "describe", "--tags", "--long", "--match", "?.*.*", "--always"], "parameters": [], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/vcs_version.h.in"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_entrypoint", "id": "25a6634@@dav1d_entrypoint@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d_entrypoint.a"], "build_by_default": false, 
"target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/25a6634@@dav1d_entrypoint@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lib.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/thread_task.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_bitdepth_8", "id": "25a6634@@dav1d_bitdepth_8@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d_bitdepth_8.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": 
["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/25a6634@@dav1d_bitdepth_8@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC", "-DBITDEPTH=8"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cdef_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cdef_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/fg_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/film_grain_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ipred_prepare_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ipred_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/itx_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lf_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/loopfilter_tmpl.c", 
"/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/looprestoration_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lr_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/mc_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/recon_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/cdef_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/ipred_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/itx_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/loopfilter_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/looprestoration_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/mc_init_tmpl.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_bitdepth_16", "id": "25a6634@@dav1d_bitdepth_16@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d_bitdepth_16.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/25a6634@@dav1d_bitdepth_16@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", 
"-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC", "-DBITDEPTH=16"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cdef_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cdef_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/fg_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/film_grain_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ipred_prepare_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ipred_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/itx_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lf_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/loopfilter_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/looprestoration_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lr_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/mc_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/recon_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/cdef_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/ipred_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/itx_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/loopfilter_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/looprestoration_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/mc_init_tmpl.c"], 
"generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_arch_bitdepth_8", "id": "25a6634@@dav1d_arch_bitdepth_8@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d_arch_bitdepth_8.a"], "build_by_default": false, "target_sources": [{"language": "unknown", "compiler": [], "parameters": [], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/config.h"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_arch_bitdepth_16", "id": "25a6634@@dav1d_arch_bitdepth_16@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d_arch_bitdepth_16.a"], "build_by_default": false, "target_sources": [{"language": "unknown", "compiler": [], "parameters": [], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/config.h"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d", "id": "25a6634@@dav1d@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d.a"], "build_by_default": true, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/25a6634@@dav1d@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", 
"-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC", "-pthread"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cdf.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cpu.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/data.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/decode.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/dequant_tables.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/getbits.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/intra_edge.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lf_mask.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/log.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/msac.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/obu.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/picture.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/qm.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ref.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ref_mvs.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/scan.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/tables.c", 
"/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/warpmv.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/wedge.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/cpu.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/cdef.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/ipred.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/itx.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/loopfilter.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/looprestoration.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/mc.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/msac.S"], "generated_sources": []}], "subproject": null, "installed": true, "install_filename": ["/usr/local/lib/libdav1d.a"]}, {"name": "dav1d_input", "id": "f9d35d4@@dav1d_input@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/libdav1d_input.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/f9d35d4@@dav1d_input@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", 
"-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/input/input.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/input/annexb.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/input/ivf.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_output", "id": "f9d35d4@@dav1d_output@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/libdav1d_output.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/f9d35d4@@dav1d_output@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", 
"-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/output/md5.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/output/null.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/output/output.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/output/y4m2.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/output/yuv.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d", "id": "f9d35d4@@dav1d@exe", "type": "executable", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/dav1d"], "build_by_default": true, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/f9d35d4@@dav1d@exe", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", 
"-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIE", "-pthread"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/dav1d.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/dav1d_cli_parse.c"], "generated_sources": []}], "subproject": null, "installed": true, "install_filename": ["/usr/local/bin/dav1d"]}, {"name": "checkasm_bitdepth_8", "id": "59830eb@@checkasm_bitdepth_8@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/libcheckasm_bitdepth_8.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@checkasm_bitdepth_8@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", 
"-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC", "-DBITDEPTH=8"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/cdef.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/filmgrain.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/ipred.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/itx.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/loopfilter.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/looprestoration.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/mc.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "checkasm_bitdepth_16", "id": "59830eb@@checkasm_bitdepth_16@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/libcheckasm_bitdepth_16.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@checkasm_bitdepth_16@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", 
"-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC", "-DBITDEPTH=16"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/cdef.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/filmgrain.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/ipred.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/itx.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/loopfilter.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/looprestoration.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/mc.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "checkasm", "id": "59830eb@@checkasm@exe", "type": "executable", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/checkasm"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@checkasm@exe", 
"-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIE", "-pthread"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/checkasm.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/msac.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/arm/checkasm_64.S"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_fuzzer", "id": "59830eb@@dav1d_fuzzer@exe", "type": "executable", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/dav1d_fuzzer"], "build_by_default": true, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@dav1d_fuzzer@exe", 
"-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIE", "-pthread"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/dav1d_fuzzer.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/main.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_fuzzer_mt", "id": "59830eb@@dav1d_fuzzer_mt@exe", "type": "executable", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/dav1d_fuzzer_mt"], "build_by_default": true, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@dav1d_fuzzer_mt@exe", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", 
"-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIE", "-pthread", "-DDAV1D_MT_FUZZING"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/dav1d_fuzzer.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/main.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "libdav1d_af", "id": "59830eb@@libdav1d_af@cus", "type": "custom", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/libdav1d_af.a"], "build_by_default": false, "target_sources": [{"language": "unknown", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-objcopy", "--redefine-sym", "malloc=__wrap_malloc", "--redefine-sym", "posix_memalign=__wrap_posix_memalign", "--redefine-sym", "pthread_create=__wrap_pthread_create", "--redefine-sym", "pthread_cond_init=__wrap_pthread_cond_init", "--redefine-sym", "pthread_mutex_init=__wrap_pthread_mutex_init", 
"@INPUT@", "@OUTPUT@"], "parameters": [], "sources": [], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_fuzzer_mem", "id": "59830eb@@dav1d_fuzzer_mem@exe", "type": "executable", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/dav1d_fuzzer_mem"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@dav1d_fuzzer_mem@exe", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIE", "-pthread", "-DDAV1D_ALLOC_FAIL"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/dav1d_fuzzer.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/main.c", 
"/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/alloc_fail.c"], "generated_sources": []}], "subproject": null, "installed": false}] \ No newline at end of file diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-tests.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-tests.json deleted file mode 100644 index d2b35d2fa..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-tests.json +++ /dev/null @@ -1 +0,0 @@ -[{"cmd": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/checkasm"], "env": {}, "name": "checkasm", "workdir": null, "timeout": 30, "suite": ["dav1d"], "is_parallel": false, "priority": 0}] \ No newline at end of file diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/meson-info.json b/ffmpeg/JNI/dav1d/builddir/meson-info/meson-info.json deleted file mode 100644 index a66097271..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-info/meson-info.json +++ /dev/null @@ -1 +0,0 @@ -{"meson_version": {"full": "0.52.999", "major": 0, "minor": 52, "patch": 999}, "directories": {"source": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "build": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "info": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-info"}, "introspection": {"version": {"full": "1.0.0", "major": 1, "minor": 0, "patch": 0}, "information": {"benchmarks": {"file": "intro-benchmarks.json", "updated": true}, "buildoptions": {"file": "intro-buildoptions.json", "updated": true}, "buildsystem_files": {"file": "intro-buildsystem_files.json", "updated": true}, "dependencies": {"file": "intro-dependencies.json", "updated": true}, "installed": {"file": "intro-installed.json", "updated": true}, "projectinfo": {"file": "intro-projectinfo.json", "updated": true}, "targets": {"file": "intro-targets.json", "updated": true}, "tests": {"file": "intro-tests.json", "updated": true}}}, "build_files_updated": true, "error": false} \ No newline at end of file diff --git 
a/ffmpeg/JNI/dav1d/builddir/meson-logs/meson-log.txt b/ffmpeg/JNI/dav1d/builddir/meson-logs/meson-log.txt deleted file mode 100644 index 4b5a3428c..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-logs/meson-log.txt +++ /dev/null @@ -1,456 +0,0 @@ -Build started at 2020-03-23T15:35:00.168028 -Main binary: /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -Build Options: -Ddefault_library=static '--cross-file cross_file.txt' -Python system: Darwin -The Meson build system -Version: 0.52.999 -Source dir: /Users/zlin/workspace/mxcore/media_player/jni/dav1d -Build dir: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir -Build type: cross build -Project name: dav1d -Project version: 0.4.0 -No CFLAGS in the environment, not changing global flags. -No LDFLAGS in the environment, not changing global flags. -No CPPFLAGS in the environment, not changing global flags. -Sanity testing C compiler: cc -Is cross compiler: False. -Sanity check compiler command line: cc /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/sanitycheckc.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/sanitycheckc.exe -pipe -Sanity check compile stdout: - ------ -Sanity check compile stderr: - ------ -Running test binary command: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/sanitycheckc.exe -C compiler for the build machine: cc (clang 10.0.1 "Apple LLVM version 10.0.1 (clang-1001.0.46.4)") -C linker for the build machine: APPLE ld 450.3 -Sanity testing C compiler: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Is cross compiler: True. 
-Sanity check compiler command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/sanitycheckc.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/sanitycheckc_cross.exe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -pipe -D_FILE_OFFSET_BITS=64 -c -Sanity check compile stdout: - ------ -Sanity check compile stderr: - ------ -C compiler for the host machine: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang (clang 8.0.7 "Android (5220042 based on r346389c) clang version 8.0.7 (https://android.googlesource.com/toolchain/clang b55f2d4ebfd35bf643d27dbca1bb228957008617) (https://android.googlesource.com/toolchain/llvm 3c393fe7a7e13b0fba4ac75a01aa683d7a5b11cd) (based on LLVM 8.0.7svn)") -C linker for the host machine: GNU ld.bfd 2.27.0.20170315 -Build machine cpu family: x86_64 -Build machine cpu: x86_64 -Host machine cpu family: aarch64 -Host machine cpu: armv8-a -Target machine cpu family: aarch64 -Target machine cpu: armv8-a -Run-time dependency threads found: YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprufufw_i -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprufufw_i/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprufufw_i/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 - -Code: - - #include -Compiler stdout: - -Compiler stderr: - -Check usable header "stdatomic.h" : YES -Running compile: -Working 
directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpxcnv6_zq -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpxcnv6_zq/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpxcnv6_zq/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 - -Code: - - #include -Compiler stdout: - -Compiler stderr: - -Check usable header "unistd.h" : YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp48omqtt0 -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp48omqtt0/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp48omqtt0/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 - -Code: - - #include -Compiler stdout: - -Compiler stderr: - /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp48omqtt0/testfile.c:2:18: fatal error: 'io.h' file not found - #include - ^~~~~~ -1 error generated. 
- -Check usable header "io.h" : NO -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2d5m5mp2 -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2d5m5mp2/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2d5m5mp2/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a - -Code: - #include -#include - - #if defined __stub_getopt_long || defined __stub___getopt_long - fail fail fail this function is not going to work - #endif - -int main() { - void *a = (void*) &getopt_long; - long b = (long) a; - return (int) b; - } -Compiler stdout: - -Compiler stderr: - -Checking for function "getopt_long" : YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkjff4dua -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkjff4dua/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkjff4dua/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a - -Code: - #include -#include - - #if defined __stub_posix_memalign || defined __stub___posix_memalign - fail fail fail this function is not going to work - #endif - -int main() { - void *a = (void*) &posix_memalign; - long b = (long) a; - return (int) b; - } -Compiler stdout: - -Compiler stderr: - -Checking for function "posix_memalign" : 
YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkggrff7i -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkggrff7i/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkggrff7i/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a - -Code: - #include -#include - - #if defined __stub_getauxval || defined __stub___getauxval - fail fail fail this function is not going to work - #endif - -int main() { - void *a = (void*) &getauxval; - long b = (long) a; - return (int) b; - } -Compiler stdout: - -Compiler stderr: - -Checking for function "getauxval" : YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2o5yf6wc -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2o5yf6wc/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2o5yf6wc/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a - -Code: - #include -#include - - #if defined __stub_elf_aux_info || defined __stub___elf_aux_info - fail fail fail this function is not going to work - #endif - -int main() { - void *a = (void*) &elf_aux_info; - long b = (long) a; - return (int) b; - } -Compiler stdout: - -Compiler stderr: - 
/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2o5yf6wc/testfile.c:9:32: error: use of undeclared identifier 'elf_aux_info' - void *a = (void*) &elf_aux_info; - ^ -1 error generated. - -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpzkegp55e -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpzkegp55e/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpzkegp55e/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a - -Code: - #include - int main() { - #ifdef __has_builtin - #if !__has_builtin(__builtin_elf_aux_info) - #error "__builtin_elf_aux_info not found" - #endif - #elif ! defined(elf_aux_info) - /* Check for __builtin_elf_aux_info only if no includes were added to the - * prefix above, which means no definition of elf_aux_info can be found. - * We would always check for this, but we get false positives on - * MSYS2 if we do. Their toolchain is broken, but we can at least - * give them a workaround. */ - #if 0 - __builtin_elf_aux_info; - #else - #error "No definition for __builtin_elf_aux_info found in the prefix" - #endif - #endif - return 0; - } -Compiler stdout: - -Compiler stderr: - /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpzkegp55e/testfile.c:5:18: error: "__builtin_elf_aux_info not found" - #error "__builtin_elf_aux_info not found" - ^ -1 error generated. 
- -Checking for function "elf_aux_info" : NO -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprulkzab9 -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprulkzab9/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprulkzab9/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -fvisibility=hidden - -Code: - int i; - -Compiler stdout: - -Compiler stderr: - -Compiler for C supports arguments -fvisibility=hidden: YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpmdzrl4a8 -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpmdzrl4a8/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpmdzrl4a8/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Wundef - -Code: - int i; - -Compiler stdout: - -Compiler stderr: - -Compiler for C supports arguments -Wundef: YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpr4sgx2cq -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang 
/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpr4sgx2cq/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpr4sgx2cq/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Werror=vla - -Code: - int i; - -Compiler stdout: - -Compiler stderr: - -Compiler for C supports arguments -Werror=vla: YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2rmq3j9k -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2rmq3j9k/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2rmq3j9k/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Wno-maybe-uninitialized -Wmaybe-uninitialized - -Code: - int i; - -Compiler stdout: - -Compiler stderr: - error: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Werror,-Wunknown-warning-option] -error: unknown warning option '-Wmaybe-uninitialized'; did you mean '-Wuninitialized'? 
[-Werror,-Wunknown-warning-option] - -Compiler for C supports arguments -Wno-maybe-uninitialized: NO -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpfxlqgycz -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpfxlqgycz/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpfxlqgycz/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Wno-missing-field-initializers -Wmissing-field-initializers - -Code: - int i; - -Compiler stdout: - -Compiler stderr: - -Compiler for C supports arguments -Wno-missing-field-initializers: YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpb8_6epq5 -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpb8_6epq5/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpb8_6epq5/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Wno-unused-parameter -Wunused-parameter - -Code: - int i; - -Compiler stdout: - -Compiler stderr: - -Compiler for C supports arguments -Wno-unused-parameter: YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpk46ovwwt -Command line: 
/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpk46ovwwt/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpk46ovwwt/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Werror=missing-prototypes - -Code: - int i; - -Compiler stdout: - -Compiler stderr: - -Compiler for C supports arguments -Werror=missing-prototypes: YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpuw44p2eo -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpuw44p2eo/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpuw44p2eo/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Wshorten-64-to-32 - -Code: - int i; - -Compiler stdout: - -Compiler stderr: - -Compiler for C supports arguments -Wshorten-64-to-32: YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp7sukpzez -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp7sukpzez/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp7sukpzez/output.obj 
-pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -fomit-frame-pointer - -Code: - int i; - -Compiler stdout: - -Compiler stderr: - -Compiler for C supports arguments -fomit-frame-pointer: YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp1ru1ei4f -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp1ru1ei4f/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp1ru1ei4f/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -ffast-math - -Code: - int i; - -Compiler stdout: - -Compiler stderr: - -Compiler for C supports arguments -ffast-math: YES -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp4hyo7jwr -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp4hyo7jwr/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp4hyo7jwr/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 - -Code: - __asm__ ( -".func meson_test" -".endfunc" -); - -Compiler stdout: - -Compiler stderr: - :1:1: error: unknown directive -.func meson_test.endfunc -^ -1 error generated. 
- -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkzjeycwv -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkzjeycwv/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkzjeycwv/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 - -Code: - -#if defined(PIC) -#error "PIC already defined" -#elif !(defined(__PIC__) || defined(__pic__)) -#error "no pic" -#endif - -Compiler stdout: - -Compiler stderr: - -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpicm365yp -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpicm365yp/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpicm365yp/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -O0 - -Code: - #ifdef __cplusplus - extern "C" { - #endif - void meson_uscore_prefix () {} - #ifdef __cplusplus - } - #endif - -Compiler stdout: - -Compiler stderr: - -Symbols have underscore prefix: NO -Configuring config.h using configuration -Configuring version.h using configuration -Program doxygen found: NO -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpx54mpw0t -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang 
/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpx54mpw0t/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpx54mpw0t/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a - -Code: - #include -#include - - #if defined __stub_clock_gettime || defined __stub___clock_gettime - fail fail fail this function is not going to work - #endif - -int main() { - void *a = (void*) &clock_gettime; - long b = (long) a; - return (int) b; - } -Compiler stdout: - -Compiler stderr: - -Checking for function "clock_gettime" : YES -Configuring cli_config.h using configuration -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpqp9f8eru -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpqp9f8eru/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpqp9f8eru/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Wl,--start-group -lm -Wl,--end-group -Wl,--allow-shlib-undefined -O2 -march=armv8-a - -Code: - int main() { return 0; } -Compiler stdout: - -Compiler stderr: - -Library m found: YES -Adding test "checkasm" -Using cached compile: -Cached command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkjff4dua/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkjff4dua/output.exe -pipe -fstack-protector -fstrict-aliasing 
-Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a - -Code: - #include -#include - - #if defined __stub_posix_memalign || defined __stub___posix_memalign - fail fail fail this function is not going to work - #endif - -int main() { - void *a = (void*) &posix_memalign; - long b = (long) a; - return (int) b; - } -Cached compiler stdout: - -Cached compiler stderr: - -Checking for function "posix_memalign" : YES (cached) -Build targets in project: 17 -Found ninja-1.9.0.git at /Users/zlin/workspace/ninja/ninja -Running compile: -Working directory: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpjdmh12ol -Command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpjdmh12ol/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpjdmh12ol/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 --print-search-dirs - -Code: - -Compiler stdout: - programs: =/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../lib/gcc/aarch64-linux-android/4.9.x/../../../../aarch64-linux-android/bin -libraries: 
=/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/lib64/clang/8.0.7:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/lib64/clang/8.0.7/lib/linux/aarch64:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../lib/gcc/aarch64-linux-android/4.9.x:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../lib/gcc/aarch64-linux-android/4.9.x/../../../../aarch64-linux-android/lib/../lib64:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../sysroot/usr/lib/aarch64-linux-android/21:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../sysroot/usr/lib/aarch64-linux-android:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../lib/gcc/aarch64-linux-android/4.9.x/../../../../aarch64-linux-android/lib:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../sysroot/usr/lib - -Compiler stderr: - diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/build.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/build.dat deleted file mode 100644 index bacb3788e..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/build.dat and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/cleantrees.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/cleantrees.dat deleted file mode 100644 index 7ee7736b4..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/cleantrees.dat and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/cmd_line.txt b/ffmpeg/JNI/dav1d/builddir/meson-private/cmd_line.txt deleted file mode 100644 index 061fe6d4e..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-private/cmd_line.txt +++ /dev/null @@ -1,6 +0,0 @@ -[options] -default_library = static - -[properties] -cross_file = ['cross_file.txt'] - diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/coredata.dat 
b/ffmpeg/JNI/dav1d/builddir/meson-private/coredata.dat deleted file mode 100644 index 9dbbb6bc9..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/coredata.dat and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/dav1d.pc b/ffmpeg/JNI/dav1d/builddir/meson-private/dav1d.pc deleted file mode 100644 index 1e76bcefe..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-private/dav1d.pc +++ /dev/null @@ -1,9 +0,0 @@ -prefix=/usr/local -libdir=${prefix}/lib -includedir=${prefix}/include - -Name: libdav1d -Description: AV1 decoding library -Version: 0.4.0 -Libs: -L${libdir} -ldav1d -pthread -Cflags: -I${includedir} -pthread diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/install.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/install.dat deleted file mode 100644 index 9c6afee32..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/install.dat and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/meson.lock b/ffmpeg/JNI/dav1d/builddir/meson-private/meson.lock deleted file mode 100644 index e69de29bb..000000000 diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/meson_benchmark_setup.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/meson_benchmark_setup.dat deleted file mode 100644 index 92c3c883e..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-private/meson_benchmark_setup.dat +++ /dev/null @@ -1 +0,0 @@ -€]”. 
\ No newline at end of file diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/meson_test_setup.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/meson_test_setup.dat deleted file mode 100644 index f8cf8d627..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/meson_test_setup.dat and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.c b/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.c deleted file mode 100644 index 0f968e8aa..000000000 --- a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.c +++ /dev/null @@ -1 +0,0 @@ -int main() { int class=0; return class; } diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.exe b/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.exe deleted file mode 100755 index 2fcafd29f..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.exe and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc_cross.exe b/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc_cross.exe deleted file mode 100644 index e0c2aa1b4..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc_cross.exe and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/src/libdav1d.a b/ffmpeg/JNI/dav1d/builddir/src/libdav1d.a deleted file mode 100644 index e7d8469be..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/src/libdav1d.a and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/tests/checkasm b/ffmpeg/JNI/dav1d/builddir/tests/checkasm deleted file mode 100755 index 5a699c54a..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/tests/checkasm and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer b/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer deleted file mode 100755 index 590c8143b..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer_mt b/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer_mt deleted 
file mode 100755 index 59703d26a..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer_mt and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/tools/cli_config.h b/ffmpeg/JNI/dav1d/builddir/tools/cli_config.h deleted file mode 100644 index 21660b629..000000000 --- a/ffmpeg/JNI/dav1d/builddir/tools/cli_config.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Autogenerated by the Meson build system. - * Do not edit, your changes will be lost. - */ - -#pragma once - -#define HAVE_CLOCK_GETTIME 1 - diff --git a/ffmpeg/JNI/dav1d/builddir/tools/dav1d b/ffmpeg/JNI/dav1d/builddir/tools/dav1d deleted file mode 100755 index 35c051152..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/tools/dav1d and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_input.a b/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_input.a deleted file mode 100644 index d51ef9423..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_input.a and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_output.a b/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_output.a deleted file mode 100644 index 590677be4..000000000 Binary files a/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_output.a and /dev/null differ diff --git a/ffmpeg/JNI/dav1d/cross_file.txt b/ffmpeg/JNI/dav1d/cross_file.txt deleted file mode 100644 index 86f1c78f4..000000000 --- a/ffmpeg/JNI/dav1d/cross_file.txt +++ /dev/null @@ -1,16 +0,0 @@ -[binaries] -c = '/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang' -ar = '/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-ar' -objcopy = '/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-objcopy' -strip = '/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-strip' - -[properties] -sys_root = 
'/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/sysroot' -c_args = ['-fstack-protector','-fstrict-aliasing','-Wno-deprecated-declarations','-Wno-unused-variable','-Wno-unused-function'] -c_link_args =['-O2','-march=armv8-a'] - -[host_machine] -system = 'android' -cpu_family = 'aarch64' -cpu = 'armv8-a' -endian = 'little' diff --git a/ffmpeg/JNI/dav1d/doc/meson.build b/ffmpeg/JNI/dav1d/doc/meson.build index 4badbf6ea..0ef712344 100644 --- a/ffmpeg/JNI/dav1d/doc/meson.build +++ b/ffmpeg/JNI/dav1d/doc/meson.build @@ -27,8 +27,8 @@ dot = find_program('dot', required: false) if doxygen.found() and dot.found() conf_data = configuration_data() - conf_data.set('DOXYGEN_INPUT', join_paths(meson.source_root(), 'include/dav1d')) - conf_data.set('DOXYGEN_STRIP', join_paths(meson.source_root(), 'include')) + conf_data.set('DOXYGEN_INPUT', join_paths(dav1d_src_root, 'include/dav1d')) + conf_data.set('DOXYGEN_STRIP', join_paths(dav1d_src_root, 'include')) conf_data.set('DOXYGEN_OUTPUT', meson.current_build_dir()) doxyfile = configure_file(input: 'Doxyfile.in', output: 'Doxyfile', diff --git a/ffmpeg/JNI/dav1d/examples/dav1dplay.c b/ffmpeg/JNI/dav1d/examples/dav1dplay.c index bcd4835b3..d6bb262b5 100644 --- a/ffmpeg/JNI/dav1d/examples/dav1dplay.c +++ b/ffmpeg/JNI/dav1d/examples/dav1dplay.c @@ -29,687 +29,18 @@ #include #include -#include -#include -#include #include -#include "common/attributes.h" - #include "dav1d/dav1d.h" +#include "common/attributes.h" #include "tools/input/input.h" +#include "dp_fifo.h" +#include "dp_renderer.h" -/** - * Settings structure - * Hold all settings available for the player, - * this is usually filled by parsing arguments - * from the console. 
- */ -typedef struct { - const char *inputfile; - int highquality; - int untimed; - int zerocopy; -} Dav1dPlaySettings; - -#define WINDOW_WIDTH 910 -#define WINDOW_HEIGHT 512 - -#define DAV1D_EVENT_NEW_FRAME 1 -#define DAV1D_EVENT_DEC_QUIT 2 - -/* - * Fifo helper functions - */ -typedef struct dp_fifo -{ - SDL_mutex *lock; - SDL_cond *cond_change; - size_t capacity; - size_t count; - void **entries; -} Dav1dPlayPtrFifo; - -static void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo) -{ - assert(fifo->count == 0); - SDL_DestroyMutex(fifo->lock); - SDL_DestroyCond(fifo->cond_change); - free(fifo->entries); - free(fifo); -} - -static Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity) -{ - Dav1dPlayPtrFifo *fifo; - - assert(capacity > 0); - if (capacity <= 0) - return NULL; - - fifo = malloc(sizeof(*fifo)); - if (fifo == NULL) - return NULL; - - fifo->capacity = capacity; - fifo->count = 0; - - fifo->lock = SDL_CreateMutex(); - if (fifo->lock == NULL) { - free(fifo); - return NULL; - } - fifo->cond_change = SDL_CreateCond(); - if (fifo->cond_change == NULL) { - SDL_DestroyMutex(fifo->lock); - free(fifo); - return NULL; - } - - fifo->entries = calloc(capacity, sizeof(void*)); - if (fifo->entries == NULL) { - dp_fifo_destroy(fifo); - return NULL; - } - - return fifo; -} - -static void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element) -{ - SDL_LockMutex(fifo->lock); - while (fifo->count == fifo->capacity) - SDL_CondWait(fifo->cond_change, fifo->lock); - fifo->entries[fifo->count++] = element; - if (fifo->count == 1) - SDL_CondSignal(fifo->cond_change); - SDL_UnlockMutex(fifo->lock); -} - -static void *dp_fifo_array_shift(void **arr, size_t len) -{ - void *shifted_element = arr[0]; - for (size_t i = 1; i < len; ++i) - arr[i-1] = arr[i]; - return shifted_element; -} - -static void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo) -{ - SDL_LockMutex(fifo->lock); - while (fifo->count == 0) - SDL_CondWait(fifo->cond_change, fifo->lock); - void *res = dp_fifo_array_shift(fifo->entries, 
fifo->count--); - if (fifo->count == fifo->capacity - 1) - SDL_CondSignal(fifo->cond_change); - SDL_UnlockMutex(fifo->lock); - return res; -} - -/** - * Renderer info - */ -typedef struct rdr_info -{ - // Cookie passed to the renderer implementation callbacks - void *cookie; - // Callback to create the renderer - void* (*create_renderer)(void *data); - // Callback to destroy the renderer - void (*destroy_renderer)(void *cookie); - // Callback to the render function that renders a prevously sent frame - void (*render)(void *cookie, const Dav1dPlaySettings *settings); - // Callback to the send frame function - int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic, - const Dav1dPlaySettings *settings); - // Callback for alloc/release pictures (optional) - int (*alloc_pic)(Dav1dPicture *pic, void *cookie); - void (*release_pic)(Dav1dPicture *pic, void *cookie); -} Dav1dPlayRenderInfo; - -#ifdef HAVE_PLACEBO_VULKAN - -#include -#include -#include -#include - - -/** - * Renderer context for libplacebo - */ -typedef struct renderer_priv_ctx -{ - // Placebo context - struct pl_context *ctx; - // Placebo renderer - struct pl_renderer *renderer; - // Placebo Vulkan handle - const struct pl_vulkan *vk; - // Placebo Vulkan instance - const struct pl_vk_inst *vk_inst; - // Vulkan surface - VkSurfaceKHR surf; - // Placebo swapchain - const struct pl_swapchain *swapchain; - // Lock protecting access to the texture - SDL_mutex *lock; - // Planes to render - struct pl_plane y_plane; - struct pl_plane u_plane; - struct pl_plane v_plane; - // Textures to render - const struct pl_tex *y_tex; - const struct pl_tex *u_tex; - const struct pl_tex *v_tex; -} Dav1dPlayRendererPrivateContext; - -static void *placebo_renderer_create(void *data) -{ - // Alloc - Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext)); - if (rd_priv_ctx == NULL) { - return NULL; - } - - // Init libplacebo - rd_priv_ctx->ctx = pl_context_create(PL_API_VER, &(struct 
pl_context_params) { - .log_cb = pl_log_color, -#ifndef NDEBUG - .log_level = PL_LOG_DEBUG, -#else - .log_level = PL_LOG_WARN, -#endif - }); - if (rd_priv_ctx->ctx == NULL) { - free(rd_priv_ctx); - return NULL; - } - - // Create Mutex - rd_priv_ctx->lock = SDL_CreateMutex(); - if (rd_priv_ctx->lock == NULL) { - fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError()); - pl_context_destroy(&(rd_priv_ctx->ctx)); - free(rd_priv_ctx); - return NULL; - } - - // Init Vulkan - struct pl_vk_inst_params iparams = pl_vk_inst_default_params; - - SDL_Window *sdlwin = data; - - unsigned num = 0; - if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, NULL)) { - fprintf(stderr, "Failed enumerating Vulkan extensions: %s\n", SDL_GetError()); - exit(1); - } - - iparams.extensions = malloc(num * sizeof(const char *)); - iparams.num_extensions = num; - assert(iparams.extensions); - - SDL_bool ok = SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, iparams.extensions); - if (!ok) { - fprintf(stderr, "Failed getting Vk instance extensions\n"); - exit(1); - } - - if (num > 0) { - printf("Requesting %d additional Vulkan extensions:\n", num); - for (unsigned i = 0; i < num; i++) - printf(" %s\n", iparams.extensions[i]); - } - - rd_priv_ctx->vk_inst = pl_vk_inst_create(rd_priv_ctx->ctx, &iparams); - if (!rd_priv_ctx->vk_inst) { - fprintf(stderr, "Failed creating Vulkan instance!\n"); - exit(1); - } - free(iparams.extensions); - - if (!SDL_Vulkan_CreateSurface(sdlwin, rd_priv_ctx->vk_inst->instance, &rd_priv_ctx->surf)) { - fprintf(stderr, "Failed creating vulkan surface: %s\n", SDL_GetError()); - exit(1); - } - - struct pl_vulkan_params params = pl_vulkan_default_params; - params.instance = rd_priv_ctx->vk_inst->instance; - params.surface = rd_priv_ctx->surf; - params.allow_software = true; - - rd_priv_ctx->vk = pl_vulkan_create(rd_priv_ctx->ctx, ¶ms); - if (!rd_priv_ctx->vk) { - fprintf(stderr, "Failed creating vulkan device!\n"); - exit(2); - } - - // Create swapchain - 
rd_priv_ctx->swapchain = pl_vulkan_create_swapchain(rd_priv_ctx->vk, - &(struct pl_vulkan_swapchain_params) { - .surface = rd_priv_ctx->surf, - .present_mode = VK_PRESENT_MODE_IMMEDIATE_KHR, - }); - - if (!rd_priv_ctx->swapchain) { - fprintf(stderr, "Failed creating vulkan swapchain!\n"); - exit(2); - } - - int w = WINDOW_WIDTH, h = WINDOW_HEIGHT; - if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) { - fprintf(stderr, "Failed resizing vulkan swapchain!\n"); - exit(2); - } - - if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT) - printf("Note: window dimensions differ (got %dx%d)\n", w, h); - - rd_priv_ctx->y_tex = NULL; - rd_priv_ctx->u_tex = NULL; - rd_priv_ctx->v_tex = NULL; - - rd_priv_ctx->renderer = NULL; - - return rd_priv_ctx; -} - -static void placebo_renderer_destroy(void *cookie) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - pl_renderer_destroy(&(rd_priv_ctx->renderer)); - pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_tex)); - pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_tex)); - pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->v_tex)); - pl_swapchain_destroy(&(rd_priv_ctx->swapchain)); - pl_vulkan_destroy(&(rd_priv_ctx->vk)); - vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL); - pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst)); - pl_context_destroy(&(rd_priv_ctx->ctx)); -} - -static void placebo_render(void *cookie, const Dav1dPlaySettings *settings) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_LockMutex(rd_priv_ctx->lock); - if (rd_priv_ctx->y_tex == NULL) { - SDL_UnlockMutex(rd_priv_ctx->lock); - return; - } - - // Prepare rendering - if (rd_priv_ctx->renderer == NULL) { - rd_priv_ctx->renderer = pl_renderer_create(rd_priv_ctx->ctx, rd_priv_ctx->vk->gpu); - } - - struct pl_swapchain_frame frame; - bool ok = pl_swapchain_start_frame(rd_priv_ctx->swapchain, &frame); - if (!ok) { - 
SDL_UnlockMutex(rd_priv_ctx->lock); - return; - } - - const struct pl_tex *img = rd_priv_ctx->y_plane.texture; - struct pl_image image = { - .num_planes = 3, - .planes = { rd_priv_ctx->y_plane, rd_priv_ctx->u_plane, rd_priv_ctx->v_plane }, - .repr = pl_color_repr_hdtv, - .color = pl_color_space_unknown, - .width = img->params.w, - .height = img->params.h, - }; - - struct pl_render_params render_params = {0}; - if (settings->highquality) - render_params = pl_render_default_params; - - struct pl_render_target target; - pl_render_target_from_swapchain(&target, &frame); - target.profile = (struct pl_icc_profile) { - .data = NULL, - .len = 0, - }; - - if (!pl_render_image(rd_priv_ctx->renderer, &image, &target, &render_params)) { - fprintf(stderr, "Failed rendering frame!\n"); - SDL_UnlockMutex(rd_priv_ctx->lock); - return; - } - - ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain); - if (!ok) { - fprintf(stderr, "Failed submitting frame!\n"); - SDL_UnlockMutex(rd_priv_ctx->lock); - return; - } - - pl_swapchain_swap_buffers(rd_priv_ctx->swapchain); - SDL_UnlockMutex(rd_priv_ctx->lock); -} - -static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic, - const Dav1dPlaySettings *settings) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_LockMutex(rd_priv_ctx->lock); - - if (dav1d_pic == NULL) { - SDL_UnlockMutex(rd_priv_ctx->lock); - return 0; - } - - int width = dav1d_pic->p.w; - int height = dav1d_pic->p.h; - - enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout; - - if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) { - fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n"); - exit(50); - } - - struct pl_plane_data data_y = { - .type = PL_FMT_UNORM, - .width = width, - .height = height, - .pixel_stride = 1, - .row_stride = dav1d_pic->stride[0], - .component_size = {8}, - .component_map = {0}, - }; - - struct pl_plane_data data_u = { - .type = PL_FMT_UNORM, 
- .width = width/2, - .height = height/2, - .pixel_stride = 1, - .row_stride = dav1d_pic->stride[1], - .component_size = {8}, - .component_map = {1}, - }; - - struct pl_plane_data data_v = { - .type = PL_FMT_UNORM, - .width = width/2, - .height = height/2, - .pixel_stride = 1, - .row_stride = dav1d_pic->stride[1], - .component_size = {8}, - .component_map = {2}, - }; - - if (settings->zerocopy) { - const struct pl_buf *buf = dav1d_pic->allocator_data; - assert(buf); - data_y.buf = data_u.buf = data_v.buf = buf; - data_y.buf_offset = (uintptr_t) dav1d_pic->data[0] - (uintptr_t) buf->data; - data_u.buf_offset = (uintptr_t) dav1d_pic->data[1] - (uintptr_t) buf->data; - data_v.buf_offset = (uintptr_t) dav1d_pic->data[2] - (uintptr_t) buf->data; - } else { - data_y.pixels = dav1d_pic->data[0]; - data_u.pixels = dav1d_pic->data[1]; - data_v.pixels = dav1d_pic->data[2]; - } - - bool ok = true; - ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_plane), &(rd_priv_ctx->y_tex), &data_y); - ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_plane), &(rd_priv_ctx->u_tex), &data_u); - ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->v_plane), &(rd_priv_ctx->v_tex), &data_v); - - pl_chroma_location_offset(PL_CHROMA_LEFT, &rd_priv_ctx->u_plane.shift_x, &rd_priv_ctx->u_plane.shift_y); - pl_chroma_location_offset(PL_CHROMA_LEFT, &rd_priv_ctx->v_plane.shift_x, &rd_priv_ctx->v_plane.shift_y); - - if (!ok) { - fprintf(stderr, "Failed uploading planes!\n"); - } - - SDL_UnlockMutex(rd_priv_ctx->lock); - return !ok; -} - -// Align to power of 2 -#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) - -static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - SDL_LockMutex(rd_priv_ctx->lock); - - const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu; - int ret = DAV1D_ERR(ENOMEM); - - // Copied from dav1d_default_picture_alloc - const int hbd = p->p.bpc > 
8; - const int aligned_w = ALIGN2(p->p.w, 128); - const int aligned_h = ALIGN2(p->p.h, 128); - const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400; - const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420; - const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444; - p->stride[0] = aligned_w << hbd; - p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0; - - // Align strides up to multiples of the GPU performance hints - p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride); - p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride); - - // Aligning offsets to 4 also implicity aligns to the texel size (1 or 2) - size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4); - const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align); - const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align); - - // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment, - // even in the case that the driver gives us insane alignments - const size_t pic_size = y_sz + 2 * uv_sz; - const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4; - - // Validate size limitations - if (total_size > gpu->limits.max_xfer_size) { - printf("alloc of %zu bytes exceeds limits\n", total_size); - goto err; - } - - const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) { - .type = PL_BUF_TEX_TRANSFER, - .host_mapped = true, - .size = total_size, - .memory_type = PL_BUF_MEM_HOST, - .user_data = p, - }); - - if (!buf) { - printf("alloc of GPU mapped buffer failed\n"); - goto err; - } - - assert(buf->data); - uintptr_t base = (uintptr_t) buf->data, data[3]; - data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT); - data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT); - data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT); - - // Sanity check offset alignment for the sake of debugging - if (data[0] - base != ALIGN2(data[0] - base, off_align) || - data[1] - base != ALIGN2(data[1] - 
base, off_align) || - data[2] - base != ALIGN2(data[2] - base, off_align)) - { - printf("GPU buffer horribly misaligned, expect slowdown!\n"); - } - - p->allocator_data = (void *) buf; - p->data[0] = (void *) data[0]; - p->data[1] = (void *) data[1]; - p->data[2] = (void *) data[2]; - ret = 0; - - // fall through -err: - SDL_UnlockMutex(rd_priv_ctx->lock); - return ret; -} - -static void placebo_release_pic(Dav1dPicture *pic, void *cookie) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - assert(pic->allocator_data); - - SDL_LockMutex(rd_priv_ctx->lock); - const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu; - pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data); - SDL_UnlockMutex(rd_priv_ctx->lock); -} - -static const Dav1dPlayRenderInfo renderer_info = { - .create_renderer = placebo_renderer_create, - .destroy_renderer = placebo_renderer_destroy, - .render = placebo_render, - .update_frame = placebo_upload_planes, - .alloc_pic = placebo_alloc_pic, - .release_pic = placebo_release_pic, -}; - -#else - -/** - * Renderer context for SDL - */ -typedef struct renderer_priv_ctx -{ - // SDL renderer - SDL_Renderer *renderer; - // Lock protecting access to the texture - SDL_mutex *lock; - // Texture to render - SDL_Texture *tex; -} Dav1dPlayRendererPrivateContext; - -static void *sdl_renderer_create(void *data) -{ - SDL_Window *win = data; - - // Alloc - Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext)); - if (rd_priv_ctx == NULL) { - return NULL; - } - - // Create renderer - rd_priv_ctx->renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED); - // Set scale quality - SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "linear"); - - // Create Mutex - rd_priv_ctx->lock = SDL_CreateMutex(); - if (rd_priv_ctx->lock == NULL) { - fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError()); - free(rd_priv_ctx); - return NULL; - } - - rd_priv_ctx->tex = NULL; - - return 
rd_priv_ctx; -} - -static void sdl_renderer_destroy(void *cookie) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_DestroyRenderer(rd_priv_ctx->renderer); - SDL_DestroyMutex(rd_priv_ctx->lock); - free(rd_priv_ctx); -} - -static void sdl_render(void *cookie, const Dav1dPlaySettings *settings) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_LockMutex(rd_priv_ctx->lock); - - if (rd_priv_ctx->tex == NULL) { - SDL_UnlockMutex(rd_priv_ctx->lock); - return; - } - - // Display the frame - SDL_RenderClear(rd_priv_ctx->renderer); - SDL_RenderCopy(rd_priv_ctx->renderer, rd_priv_ctx->tex, NULL, NULL); - SDL_RenderPresent(rd_priv_ctx->renderer); - - SDL_UnlockMutex(rd_priv_ctx->lock); -} - -static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic, - const Dav1dPlaySettings *settings) -{ - Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; - assert(rd_priv_ctx != NULL); - - SDL_LockMutex(rd_priv_ctx->lock); - - if (dav1d_pic == NULL) { - rd_priv_ctx->tex = NULL; - SDL_UnlockMutex(rd_priv_ctx->lock); - return 0; - } - - int width = dav1d_pic->p.w; - int height = dav1d_pic->p.h; - int tex_w = width; - int tex_h = height; - - enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout; - - if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) { - fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n"); - exit(50); - } - - SDL_Texture *texture = rd_priv_ctx->tex; - if (texture != NULL) { - SDL_QueryTexture(texture, NULL, NULL, &tex_w, &tex_h); - if (tex_w != width || tex_h != height) { - SDL_DestroyTexture(texture); - texture = NULL; - } - } - - if (texture == NULL) { - texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV, - SDL_TEXTUREACCESS_STREAMING, width, height); - } - - SDL_UpdateYUVTexture(texture, NULL, - dav1d_pic->data[0], (int)dav1d_pic->stride[0], // Y - dav1d_pic->data[1], (int)dav1d_pic->stride[1], 
// U - dav1d_pic->data[2], (int)dav1d_pic->stride[1] // V - ); - - rd_priv_ctx->tex = texture; - SDL_UnlockMutex(rd_priv_ctx->lock); - return 0; -} - -static const Dav1dPlayRenderInfo renderer_info = { - .create_renderer = sdl_renderer_create, - .destroy_renderer = sdl_renderer_destroy, - .render = sdl_render, - .update_frame = sdl_update_texture -}; - -#endif +// Selected renderer callbacks and cookie +static const Dav1dPlayRenderInfo *renderer_info = { NULL }; /** * Render context structure @@ -722,8 +53,6 @@ typedef struct render_context Dav1dPlaySettings settings; Dav1dSettings lib_settings; - // Renderer callbacks - Dav1dPlayRenderInfo *renderer_info; // Renderer private data (passed to callbacks) void *rd_priv; @@ -768,7 +97,9 @@ static void dp_settings_print_usage(const char *const app, " --tilethreads $num: number of tile threads (default: 1)\n" " --highquality: enable high quality rendering\n" " --zerocopy/-z: enable zero copy upload path\n" - " --version/-v: print version and exit\n"); + " --gpugrain/-g: enable GPU grain synthesis\n" + " --version/-v: print version and exit\n" + " --renderer/-r: select renderer backend (default: auto)\n"); exit(1); } @@ -791,7 +122,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, Dav1dSettings *lib_settings = &rd_ctx->lib_settings; // Short options - static const char short_opts[] = "i:vuz"; + static const char short_opts[] = "i:vuzgr:"; enum { ARG_FRAME_THREADS = 256, @@ -808,6 +139,8 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, { "tilethreads", 1, NULL, ARG_TILE_THREADS }, { "highquality", 0, NULL, ARG_HIGH_QUALITY }, { "zerocopy", 0, NULL, 'z' }, + { "gpugrain", 0, NULL, 'g' }, + { "renderer", 0, NULL, 'r'}, { NULL, 0, NULL, 0 }, }; @@ -824,15 +157,15 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, break; case ARG_HIGH_QUALITY: settings->highquality = true; -#ifndef HAVE_PLACEBO_VULKAN - fprintf(stderr, "warning: --highquality requires libplacebo\n"); 
-#endif break; case 'z': settings->zerocopy = true; -#ifndef HAVE_PLACEBO_VULKAN - fprintf(stderr, "warning: --zerocopy requires libplacebo\n"); -#endif + break; + case 'g': + settings->gpugrain = true; + break; + case 'r': + settings->renderer_name = optarg; break; case ARG_FRAME_THREADS: lib_settings->n_frame_threads = @@ -852,6 +185,8 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx, "Extra/unused arguments found, e.g. '%s'\n", argv[optind]); if (!settings->inputfile) dp_settings_print_usage(argv[0], "Input file (-i/--input) is required"); + if (settings->renderer_name && strcmp(settings->renderer_name, "auto") == 0) + settings->renderer_name = NULL; } /** @@ -861,7 +196,7 @@ static void dp_rd_ctx_destroy(Dav1dPlayRenderContext *rd_ctx) { assert(rd_ctx != NULL); - renderer_info.destroy_renderer(rd_ctx->rd_priv); + renderer_info->destroy_renderer(rd_ctx->rd_priv); dp_fifo_destroy(rd_ctx->fifo); SDL_DestroyMutex(rd_ctx->lock); free(rd_ctx); @@ -873,7 +208,7 @@ static void dp_rd_ctx_destroy(Dav1dPlayRenderContext *rd_ctx) * \note The Dav1dPlayRenderContext must be destroyed * again by using dp_rd_ctx_destroy. */ -static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data) +static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv) { Dav1dPlayRenderContext *rd_ctx; @@ -907,7 +242,22 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data) return NULL; } - rd_ctx->rd_priv = renderer_info.create_renderer(rd_data); + // Parse and validate arguments + dav1d_default_settings(&rd_ctx->lib_settings); + memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings)); + dp_rd_ctx_parse_args(rd_ctx, argc, argv); + + // Select renderer + renderer_info = dp_get_renderer(rd_ctx->settings.renderer_name); + + if (renderer_info == NULL) { + printf("No suitable rendered matching %s found.\n", + (rd_ctx->settings.renderer_name) ? 
rd_ctx->settings.renderer_name : "auto"); + } else { + printf("Using %s renderer\n", renderer_info->name); + } + + rd_ctx->rd_priv = (renderer_info) ? renderer_info->create_renderer() : NULL; if (rd_ctx->rd_priv == NULL) { SDL_DestroyMutex(rd_ctx->lock); dp_fifo_destroy(rd_ctx->fifo); @@ -915,9 +265,6 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data) return NULL; } - dav1d_default_settings(&rd_ctx->lib_settings); - memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings)); - rd_ctx->last_pts = 0; rd_ctx->last_ticks = 0; rd_ctx->current_pts = 0; @@ -949,7 +296,7 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code) static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx, Dav1dPicture *dav1d_pic) { - renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings); + renderer_info->update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings); rd_ctx->current_pts = dav1d_pic->m.timestamp; } @@ -1004,7 +351,7 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx) fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000); } - renderer_info.render(rd_ctx->rd_priv, &rd_ctx->settings); + renderer_info->render(rd_ctx->rd_priv, &rd_ctx->settings); rd_ctx->last_ticks = SDL_GetTicks(); } @@ -1152,7 +499,6 @@ static int decoder_thread_main(void *cookie) int main(int argc, char **argv) { SDL_Thread *decoder_thread; - SDL_Window *win = NULL; // Check for version mismatch between library and tool const char *version = dav1d_version(); @@ -1166,34 +512,30 @@ int main(int argc, char **argv) if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0) return 10; - // Create Window and Renderer - int window_flags = SDL_WINDOW_SHOWN | SDL_WINDOW_ALLOW_HIGHDPI; -#ifdef HAVE_PLACEBO_VULKAN - window_flags |= SDL_WINDOW_VULKAN; -#endif - win = SDL_CreateWindow("Dav1dPlay", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, - WINDOW_WIDTH, WINDOW_HEIGHT, window_flags); - SDL_SetWindowResizable(win, 
SDL_TRUE); - // Create render context - Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(win); + Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(argc, argv); if (rd_ctx == NULL) { fprintf(stderr, "Failed creating render context\n"); return 5; } - // Parse and validate arguments - dp_rd_ctx_parse_args(rd_ctx, argc, argv); - if (rd_ctx->settings.zerocopy) { - if (renderer_info.alloc_pic) { + if (renderer_info->alloc_pic) { rd_ctx->lib_settings.allocator = (Dav1dPicAllocator) { .cookie = rd_ctx->rd_priv, - .alloc_picture_callback = renderer_info.alloc_pic, - .release_picture_callback = renderer_info.release_pic, + .alloc_picture_callback = renderer_info->alloc_pic, + .release_picture_callback = renderer_info->release_pic, }; } else { - fprintf(stderr, "--zerocopy unsupported by compiled renderer\n"); + fprintf(stderr, "--zerocopy unsupported by selected renderer\n"); + } + } + + if (rd_ctx->settings.gpugrain) { + if (renderer_info->supports_gpu_grain) { + rd_ctx->lib_settings.apply_grain = 0; + } else { + fprintf(stderr, "--gpugrain unsupported by selected renderer\n"); } } @@ -1207,6 +549,10 @@ int main(int argc, char **argv) if (SDL_WaitEvent(&e)) { if (e.type == SDL_QUIT) { dp_rd_ctx_request_shutdown(rd_ctx); + } else if (e.type == SDL_WINDOWEVENT) { + if (e.window.event == SDL_WINDOWEVENT_SIZE_CHANGED) { + // TODO: Handle window resizes + } } else if (e.type == rd_ctx->renderer_event_type) { if (e.user.code == DAV1D_EVENT_NEW_FRAME) { // Dequeue frame and update the render context with it @@ -1232,7 +578,6 @@ int main(int argc, char **argv) SDL_WaitThread(decoder_thread, &decoder_ret); dp_rd_ctx_destroy(rd_ctx); - SDL_DestroyWindow(win); return decoder_ret; } diff --git a/ffmpeg/JNI/dav1d/examples/dp_fifo.c b/ffmpeg/JNI/dav1d/examples/dp_fifo.c new file mode 100644 index 000000000..243d2e933 --- /dev/null +++ b/ffmpeg/JNI/dav1d/examples/dp_fifo.c @@ -0,0 +1,123 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +#include + +#include "dp_fifo.h" + +// FIFO structure +struct dp_fifo +{ + SDL_mutex *lock; + SDL_cond *cond_change; + size_t capacity; + size_t count; + void **entries; +}; + + +Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity) +{ + Dav1dPlayPtrFifo *fifo; + + assert(capacity > 0); + if (capacity <= 0) + return NULL; + + fifo = malloc(sizeof(*fifo)); + if (fifo == NULL) + return NULL; + + fifo->capacity = capacity; + fifo->count = 0; + + fifo->lock = SDL_CreateMutex(); + if (fifo->lock == NULL) { + free(fifo); + return NULL; + } + fifo->cond_change = SDL_CreateCond(); + if (fifo->cond_change == NULL) { + SDL_DestroyMutex(fifo->lock); + free(fifo); + return NULL; + } + + fifo->entries = calloc(capacity, sizeof(void*)); + if (fifo->entries == NULL) { + dp_fifo_destroy(fifo); + return NULL; + } + + return fifo; +} + +// Destroy FIFO +void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo) +{ + assert(fifo->count == 0); + SDL_DestroyMutex(fifo->lock); + SDL_DestroyCond(fifo->cond_change); + free(fifo->entries); + free(fifo); +} + +// Push to FIFO +void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element) +{ + SDL_LockMutex(fifo->lock); + while (fifo->count == fifo->capacity) + SDL_CondWait(fifo->cond_change, fifo->lock); + fifo->entries[fifo->count++] = element; + if (fifo->count == 1) + SDL_CondSignal(fifo->cond_change); + SDL_UnlockMutex(fifo->lock); +} + +// Helper that shifts the FIFO array +static void *dp_fifo_array_shift(void **arr, size_t len) +{ + void *shifted_element = arr[0]; + for (size_t i = 1; i < len; ++i) + arr[i-1] = arr[i]; + return shifted_element; +} + +// Get item from FIFO +void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo) +{ + SDL_LockMutex(fifo->lock); + while (fifo->count == 0) + SDL_CondWait(fifo->cond_change, fifo->lock); + void *res = dp_fifo_array_shift(fifo->entries, fifo->count--); + if (fifo->count == fifo->capacity - 1) + SDL_CondSignal(fifo->cond_change); + SDL_UnlockMutex(fifo->lock); + return res; +} + + diff --git 
a/ffmpeg/JNI/dav1d/builddir/include/dav1d/version.h b/ffmpeg/JNI/dav1d/examples/dp_fifo.h similarity index 59% rename from ffmpeg/JNI/dav1d/builddir/include/dav1d/version.h rename to ffmpeg/JNI/dav1d/examples/dp_fifo.h index 3caccad31..a94b089b2 100644 --- a/ffmpeg/JNI/dav1d/builddir/include/dav1d/version.h +++ b/ffmpeg/JNI/dav1d/examples/dp_fifo.h @@ -24,11 +24,38 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef DAV1D_VERSION_H -#define DAV1D_VERSION_H +/* + * Dav1dPlay FIFO helper + */ -#define DAV1D_API_VERSION_MAJOR 3 -#define DAV1D_API_VERSION_MINOR 0 -#define DAV1D_API_VERSION_PATCH 0 +typedef struct dp_fifo Dav1dPlayPtrFifo; -#endif /* DAV1D_VERSION_H */ +/* Create a FIFO + * + * Creates a FIFO with the given capacity. + * If the capacity is reached, new inserts into the FIFO + * will block until enough space is available again. + */ +Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity); + +/* Destroy a FIFO + * + * The FIFO must be empty before it is destroyed! + */ +void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo); + +/* Shift FIFO + * + * Return the first item from the FIFO, thereby removing it from + * the FIFO and making room for new entries. + */ +void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo); + +/* Push to FIFO + * + * Add an item to the end of the FIFO. + * If the FIFO is full, this call will block until there is again enough + * space in the FIFO, so calling this from the "consumer" thread if no + * other thread will call dp_fifo_shift will lead to a deadlock. + */ +void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element); diff --git a/ffmpeg/JNI/dav1d/examples/dp_renderer.h b/ffmpeg/JNI/dav1d/examples/dp_renderer.h new file mode 100644 index 000000000..4c6f2954f --- /dev/null +++ b/ffmpeg/JNI/dav1d/examples/dp_renderer.h @@ -0,0 +1,132 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +#include "dav1d/dav1d.h" + +#include +#ifdef HAVE_PLACEBO +# include +#endif + +// Check libplacebo Vulkan rendering +#if defined(HAVE_VULKAN) && defined(SDL_VIDEO_VULKAN) +# if defined(PL_HAVE_VULKAN) && PL_HAVE_VULKAN +# define HAVE_RENDERER_PLACEBO +# define HAVE_PLACEBO_VULKAN +# endif +#endif + +// Check libplacebo OpenGL rendering +#if defined(PL_HAVE_OPENGL) && PL_HAVE_OPENGL +# define HAVE_RENDERER_PLACEBO +# define HAVE_PLACEBO_OPENGL +#endif + +/** + * Settings structure + * Hold all settings available for the player, + * this is usually filled by parsing arguments + * from the console. 
+ */ +typedef struct { + const char *inputfile; + const char *renderer_name; + int highquality; + int untimed; + int zerocopy; + int gpugrain; +} Dav1dPlaySettings; + +#define WINDOW_WIDTH 910 +#define WINDOW_HEIGHT 512 + +#define DAV1D_EVENT_NEW_FRAME 1 +#define DAV1D_EVENT_DEC_QUIT 2 + +/** + * Renderer info + */ +typedef struct rdr_info +{ + // Renderer name + const char *name; + // Cookie passed to the renderer implementation callbacks + void *cookie; + // Callback to create the renderer + void* (*create_renderer)(); + // Callback to destroy the renderer + void (*destroy_renderer)(void *cookie); + // Callback to the render function that renders a prevously sent frame + void (*render)(void *cookie, const Dav1dPlaySettings *settings); + // Callback to the send frame function + int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic, + const Dav1dPlaySettings *settings); + // Callback for alloc/release pictures (optional) + int (*alloc_pic)(Dav1dPicture *pic, void *cookie); + void (*release_pic)(Dav1dPicture *pic, void *cookie); + // Whether or not this renderer can apply on-GPU film grain synthesis + int supports_gpu_grain; +} Dav1dPlayRenderInfo; + +extern const Dav1dPlayRenderInfo rdr_placebo_vk; +extern const Dav1dPlayRenderInfo rdr_placebo_gl; +extern const Dav1dPlayRenderInfo rdr_sdl; + +// Available renderes ordered by priority +static const Dav1dPlayRenderInfo* const dp_renderers[] = { + &rdr_placebo_vk, + &rdr_placebo_gl, + &rdr_sdl, +}; + +static inline const Dav1dPlayRenderInfo *dp_get_renderer(const char *name) +{ + for (size_t i = 0; i < (sizeof(dp_renderers)/sizeof(*dp_renderers)); ++i) + { + if (dp_renderers[i]->name == NULL) + continue; + + if (name == NULL || strcmp(name, dp_renderers[i]->name) == 0) { + return dp_renderers[i]; + } + } + return NULL; +} + +static inline SDL_Window *dp_create_sdl_window(int window_flags) +{ + SDL_Window *win; + window_flags |= SDL_WINDOW_SHOWN | SDL_WINDOW_ALLOW_HIGHDPI; + + win = SDL_CreateWindow("Dav1dPlay", 
SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, + WINDOW_WIDTH, WINDOW_HEIGHT, window_flags); + SDL_SetWindowResizable(win, SDL_TRUE); + + return win; +} diff --git a/ffmpeg/JNI/dav1d/examples/dp_renderer_placebo.c b/ffmpeg/JNI/dav1d/examples/dp_renderer_placebo.c new file mode 100644 index 000000000..beb1d42ad --- /dev/null +++ b/ffmpeg/JNI/dav1d/examples/dp_renderer_placebo.c @@ -0,0 +1,723 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "dp_renderer.h" + +#ifdef HAVE_RENDERER_PLACEBO +#include + +#include +#include + +#ifdef HAVE_PLACEBO_VULKAN +# include +# include +#endif +#ifdef HAVE_PLACEBO_OPENGL +# include +# include +#endif + + +/** + * Renderer context for libplacebo + */ +typedef struct renderer_priv_ctx +{ + // SDL window + SDL_Window *win; + // Placebo context + struct pl_context *ctx; + // Placebo renderer + struct pl_renderer *renderer; +#ifdef HAVE_PLACEBO_VULKAN + // Placebo Vulkan handle + const struct pl_vulkan *vk; + // Placebo Vulkan instance + const struct pl_vk_inst *vk_inst; + // Vulkan surface + VkSurfaceKHR surf; +#endif +#ifdef HAVE_PLACEBO_OPENGL + // Placebo OpenGL handle + const struct pl_opengl *gl; +#endif + // Placebo GPU + const struct pl_gpu *gpu; + // Placebo swapchain + const struct pl_swapchain *swapchain; + // Lock protecting access to the texture + SDL_mutex *lock; + // Image to render, and planes backing them + struct pl_image image; + const struct pl_tex *plane_tex[3]; +} Dav1dPlayRendererPrivateContext; + +static Dav1dPlayRendererPrivateContext* + placebo_renderer_create_common(int window_flags) +{ + // Create Window + SDL_Window *sdlwin = dp_create_sdl_window(window_flags | SDL_WINDOW_RESIZABLE); + if (sdlwin == NULL) + return NULL; + + // Alloc + Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext)); + if (rd_priv_ctx == NULL) { + return NULL; + } + + *rd_priv_ctx = (Dav1dPlayRendererPrivateContext) {0}; + rd_priv_ctx->win = sdlwin; + + // Init libplacebo + rd_priv_ctx->ctx = pl_context_create(PL_API_VER, &(struct pl_context_params) { + .log_cb = pl_log_color, +#ifndef NDEBUG + .log_level = PL_LOG_DEBUG, +#else + .log_level = PL_LOG_WARN, +#endif + }); + if (rd_priv_ctx->ctx == NULL) { + free(rd_priv_ctx); + return NULL; + } + + // Create Mutex + rd_priv_ctx->lock = SDL_CreateMutex(); + if (rd_priv_ctx->lock == NULL) { + fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError()); + 
pl_context_destroy(&(rd_priv_ctx->ctx)); + free(rd_priv_ctx); + return NULL; + } + + return rd_priv_ctx; +} + +#ifdef HAVE_PLACEBO_OPENGL +static void *placebo_renderer_create_gl() +{ + SDL_Window *sdlwin = NULL; + SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG); + + // Common init + Dav1dPlayRendererPrivateContext *rd_priv_ctx = + placebo_renderer_create_common(SDL_WINDOW_OPENGL); + + if (rd_priv_ctx == NULL) + return NULL; + sdlwin = rd_priv_ctx->win; + + // Init OpenGL + struct pl_opengl_params params = pl_opengl_default_params; +# ifndef NDEBUG + params.debug = true; +# endif + + SDL_GLContext glcontext = SDL_GL_CreateContext(sdlwin); + SDL_GL_MakeCurrent(sdlwin, glcontext); + + rd_priv_ctx->gl = pl_opengl_create(rd_priv_ctx->ctx, ¶ms); + if (!rd_priv_ctx->gl) { + fprintf(stderr, "Failed creating opengl device!\n"); + exit(2); + } + + rd_priv_ctx->swapchain = pl_opengl_create_swapchain(rd_priv_ctx->gl, + &(struct pl_opengl_swapchain_params) { + .swap_buffers = (void (*)(void *)) SDL_GL_SwapWindow, + .priv = sdlwin, + }); + + if (!rd_priv_ctx->swapchain) { + fprintf(stderr, "Failed creating opengl swapchain!\n"); + exit(2); + } + + int w = WINDOW_WIDTH, h = WINDOW_HEIGHT; + SDL_GL_GetDrawableSize(sdlwin, &w, &h); + + if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) { + fprintf(stderr, "Failed resizing vulkan swapchain!\n"); + exit(2); + } + + rd_priv_ctx->gpu = rd_priv_ctx->gl->gpu; + + if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT) + printf("Note: window dimensions differ (got %dx%d)\n", w, h); + + return rd_priv_ctx; +} +#endif + +#ifdef HAVE_PLACEBO_VULKAN +static void *placebo_renderer_create_vk() +{ + SDL_Window *sdlwin = NULL; + + // Common init + Dav1dPlayRendererPrivateContext *rd_priv_ctx = + placebo_renderer_create_common(SDL_WINDOW_VULKAN); + + if (rd_priv_ctx == NULL) + return NULL; + sdlwin = rd_priv_ctx->win; + + // Init Vulkan + unsigned num = 0; + if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, NULL)) { + 
fprintf(stderr, "Failed enumerating Vulkan extensions: %s\n", SDL_GetError()); + exit(1); + } + + const char **extensions = malloc(num * sizeof(const char *)); + assert(extensions); + + SDL_bool ok = SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, extensions); + if (!ok) { + fprintf(stderr, "Failed getting Vk instance extensions\n"); + exit(1); + } + + if (num > 0) { + printf("Requesting %d additional Vulkan extensions:\n", num); + for (unsigned i = 0; i < num; i++) + printf(" %s\n", extensions[i]); + } + + struct pl_vk_inst_params iparams = pl_vk_inst_default_params; + iparams.extensions = extensions; + iparams.num_extensions = num; + + rd_priv_ctx->vk_inst = pl_vk_inst_create(rd_priv_ctx->ctx, &iparams); + if (!rd_priv_ctx->vk_inst) { + fprintf(stderr, "Failed creating Vulkan instance!\n"); + exit(1); + } + free(extensions); + + if (!SDL_Vulkan_CreateSurface(sdlwin, rd_priv_ctx->vk_inst->instance, &rd_priv_ctx->surf)) { + fprintf(stderr, "Failed creating vulkan surface: %s\n", SDL_GetError()); + exit(1); + } + + struct pl_vulkan_params params = pl_vulkan_default_params; + params.instance = rd_priv_ctx->vk_inst->instance; + params.surface = rd_priv_ctx->surf; + params.allow_software = true; + + rd_priv_ctx->vk = pl_vulkan_create(rd_priv_ctx->ctx, ¶ms); + if (!rd_priv_ctx->vk) { + fprintf(stderr, "Failed creating vulkan device!\n"); + exit(2); + } + + // Create swapchain + rd_priv_ctx->swapchain = pl_vulkan_create_swapchain(rd_priv_ctx->vk, + &(struct pl_vulkan_swapchain_params) { + .surface = rd_priv_ctx->surf, + .present_mode = VK_PRESENT_MODE_IMMEDIATE_KHR, + }); + + if (!rd_priv_ctx->swapchain) { + fprintf(stderr, "Failed creating vulkan swapchain!\n"); + exit(2); + } + + int w = WINDOW_WIDTH, h = WINDOW_HEIGHT; + if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) { + fprintf(stderr, "Failed resizing vulkan swapchain!\n"); + exit(2); + } + + rd_priv_ctx->gpu = rd_priv_ctx->vk->gpu; + + if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT) + printf("Note: window 
dimensions differ (got %dx%d)\n", w, h); + + return rd_priv_ctx; +} +#endif + +static void placebo_renderer_destroy(void *cookie) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + pl_renderer_destroy(&(rd_priv_ctx->renderer)); + pl_swapchain_destroy(&(rd_priv_ctx->swapchain)); + for (int i = 0; i < 3; i++) + pl_tex_destroy(rd_priv_ctx->gpu, &(rd_priv_ctx->plane_tex[i])); + +#ifdef HAVE_PLACEBO_VULKAN + if (rd_priv_ctx->vk) { + pl_vulkan_destroy(&(rd_priv_ctx->vk)); + vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL); + pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst)); + } +#endif +#ifdef HAVE_PLACEBO_OPENGL + if (rd_priv_ctx->gl) + pl_opengl_destroy(&(rd_priv_ctx->gl)); +#endif + + SDL_DestroyWindow(rd_priv_ctx->win); + + pl_context_destroy(&(rd_priv_ctx->ctx)); +} + +static void placebo_render(void *cookie, const Dav1dPlaySettings *settings) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + SDL_LockMutex(rd_priv_ctx->lock); + if (!rd_priv_ctx->image.num_planes) { + SDL_UnlockMutex(rd_priv_ctx->lock); + return; + } + + // Prepare rendering + if (rd_priv_ctx->renderer == NULL) { + rd_priv_ctx->renderer = pl_renderer_create(rd_priv_ctx->ctx, rd_priv_ctx->gpu); + } + + struct pl_swapchain_frame frame; + bool ok = pl_swapchain_start_frame(rd_priv_ctx->swapchain, &frame); + if (!ok) { + SDL_UnlockMutex(rd_priv_ctx->lock); + return; + } + + struct pl_render_params render_params = {0}; + if (settings->highquality) + render_params = pl_render_default_params; + + struct pl_render_target target; + pl_render_target_from_swapchain(&target, &frame); + target.profile = (struct pl_icc_profile) { + .data = NULL, + .len = 0, + }; + +#if PL_API_VER >= 66 + pl_rect2df_aspect_copy(&target.dst_rect, &rd_priv_ctx->image.src_rect, 0.0); + if (pl_render_target_partial(&target)) + pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 0.0 }); +#endif + + if 
(!pl_render_image(rd_priv_ctx->renderer, &rd_priv_ctx->image, &target, &render_params)) { + fprintf(stderr, "Failed rendering frame!\n"); + pl_tex_clear(rd_priv_ctx->gpu, target.fbo, (float[4]){ 1.0 }); + } + + ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain); + if (!ok) { + fprintf(stderr, "Failed submitting frame!\n"); + SDL_UnlockMutex(rd_priv_ctx->lock); + return; + } + + pl_swapchain_swap_buffers(rd_priv_ctx->swapchain); + SDL_UnlockMutex(rd_priv_ctx->lock); +} + +static int placebo_upload_image(void *cookie, Dav1dPicture *dav1d_pic, + const Dav1dPlaySettings *settings) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + SDL_LockMutex(rd_priv_ctx->lock); + + if (dav1d_pic == NULL) { + SDL_UnlockMutex(rd_priv_ctx->lock); + return 0; + } + + int width = dav1d_pic->p.w; + int height = dav1d_pic->p.h; + int sub_x = 0, sub_y = 0; + int bytes = (dav1d_pic->p.bpc + 7) / 8; // rounded up + enum pl_chroma_location chroma_loc = PL_CHROMA_UNKNOWN; + + struct pl_image *image = &rd_priv_ctx->image; + *image = (struct pl_image) { + .num_planes = 3, + .width = width, + .height = height, + .src_rect = {0, 0, width, height}, + + .repr = { + .bits = { + .sample_depth = bytes * 8, + .color_depth = dav1d_pic->p.bpc, + }, + }, + }; + + // Figure out the correct plane dimensions/count + switch (dav1d_pic->p.layout) { + case DAV1D_PIXEL_LAYOUT_I400: + image->num_planes = 1; + break; + case DAV1D_PIXEL_LAYOUT_I420: + sub_x = sub_y = 1; + break; + case DAV1D_PIXEL_LAYOUT_I422: + sub_x = 1; + break; + case DAV1D_PIXEL_LAYOUT_I444: + break; + } + + // Set the right colorspace metadata etc. 
+ switch (dav1d_pic->seq_hdr->pri) { + case DAV1D_COLOR_PRI_UNKNOWN: image->color.primaries = PL_COLOR_PRIM_UNKNOWN; break; + case DAV1D_COLOR_PRI_BT709: image->color.primaries = PL_COLOR_PRIM_BT_709; break; + case DAV1D_COLOR_PRI_BT470M: image->color.primaries = PL_COLOR_PRIM_BT_470M; break; + case DAV1D_COLOR_PRI_BT470BG: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break; + case DAV1D_COLOR_PRI_BT601: image->color.primaries = PL_COLOR_PRIM_BT_601_625; break; + case DAV1D_COLOR_PRI_BT2020: image->color.primaries = PL_COLOR_PRIM_BT_2020; break; + + case DAV1D_COLOR_PRI_XYZ: + // Handled below + assert(dav1d_pic->seq_hdr->mtrx == DAV1D_MC_IDENTITY); + break; + + default: + printf("warning: unknown dav1d color primaries %d.. ignoring, picture " + "may be very incorrect\n", dav1d_pic->seq_hdr->pri); + break; + } + + switch (dav1d_pic->seq_hdr->trc) { + case DAV1D_TRC_BT709: + case DAV1D_TRC_BT470M: + case DAV1D_TRC_BT470BG: + case DAV1D_TRC_BT601: + case DAV1D_TRC_SMPTE240: + case DAV1D_TRC_BT2020_10BIT: + case DAV1D_TRC_BT2020_12BIT: + // These all map to the effective "SDR" CRT-based EOTF, BT.1886 + image->color.transfer = PL_COLOR_TRC_BT_1886; + break; + + case DAV1D_TRC_UNKNOWN: image->color.transfer = PL_COLOR_TRC_UNKNOWN; break; + case DAV1D_TRC_LINEAR: image->color.transfer = PL_COLOR_TRC_LINEAR; break; + case DAV1D_TRC_SRGB: image->color.transfer = PL_COLOR_TRC_SRGB; break; + case DAV1D_TRC_SMPTE2084: image->color.transfer = PL_COLOR_TRC_PQ; break; + case DAV1D_TRC_HLG: image->color.transfer = PL_COLOR_TRC_HLG; break; + + default: + printf("warning: unknown dav1d color transfer %d.. 
ignoring, picture " + "may be very incorrect\n", dav1d_pic->seq_hdr->trc); + break; + } + + switch (dav1d_pic->seq_hdr->mtrx) { + case DAV1D_MC_IDENTITY: + // This is going to be either RGB or XYZ + if (dav1d_pic->seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) { + image->repr.sys = PL_COLOR_SYSTEM_XYZ; + } else { + image->repr.sys = PL_COLOR_SYSTEM_RGB; + } + break; + + case DAV1D_MC_UNKNOWN: + // PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so hard-code this one + image->repr.sys = pl_color_system_guess_ycbcr(width, height); + break; + + case DAV1D_MC_BT709: image->repr.sys = PL_COLOR_SYSTEM_BT_709; break; + case DAV1D_MC_BT601: image->repr.sys = PL_COLOR_SYSTEM_BT_601; break; + case DAV1D_MC_SMPTE240: image->repr.sys = PL_COLOR_SYSTEM_SMPTE_240M; break; + case DAV1D_MC_SMPTE_YCGCO: image->repr.sys = PL_COLOR_SYSTEM_YCGCO; break; + case DAV1D_MC_BT2020_NCL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_NC; break; + case DAV1D_MC_BT2020_CL: image->repr.sys = PL_COLOR_SYSTEM_BT_2020_C; break; + + case DAV1D_MC_ICTCP: + // This one is split up based on the actual HDR curve in use + if (dav1d_pic->seq_hdr->trc == DAV1D_TRC_HLG) { + image->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG; + } else { + image->repr.sys = PL_COLOR_SYSTEM_BT_2100_PQ; + } + break; + + default: + printf("warning: unknown dav1d color matrix %d.. 
ignoring, picture " + "may be very incorrect\n", dav1d_pic->seq_hdr->mtrx); + break; + } + + if (dav1d_pic->seq_hdr->color_range) { + image->repr.levels = PL_COLOR_LEVELS_PC; + } else { + image->repr.levels = PL_COLOR_LEVELS_TV; + } + + switch (dav1d_pic->seq_hdr->chr) { + case DAV1D_CHR_UNKNOWN: chroma_loc = PL_CHROMA_UNKNOWN; break; + case DAV1D_CHR_VERTICAL: chroma_loc = PL_CHROMA_LEFT; break; + case DAV1D_CHR_COLOCATED: chroma_loc = PL_CHROMA_TOP_LEFT; break; + } + +#if PL_API_VER >= 63 + if (settings->gpugrain && dav1d_pic->frame_hdr->film_grain.present) { + Dav1dFilmGrainData *src = &dav1d_pic->frame_hdr->film_grain.data; + struct pl_av1_grain_data *dst = &image->av1_grain; + *dst = (struct pl_av1_grain_data) { + .grain_seed = src->seed, + .num_points_y = src->num_y_points, + .chroma_scaling_from_luma = src->chroma_scaling_from_luma, + .num_points_uv = { src->num_uv_points[0], src->num_uv_points[1] }, + .scaling_shift = src->scaling_shift, + .ar_coeff_lag = src->ar_coeff_lag, + .ar_coeff_shift = src->ar_coeff_shift, + .grain_scale_shift = src->grain_scale_shift, + .uv_mult = { src->uv_mult[0], src->uv_mult[1] }, + .uv_mult_luma = { src->uv_luma_mult[0], src->uv_luma_mult[1] }, + .uv_offset = { src->uv_offset[0], src->uv_offset[1] }, + .overlap = src->overlap_flag, + }; + + assert(sizeof(dst->points_y) == sizeof(src->y_points)); + assert(sizeof(dst->points_uv) == sizeof(src->uv_points)); + assert(sizeof(dst->ar_coeffs_y) == sizeof(src->ar_coeffs_y)); + memcpy(dst->points_y, src->y_points, sizeof(src->y_points)); + memcpy(dst->points_uv, src->uv_points, sizeof(src->uv_points)); + memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(src->ar_coeffs_y)); + + // this one has different row sizes for alignment + for (int c = 0; c < 2; c++) { + for (int i = 0; i < 25; i++) + dst->ar_coeffs_uv[c][i] = src->ar_coeffs_uv[c][i]; + } + } +#endif + + // Upload the actual planes + struct pl_plane_data data[3] = { + { + // Y plane + .type = PL_FMT_UNORM, + .width = width, + 
.height = height, + .pixel_stride = bytes, + .row_stride = dav1d_pic->stride[0], + .component_size = {bytes * 8}, + .component_map = {0}, + }, { + // U plane + .type = PL_FMT_UNORM, + .width = width >> sub_x, + .height = height >> sub_y, + .pixel_stride = bytes, + .row_stride = dav1d_pic->stride[1], + .component_size = {bytes * 8}, + .component_map = {1}, + }, { + // V plane + .type = PL_FMT_UNORM, + .width = width >> sub_x, + .height = height >> sub_y, + .pixel_stride = bytes, + .row_stride = dav1d_pic->stride[1], + .component_size = {bytes * 8}, + .component_map = {2}, + }, + }; + + bool ok = true; + + for (int i = 0; i < image->num_planes; i++) { + if (settings->zerocopy) { + const struct pl_buf *buf = dav1d_pic->allocator_data; + assert(buf); + data[i].buf = buf; + data[i].buf_offset = (uintptr_t) dav1d_pic->data[i] - (uintptr_t) buf->data; + } else { + data[i].pixels = dav1d_pic->data[i]; + } + + ok &= pl_upload_plane(rd_priv_ctx->gpu, &image->planes[i], &rd_priv_ctx->plane_tex[i], &data[i]); + } + + // Apply the correct chroma plane shift. 
This has to be done after pl_upload_plane +#if PL_API_VER >= 67 + pl_image_set_chroma_location(image, chroma_loc); +#else + pl_chroma_location_offset(chroma_loc, &image->planes[1].shift_x, &image->planes[1].shift_y); + pl_chroma_location_offset(chroma_loc, &image->planes[2].shift_x, &image->planes[2].shift_y); +#endif + + if (!ok) { + fprintf(stderr, "Failed uploading planes!\n"); + *image = (struct pl_image) {0}; + } + + SDL_UnlockMutex(rd_priv_ctx->lock); + return !ok; +} + +// Align to power of 2 +#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1)) + +static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + SDL_LockMutex(rd_priv_ctx->lock); + + const struct pl_gpu *gpu = rd_priv_ctx->gpu; + int ret = DAV1D_ERR(ENOMEM); + + // Copied from dav1d_default_picture_alloc + const int hbd = p->p.bpc > 8; + const int aligned_w = ALIGN2(p->p.w, 128); + const int aligned_h = ALIGN2(p->p.h, 128); + const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400; + const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420; + const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444; + p->stride[0] = aligned_w << hbd; + p->stride[1] = has_chroma ? 
(aligned_w >> ss_hor) << hbd : 0; + + // Align strides up to multiples of the GPU performance hints + p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride); + p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride); + + // Aligning offsets to 4 also implicitly aligns to the texel size (1 or 2) + size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4); + const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align); + const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align); + + // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment, + // even in the case that the driver gives us insane alignments + const size_t pic_size = y_sz + 2 * uv_sz; + const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4; + + // Validate size limitations + if (total_size > gpu->limits.max_xfer_size) { + printf("alloc of %zu bytes exceeds limits\n", total_size); + goto err; + } + + const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) { + .type = PL_BUF_TEX_TRANSFER, + .host_mapped = true, + .size = total_size, + .memory_type = PL_BUF_MEM_HOST, + .user_data = p, + }); + + if (!buf) { + printf("alloc of GPU mapped buffer failed\n"); + goto err; + } + + assert(buf->data); + uintptr_t base = (uintptr_t) buf->data, data[3]; + data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT); + data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT); + data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT); + + // Sanity check offset alignment for the sake of debugging + if (data[0] - base != ALIGN2(data[0] - base, off_align) || + data[1] - base != ALIGN2(data[1] - base, off_align) || + data[2] - base != ALIGN2(data[2] - base, off_align)) + { + printf("GPU buffer horribly misaligned, expect slowdown!\n"); + } + + p->allocator_data = (void *) buf; + p->data[0] = (void *) data[0]; + p->data[1] = (void *) data[1]; + p->data[2] = (void *) data[2]; + ret = 0; + + // fall through +err: + 
SDL_UnlockMutex(rd_priv_ctx->lock); + return ret; +} + +static void placebo_release_pic(Dav1dPicture *pic, void *cookie) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + assert(pic->allocator_data); + + SDL_LockMutex(rd_priv_ctx->lock); + const struct pl_gpu *gpu = rd_priv_ctx->gpu; + pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data); + SDL_UnlockMutex(rd_priv_ctx->lock); +} + +#ifdef HAVE_PLACEBO_VULKAN +const Dav1dPlayRenderInfo rdr_placebo_vk = { + .name = "placebo-vk", + .create_renderer = placebo_renderer_create_vk, + .destroy_renderer = placebo_renderer_destroy, + .render = placebo_render, + .update_frame = placebo_upload_image, + .alloc_pic = placebo_alloc_pic, + .release_pic = placebo_release_pic, + +# if PL_API_VER >= 63 + .supports_gpu_grain = 1, +# endif +}; +#else +const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL }; +#endif + +#ifdef HAVE_PLACEBO_OPENGL +const Dav1dPlayRenderInfo rdr_placebo_gl = { + .name = "placebo-gl", + .create_renderer = placebo_renderer_create_gl, + .destroy_renderer = placebo_renderer_destroy, + .render = placebo_render, + .update_frame = placebo_upload_image, + .alloc_pic = placebo_alloc_pic, + .release_pic = placebo_release_pic, + +# if PL_API_VER >= 63 + .supports_gpu_grain = 1, +# endif +}; +#else +const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL }; +#endif + +#else +const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL }; +const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL }; +#endif diff --git a/ffmpeg/JNI/dav1d/examples/dp_renderer_sdl.c b/ffmpeg/JNI/dav1d/examples/dp_renderer_sdl.c new file mode 100644 index 000000000..078d61349 --- /dev/null +++ b/ffmpeg/JNI/dav1d/examples/dp_renderer_sdl.c @@ -0,0 +1,164 @@ +/* + * Copyright © 2020, VideoLAN and dav1d authors + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "dp_renderer.h" + +#include + +/** + * Renderer context for SDL + */ +typedef struct renderer_priv_ctx +{ + // SDL window + SDL_Window *win; + // SDL renderer + SDL_Renderer *renderer; + // Lock protecting access to the texture + SDL_mutex *lock; + // Texture to render + SDL_Texture *tex; +} Dav1dPlayRendererPrivateContext; + +static void *sdl_renderer_create() +{ + SDL_Window *win = dp_create_sdl_window(0); + if (win == NULL) + return NULL; + + // Alloc + Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext)); + if (rd_priv_ctx == NULL) { + return NULL; + } + rd_priv_ctx->win = win; + + // Create renderer + rd_priv_ctx->renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED); + // Set scale quality + SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "linear"); + + // Create Mutex + rd_priv_ctx->lock = SDL_CreateMutex(); + if (rd_priv_ctx->lock == NULL) { + fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError()); + free(rd_priv_ctx); + return NULL; + } + + rd_priv_ctx->tex = NULL; + + return rd_priv_ctx; +} + +static void sdl_renderer_destroy(void *cookie) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + SDL_DestroyRenderer(rd_priv_ctx->renderer); + SDL_DestroyMutex(rd_priv_ctx->lock); + free(rd_priv_ctx); +} + +static void sdl_render(void *cookie, const Dav1dPlaySettings *settings) +{ + Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + SDL_LockMutex(rd_priv_ctx->lock); + + if (rd_priv_ctx->tex == NULL) { + SDL_UnlockMutex(rd_priv_ctx->lock); + return; + } + + // Display the frame + SDL_RenderClear(rd_priv_ctx->renderer); + SDL_RenderCopy(rd_priv_ctx->renderer, rd_priv_ctx->tex, NULL, NULL); + SDL_RenderPresent(rd_priv_ctx->renderer); + + SDL_UnlockMutex(rd_priv_ctx->lock); +} + +static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic, + const Dav1dPlaySettings *settings) +{ + 
Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie; + assert(rd_priv_ctx != NULL); + + SDL_LockMutex(rd_priv_ctx->lock); + + if (dav1d_pic == NULL) { + rd_priv_ctx->tex = NULL; + SDL_UnlockMutex(rd_priv_ctx->lock); + return 0; + } + + int width = dav1d_pic->p.w; + int height = dav1d_pic->p.h; + int tex_w = width; + int tex_h = height; + + enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout; + + if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) { + fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n"); + exit(50); + } + + SDL_Texture *texture = rd_priv_ctx->tex; + if (texture != NULL) { + SDL_QueryTexture(texture, NULL, NULL, &tex_w, &tex_h); + if (tex_w != width || tex_h != height) { + SDL_DestroyTexture(texture); + texture = NULL; + } + } + + if (texture == NULL) { + texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV, + SDL_TEXTUREACCESS_STREAMING, width, height); + } + + SDL_UpdateYUVTexture(texture, NULL, + dav1d_pic->data[0], (int)dav1d_pic->stride[0], // Y + dav1d_pic->data[1], (int)dav1d_pic->stride[1], // U + dav1d_pic->data[2], (int)dav1d_pic->stride[1] // V + ); + + rd_priv_ctx->tex = texture; + SDL_UnlockMutex(rd_priv_ctx->lock); + return 0; +} + +const Dav1dPlayRenderInfo rdr_sdl = { + .name = "sdl", + .create_renderer = sdl_renderer_create, + .destroy_renderer = sdl_renderer_destroy, + .render = sdl_render, + .update_frame = sdl_update_texture +}; diff --git a/ffmpeg/JNI/dav1d/examples/meson.build b/ffmpeg/JNI/dav1d/examples/meson.build index bad1d902e..50e097a8d 100644 --- a/ffmpeg/JNI/dav1d/examples/meson.build +++ b/ffmpeg/JNI/dav1d/examples/meson.build @@ -35,28 +35,40 @@ endif # dav1d player sources dav1dplay_sources = files( 'dav1dplay.c', + 'dp_fifo.c', + 'dp_renderer_placebo.c', + 'dp_renderer_sdl.c', ) sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: true) if sdl2_dependency.found() + dav1dplay_deps = [sdl2_dependency] + dav1dplay_cflags = [] + 
placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false) - vulkan_dependency = dependency('vulkan', required: false) - sdl_has_vulkan = cc.has_header('SDL_vulkan.h', dependencies: [sdl2_dependency]) - cflag_placebo = [] - deps_placebo = [] - if placebo_dependency.found() and vulkan_dependency.found() and sdl_has_vulkan - cflag_placebo += '-DHAVE_PLACEBO_VULKAN=1' - deps_placebo = [vulkan_dependency, placebo_dependency] + + if placebo_dependency.found() + dav1dplay_deps += placebo_dependency + dav1dplay_cflags += '-DHAVE_PLACEBO' + + # If libplacebo is found, we might be able to use Vulkan + # with it, in which case we need the Vulkan library too. + vulkan_dependency = dependency('vulkan', required: false) + if vulkan_dependency.found() + dav1dplay_deps += vulkan_dependency + dav1dplay_cflags += '-DHAVE_VULKAN' + endif endif + dav1dplay = executable('dav1dplay', dav1dplay_sources, rev_target, link_with : [libdav1d, dav1d_input_objs], include_directories : [dav1d_inc_dirs], - dependencies : [getopt_dependency, sdl2_dependency, deps_placebo], + dependencies : [getopt_dependency, dav1dplay_deps], install : true, - c_args : cflag_placebo, + c_args : dav1dplay_cflags, ) endif diff --git a/ffmpeg/JNI/dav1d/gcovr.cfg b/ffmpeg/JNI/dav1d/gcovr.cfg new file mode 100644 index 000000000..f768de8a6 --- /dev/null +++ b/ffmpeg/JNI/dav1d/gcovr.cfg @@ -0,0 +1,3 @@ +exclude = .*/tests/.* +exclude = .*/tools/.* +exclude = .*/include/common/dump.h diff --git a/ffmpeg/JNI/dav1d/include/common/attributes.h b/ffmpeg/JNI/dav1d/include/common/attributes.h index d5c4ce50b..0683b5044 100644 --- a/ffmpeg/JNI/dav1d/include/common/attributes.h +++ b/ffmpeg/JNI/dav1d/include/common/attributes.h @@ -159,4 +159,8 @@ static inline int clzll(const unsigned long long mask) { } #endif /* !_MSC_VER */ +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + #endif /* DAV1D_COMMON_ATTRIBUTES_H */ diff --git a/ffmpeg/JNI/dav1d/include/common/mem.h 
b/ffmpeg/JNI/dav1d/include/common/mem.h index a633b2ae9..74cdaf23a 100644 --- a/ffmpeg/JNI/dav1d/include/common/mem.h +++ b/ffmpeg/JNI/dav1d/include/common/mem.h @@ -37,13 +37,13 @@ #include "common/attributes.h" /* - * Allocate 32-byte aligned memory. The return value can be released - * by calling the standard free() function. + * Allocate align-byte aligned memory. The return value can be released + * by calling the dav1d_free_aligned() function. */ static inline void *dav1d_alloc_aligned(size_t sz, size_t align) { + assert(!(align & (align - 1))); #ifdef HAVE_POSIX_MEMALIGN void *ptr; - assert(!(align & (align - 1))); if (posix_memalign(&ptr, align, sz)) return NULL; return ptr; #elif defined(HAVE_ALIGNED_MALLOC) diff --git a/ffmpeg/JNI/dav1d/meson.build b/ffmpeg/JNI/dav1d/meson.build index 730229bb6..d5366f9a7 100644 --- a/ffmpeg/JNI/dav1d/meson.build +++ b/ffmpeg/JNI/dav1d/meson.build @@ -1,4 +1,4 @@ -# Copyright © 2018-2019, VideoLAN and dav1d authors +# Copyright © 2018-2020, VideoLAN and dav1d authors # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -23,14 +23,14 @@ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
project('dav1d', ['c'], - version: '0.6.0', + version: '0.7.1', default_options: ['c_std=c99', 'warning_level=2', 'buildtype=release', 'b_ndebug=if-release'], meson_version: '>= 0.47.0') -dav1d_soname_version = '4.0.0' +dav1d_soname_version = '4.0.2' dav1d_api_version_array = dav1d_soname_version.split('.') dav1d_api_version_major = dav1d_api_version_array[0] dav1d_api_version_minor = dav1d_api_version_array[1] @@ -196,10 +196,10 @@ else getopt_dependency = [] endif -if cc.has_function('posix_memalign', prefix : '#include ', args : test_args) - cdata.set('HAVE_POSIX_MEMALIGN', 1) -elif cc.has_function('_aligned_malloc', prefix : '#include ', args : test_args) +if cc.has_function('_aligned_malloc', prefix : '#include ', args : test_args) cdata.set('HAVE_ALIGNED_MALLOC', 1) +elif cc.has_function('posix_memalign', prefix : '#include ', args : test_args) + cdata.set('HAVE_POSIX_MEMALIGN', 1) elif cc.has_function('memalign', prefix : '#include ', args : test_args) cdata.set('HAVE_MEMALIGN', 1) endif @@ -362,20 +362,11 @@ if cc.symbols_have_underscore_prefix() cdata_asm.set10('PREFIX', true) endif -# Generate config.h -config_h_target = configure_file(output: 'config.h', configuration: cdata) - - - # # ASM specific stuff # if is_asm_enabled and host_machine.cpu_family().startswith('x86') - # Generate config.asm - config_asm_target = configure_file(output: 'config.asm', output_format: 'nasm', configuration: cdata_asm) - - # NASM compiler support nasm = find_program('nasm') @@ -390,14 +381,22 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86') out = nasm_r.stdout().strip().split() if out[1].to_lower() == 'version' - if out[2].version_compare('<2.14') - error('nasm 2.14 or later is required, found nasm @0@'.format(out[2])) + if out[2].version_compare('<2.13.02') + error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2])) + elif out[2].version_compare('<2.14') and get_option('enable_avx512') + error('nasm 2.14 or later is required for AVX-512 
asm.\n' + + 'AVX-512 asm can be disabled with \'-Denable_avx512=false\'') endif + cdata.set10('HAVE_AVX512ICL', get_option('enable_avx512')) + cdata_asm.set10('HAVE_AVX512ICL', get_option('enable_avx512')) else error('unexpected nasm version string: @0@'.format(nasm_r.stdout())) endif endif + # Generate config.asm + config_asm_target = configure_file(output: 'config.asm', output_format: 'nasm', configuration: cdata_asm) + if host_machine.system() == 'windows' nasm_format = 'win' elif host_machine.system() == 'darwin' @@ -416,7 +415,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86') depfile: '@BASENAME@.obj.ndep', arguments: [ '-f', nasm_format, - '-I', '@0@/src/'.format(meson.current_source_dir()), + '-I', '@0@/src/'.format(dav1d_src_root), '-I', '@0@/'.format(meson.current_build_dir()), '-MQ', '@OUTPUT@', '-MF', '@DEPFILE@', '@EXTRA_ARGS@', @@ -426,6 +425,10 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86') endif +# Generate config.h +config_h_target = configure_file(output: 'config.h', configuration: cdata) + + # # Include subdir meson.build files diff --git a/ffmpeg/JNI/dav1d/meson_options.txt b/ffmpeg/JNI/dav1d/meson_options.txt index cdd27c2df..37bd08433 100644 --- a/ffmpeg/JNI/dav1d/meson_options.txt +++ b/ffmpeg/JNI/dav1d/meson_options.txt @@ -10,6 +10,11 @@ option('enable_asm', value: true, description: 'Build asm files, if available') +option('enable_avx512', + type: 'boolean', + value: true, + description: 'Build AVX-512 asm files, requires nasm 2.14') + option('enable_tools', type: 'boolean', value: true, diff --git a/ffmpeg/JNI/dav1d/package/crossfiles/aarch64-android.meson b/ffmpeg/JNI/dav1d/package/crossfiles/aarch64-android.meson new file mode 100644 index 000000000..a25ea4325 --- /dev/null +++ b/ffmpeg/JNI/dav1d/package/crossfiles/aarch64-android.meson @@ -0,0 +1,16 @@ +[binaries] +c = 'aarch64-linux-android21-clang' +cpp = 'aarch64-linux-android21-clang++' +ar = 'aarch64-linux-android-ar' +strip = 
'aarch64-linux-android-strip' +pkgconfig = 'pkg-config' +windres = 'aarch64-linux-android-windres' + +[properties] +needs_exe_wrapper = true + +[host_machine] +system = 'android' +cpu_family = 'aarch64' +endian = 'little' +cpu = 'aarch64' diff --git a/ffmpeg/JNI/dav1d/package/crossfiles/arm-android.meson b/ffmpeg/JNI/dav1d/package/crossfiles/arm-android.meson new file mode 100644 index 000000000..dd07d98ea --- /dev/null +++ b/ffmpeg/JNI/dav1d/package/crossfiles/arm-android.meson @@ -0,0 +1,16 @@ +[binaries] +c = 'armv7a-linux-androideabi16-clang' +cpp = 'armv7a-linux-androideabi16-clang++' +ar = 'arm-linux-androideabi-ar' +strip = 'arm-linux-androideabi-strip' +pkgconfig = 'pkg-config' +windres = 'arm-linux-androideabi-windres' + +[properties] +needs_exe_wrapper = true + +[host_machine] +system = 'android' +cpu_family = 'arm' +endian = 'little' +cpu = 'arm' diff --git a/ffmpeg/JNI/dav1d/src/arm/32/ipred.S b/ffmpeg/JNI/dav1d/src/arm/32/ipred.S index f26e55f77..d850a0cef 100644 --- a/ffmpeg/JNI/dav1d/src/arm/32/ipred.S +++ b/ffmpeg/JNI/dav1d/src/arm/32/ipred.S @@ -29,11 +29,11 @@ #include "src/arm/asm.S" #include "util.S" -// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_dc_128_neon, export=1 +// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_128_8bpc_neon, export=1 push {r4, lr} ldr r4, [sp, #8] clz r3, r3 @@ -107,11 +107,11 @@ L(ipred_dc_128_tbl): pop {r4, pc} endfunc -// void ipred_v_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_v_neon, export=1 +// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t 
stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_8bpc_neon, export=1 push {r4, lr} ldr lr, [sp, #8] clz r3, r3 @@ -189,11 +189,11 @@ L(ipred_v_tbl): pop {r4, pc} endfunc -// void ipred_h_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_h_neon, export=1 +// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_8bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] clz r3, r3 @@ -297,11 +297,11 @@ L(ipred_h_tbl): pop {r4-r5, pc} endfunc -// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_dc_top_neon, export=1 +// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_8bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] clz r3, r3 @@ -418,11 +418,11 @@ L(ipred_dc_top_tbl): pop {r4-r5, pc} endfunc -// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_dc_left_neon, export=1 +// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_8bpc_neon, export=1 push {r4-r5, lr} ldr r4, [sp, #12] sub r2, r2, r4 @@ -556,11 +556,11 @@ 
L(ipred_dc_left_w64): pop {r4-r5, pc} endfunc -// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_dc_neon, export=1 +// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_8bpc_neon, export=1 push {r4-r6, lr} ldr r4, [sp, #16] sub r2, r2, r4 @@ -765,10 +765,6 @@ L(ipred_dc_h64): vpadd.u16 d0, d0 bx r3 L(ipred_dc_w64): - vmov.8 q1, q0 - vmov.8 q2, q0 - vmov.8 q3, q0 -2: add r2, r2, #1 vld1.8 {d2, d3, d4, d5}, [r2]! vadd.s16 d0, d0, d30 diff --git a/ffmpeg/JNI/dav1d/src/arm/32/itx.S b/ffmpeg/JNI/dav1d/src/arm/32/itx.S new file mode 100644 index 000000000..867eb194d --- /dev/null +++ b/ffmpeg/JNI/dav1d/src/arm/32/itx.S @@ -0,0 +1,3386 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob); + +// Most of the functions use the following register layout: +// r0-r3 external parameters +// r4 function pointer to first transform +// r5 function pointer to second transform +// r6 output parameter for helper function +// r7 input parameter for helper function +// r8 input stride for helper function +// r9 scratch variable for helper functions +// r10-r11 pointer to list of eob thresholds, eob threshold value, +// scratch variables within helper functions (backed up) + +// The SIMD registers most often use the following layout: +// d0-d3 multiplication coefficients +// d4-d7 scratch registers +// d8-d15 unused in some transforms, used for scratch registers in others +// d16-d31 inputs/outputs of transforms + +// Potential further optimizations, that are left unimplemented for now: +// - Trying to keep multiplication coefficients in registers across multiple +// transform functions. (The register layout is designed to potentially +// allow this.) +// - Use a simplified version of the transforms themselves for cases where +// we know a significant number of inputs are zero. E.g. 
if the eob value +// indicates only a quarter of input values are set, for idct16 and up, +// a significant amount of calculation can be skipped, at the cost of more +// code duplication and special casing. + +const idct_coeffs, align=4 + // idct4 + .short 2896, 2896*8, 1567, 3784 + // idct8 + .short 799, 4017, 3406, 2276 + // idct16 + .short 401, 4076, 3166, 2598 + .short 1931, 3612, 3920, 1189 + // idct32 + .short 201, 4091, 3035, 2751 + .short 1751, 3703, 3857, 1380 + .short 995, 3973, 3513, 2106 + .short 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .short 101*8, 4095*8, 2967*8, -2824*8 + .short 1660*8, 3745*8, 3822*8, -1474*8 + .short 4076, 401, 4017, 799 + + .short 4036*8, -700*8, 2359*8, 3349*8 + .short 3461*8, -2191*8, 897*8, 3996*8 + .short -3166, -2598, -799, -4017 + + .short 501*8, 4065*8, 3229*8, -2520*8 + .short 2019*8, 3564*8, 3948*8, -1092*8 + .short 3612, 1931, 2276, 3406 + + .short 4085*8, -301*8, 2675*8, 3102*8 + .short 3659*8, -1842*8, 1285*8, 3889*8 + .short -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + // .h[4-5] can be interpreted as .s[2] + .short 1321, 3803, 2482, 3344, 3344, 0 +endconst + +const iadst8_coeffs, align=4 + .short 4076, 401, 3612, 1931 + .short 2598, 3166, 1189, 3920 + // idct_coeffs + .short 2896, 0, 1567, 3784, 0, 0, 0, 0 +endconst + +const iadst16_coeffs, align=4 + .short 4091, 201, 3973, 995 + .short 3703, 1751, 3290, 2440 + .short 2751, 3035, 2106, 3513 + .short 1380, 3857, 601, 4052 +endconst + +.macro vmull_vmlal d0, s0, s1, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlal.s16 \d0, \s1, \c1 +.endm + +.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlal.s16 \d0, \s2, \c1 + vmull.s16 \d1, \s1, \c0 + vmlal.s16 \d1, \s3, \c1 +.endm + +.macro vmull_vmlsl d0, s0, s1, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlsl.s16 \d0, \s1, \c1 +.endm + +.macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1 + vmull.s16 \d0, \s0, \c0 + vmlsl.s16 \d0, \s2, \c1 + vmull.s16 \d1, 
\s1, \c0 + vmlsl.s16 \d1, \s3, \c1 +.endm + +.macro vrshrn_8h d0, d1, s0, s1, shift + vrshrn.i32 \d0, \s0, \shift + vrshrn.i32 \d1, \s1, \shift +.endm + +.macro scale_input c, r0, r1, r2 r3, r4, r5, r6, r7 + vqrdmulh.s16 \r0, \r0, \c + vqrdmulh.s16 \r1, \r1, \c +.ifnb \r2 + vqrdmulh.s16 \r2, \r2, \c + vqrdmulh.s16 \r3, \r3, \c +.endif +.ifnb \r4 + vqrdmulh.s16 \r4, \r4, \c + vqrdmulh.s16 \r5, \r5, \c + vqrdmulh.s16 \r6, \r6, \c + vqrdmulh.s16 \r7, \r7, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4 +.ifnb \load + vld1.8 {\load}, [\src, :64], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + vaddw.u8 \adddst, \adddst, \addsrc +.endif +.ifnb \narrowsrc + vqmovun.s16 \narrowdst, \narrowsrc +.endif +.ifnb \store + vst1.8 {\store}, [\dst, :64], r1 +.endif +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + load_add_store d2, q8, , , , , , \dst, \src, \shiftbits + load_add_store d3, q9, , , , , , \dst, \src, \shiftbits + load_add_store d4, q10, d2, q8, , , , \dst, \src, \shiftbits + load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src, \shiftbits + load_add_store d6, q12, d4, q10, q9, d3, d2, \dst, \src, \shiftbits + load_add_store d7, q13, d5, q11, q10, d4, d3, \dst, \src, \shiftbits + load_add_store d2, q14, d6, q12, q11, d5, d4, \dst, \src, \shiftbits + load_add_store d3, q15, d7, q13, q12, d6, d5, \dst, \src, \shiftbits + load_add_store , , d2, q14, q13, d7, d6, \dst, \src, \shiftbits + load_add_store , , d3, q15, q14, d2, d7, \dst, \src, \shiftbits + load_add_store , , , , q15, d3, d2, \dst, \src, \shiftbits + load_add_store , , , , , , d3, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src + mov \src, \dst + load_add_store d2, q8, , , , , , \dst, \src + load_add_store d3, q9, , , , , , \dst, \src + load_add_store d4, q10, d2, q8, , , , \dst, \src + load_add_store d5, q11, d3, q9, q8, d2, , \dst, \src + 
load_add_store , , d4, q10, q9, d3, d2, \dst, \src + load_add_store , , d5, q11, q10, d4, d3, \dst, \src + load_add_store , , , , q11, d5, d4, \dst, \src + load_add_store , , , , , , d5, \dst, \src +.endm +.macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src +.ifnb \load + vld1.32 {\load[0]}, [\src, :32], r1 +.endif +.ifnb \shift + vrshr.s16 \shift, \shift, #4 +.endif +.ifnb \load + vld1.32 {\load[1]}, [\src, :32], r1 +.endif +.ifnb \addsrc + vaddw.u8 \adddst, \adddst, \addsrc +.endif +.ifnb \store + vst1.32 {\store[0]}, [\dst, :32], r1 +.endif +.ifnb \narrowsrc + vqmovun.s16 \narrowdst, \narrowsrc +.endif +.ifnb \store + vst1.32 {\store[1]}, [\dst, :32], r1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + load_add_store4 d0, , , , , , , \dst, \src + load_add_store4 d1, q8, , , , , , \dst, \src + load_add_store4 d2, q9, d0, q8, , , , \dst, \src + load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src + load_add_store4 d4, q11, d2, q10, q9, d1, d0, \dst, \src + load_add_store4 d5, q12, d3, q11, q10, d2, d1, \dst, \src + load_add_store4 d6, q13, d4, q12, q11, d3, d2, \dst, \src + load_add_store4 d7, q14, d5, q13, q12, d4, d3, \dst, \src + load_add_store4 , q15, d6, q14, q13, d5, d4, \dst, \src + load_add_store4 , , d7, q15, q14, d6, d5, \dst, \src + load_add_store4 , , , , q15, d7, d6, \dst, \src + load_add_store4 , , , , , , d7, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + load_add_store4 d0, , , , , , , \dst, \src + load_add_store4 d1, q8, , , , , , \dst, \src + load_add_store4 d2, q9, d0, q8, , , , \dst, \src + load_add_store4 d3, q10, d1, q9, q8, d0, , \dst, \src + load_add_store4 , q11, d2, q10, q9, d1, d0, \dst, \src + load_add_store4 , , d3, q11, q10, d2, d1, \dst, \src + load_add_store4 , , , , q11, d3, d2, \dst, \src + load_add_store4 , , , , , , d3, \dst, \src +.endm + +.macro idct_dc w, h, shift + cmp r3, #0 + bne 1f + vmov.i16 d30, #0 + movw r12, #2896*8 + vld1.16 
{d16[]}, [r2, :16] + vdup.16 d0, r12 + vqrdmulh.s16 d16, d16, d0[0] + vst1.16 {d30[0]}, [r2, :16] +.if (\w == 2*\h) || (2*\w == \h) + vqrdmulh.s16 d16, d16, d0[0] +.endif +.if \shift > 0 + vrshr.s16 d16, d16, #\shift +.endif + vqrdmulh.s16 d20, d16, d0[0] + mov r3, #\h + vrshr.s16 d16, d20, #4 + vrshr.s16 d17, d20, #4 + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon +1: + vld1.32 {d0[0]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[0]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + subs r3, r3, #4 + sub r0, r0, r1, lsl #2 + vaddw.u8 q10, q8, d0 + vqmovun.s16 d0, q10 + vaddw.u8 q11, q8, d1 + vst1.32 {d0[0]}, [r0, :32], r1 + vqmovun.s16 d1, q11 + vst1.32 {d0[1]}, [r0, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w8_neon +1: + vld1.8 {d0}, [r0, :64], r1 + vld1.8 {d1}, [r0, :64], r1 + vld1.8 {d2}, [r0, :64], r1 + vaddw.u8 q10, q8, d0 + vld1.8 {d3}, [r0, :64], r1 + sub r0, r0, r1, lsl #2 + subs r3, r3, #4 + vaddw.u8 q11, q8, d1 + vqmovun.s16 d0, q10 + vaddw.u8 q12, q8, d2 + vqmovun.s16 d1, q11 + vaddw.u8 q13, q8, d3 + vst1.8 {d0}, [r0, :64], r1 + vqmovun.s16 d2, q12 + vst1.8 {d1}, [r0, :64], r1 + vqmovun.s16 d3, q13 + vst1.8 {d2}, [r0, :64], r1 + vst1.8 {d3}, [r0, :64], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w16_neon +1: + vld1.8 {q0}, [r0, :128], r1 + vld1.8 {q1}, [r0, :128], r1 + vld1.8 {q2}, [r0, :128], r1 + subs r3, r3, #4 + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vld1.8 {q3}, [r0, :128], r1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, r1, lsl #2 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0}, [r0, :128], r1 + vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q1}, [r0, :128], r1 + vst1.8 {q2}, [r0, :128], r1 + vst1.8 {q3}, [r0, :128], r1 
+ bgt 1b + bx lr +endfunc + +function idct_dc_w32_neon +1: + vld1.8 {q0, q1}, [r0, :128], r1 + subs r3, r3, #2 + vld1.8 {q2, q3}, [r0, :128], r1 + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, r1, lsl #1 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0, q1}, [r0, :128], r1 + vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +function idct_dc_w64_neon + sub r1, r1, #32 +1: + vld1.8 {q0, q1}, [r0, :128]! + subs r3, r3, #1 + vld1.8 {q2, q3}, [r0, :128] + vaddw.u8 q10, q8, d0 + vaddw.u8 q11, q8, d1 + vaddw.u8 q12, q8, d2 + vaddw.u8 q13, q8, d3 + sub r0, r0, #32 + vaddw.u8 q14, q8, d4 + vaddw.u8 q15, q8, d5 + vqmovun.s16 d0, q10 + vqmovun.s16 d1, q11 + vaddw.u8 q10, q8, d6 + vaddw.u8 q11, q8, d7 + vqmovun.s16 d2, q12 + vqmovun.s16 d3, q13 + vqmovun.s16 d4, q14 + vqmovun.s16 d5, q15 + vst1.8 {q0, q1}, [r0, :128]! 
+ vqmovun.s16 d6, q10 + vqmovun.s16 d7, q11 + vst1.8 {q2, q3}, [r0, :128], r1 + bgt 1b + bx lr +endfunc + +.macro iwht4 + vadd.i16 d16, d16, d17 + vsub.i16 d21, d18, d19 + vsub.i16 d20, d16, d21 + vshr.s16 d20, d20, #1 + vsub.i16 d18, d20, d17 + vsub.i16 d17, d20, d19 + vadd.i16 d19, d21, d18 + vsub.i16 d16, d16, d17 +.endm + +.macro idct_4h_x4 r0, r1, r2, r3 + vmull_vmlal q3, \r1, \r3, d0[3], d0[2] + vmull_vmlsl q2, \r1, \r3, d0[2], d0[3] + vmull_vmlal q1, \r0, \r2, d0[0], d0[0] + vrshrn.i32 d6, q3, #12 + vrshrn.i32 d7, q2, #12 + vmull_vmlsl q2, \r0, \r2, d0[0], d0[0] + vrshrn.i32 d2, q1, #12 + vrshrn.i32 d3, q2, #12 + vqadd.s16 \r0, d2, d6 + vqsub.s16 \r3, d2, d6 + vqadd.s16 \r1, d3, d7 + vqsub.s16 \r2, d3, d7 +.endm + +.macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7 + vmull_vmlal_8h q6, q7, \r2, \r3, \r6, \r7, d0[3], d0[2] + vmull_vmlsl_8h q4, q5, \r2, \r3, \r6, \r7, d0[2], d0[3] + vmull_vmlal_8h q2, q3, \r0, \r1, \r4, \r5, d0[0], d0[0] + vrshrn_8h d12, d13, q6, q7, #12 + vrshrn_8h d14, d15, q4, q5, #12 + vmull_vmlsl_8h q4, q5, \r0, \r1, \r4, \r5, d0[0], d0[0] + vrshrn_8h d4, d5, q2, q3, #12 + vrshrn_8h d6, d7, q4, q5, #12 + vqadd.s16 \q0, q2, q6 + vqsub.s16 \q3, q2, q6 + vqadd.s16 \q1, q3, q7 + vqsub.s16 \q2, q3, q7 +.endm + +function inv_dct_4h_x4_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] + idct_4h_x4 d16, d17, d18, d19 + bx lr +endfunc + +function inv_dct_8h_x4_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] + idct_8h_x4 q8, q9, q10, q11, d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel_local r12, iadst4_coeffs + vld1.16 {d0, d1}, [r12, :128] + + vsubl.s16 q1, d16, d18 + vmull.s16 q2, d16, d0[0] + vmlal.s16 q2, d18, d0[1] + vmlal.s16 q2, d19, d0[2] + vmull.s16 q10, d17, d0[3] + vaddw.s16 q1, q1, d19 + vmull.s16 q3, d16, d0[2] + vmlsl.s16 q3, d18, d0[0] + vmlsl.s16 q3, d19, d0[1] + + vadd.s32 q11, q2, q3 + vmul.s32 q1, q1, d1[0] + 
vadd.s32 q2, q2, q10 + vadd.s32 q3, q3, q10 + vsub.s32 q11, q11, q10 + + vrshrn.i32 \o0, q2, #12 + vrshrn.i32 \o2, q1, #12 + vrshrn.i32 \o1, q3, #12 + vrshrn.i32 \o3, q11, #12 +.endm + +function inv_adst_4h_x4_neon, export=1 + iadst_4x4 d16, d17, d18, d19 + bx lr +endfunc + +function inv_flipadst_4h_x4_neon, export=1 + iadst_4x4 d19, d18, d17, d16 + bx lr +endfunc + +.macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7 + movrel_local r12, iadst4_coeffs + vld1.16 {d0, d1}, [r12, :128] + + vsubl.s16 q2, d16, d20 + vsubl.s16 q3, d17, d21 + vmull.s16 q4, d16, d0[0] + vmlal.s16 q4, d20, d0[1] + vmlal.s16 q4, d22, d0[2] + vmull.s16 q5, d17, d0[0] + vmlal.s16 q5, d21, d0[1] + vmlal.s16 q5, d23, d0[2] + vaddw.s16 q2, q2, d22 + vaddw.s16 q3, q3, d23 + vmull.s16 q6, d16, d0[2] + vmlsl.s16 q6, d20, d0[0] + vmlsl.s16 q6, d22, d0[1] + vmull.s16 q7, d17, d0[2] + vmlsl.s16 q7, d21, d0[0] + vmlsl.s16 q7, d23, d0[1] + + vmul.s32 q10, q2, d1[0] + vmul.s32 q11, q3, d1[0] + + vmull.s16 q2, d18, d0[3] + vmull.s16 q3, d19, d0[3] + + vadd.s32 q8, q4, q2 // out0 + vadd.s32 q9, q5, q3 + + vadd.s32 q4, q4, q6 // out3 + vadd.s32 q5, q5, q7 + + vadd.s32 q6, q6, q2 // out1 + vadd.s32 q7, q7, q3 + + vsub.s32 q4, q4, q2 // out3 + vsub.s32 q5, q5, q3 + + vrshrn.i32 d20, q10, #12 + vrshrn.i32 d21, q11, #12 + + vrshrn.i32 \o0, q8, #12 + vrshrn.i32 \o1, q9, #12 + +.ifc \o4, d18 + vmov q9, q10 +.endif + + vrshrn.i32 \o2, q6, #12 + vrshrn.i32 \o3, q7, #12 + + vrshrn.i32 \o6, q4, #12 + vrshrn.i32 \o7, q5, #12 +.endm + +function inv_adst_8h_x4_neon, export=1 + iadst_8x4 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +function inv_flipadst_8h_x4_neon, export=1 + iadst_8x4 d22, d23, d20, d21, d18, d19, d16, d17 + bx lr +endfunc + +function inv_identity_4h_x4_neon, export=1 + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + vqrdmulh.s16 q2, q8, d0[0] + vqrdmulh.s16 q3, q9, d0[0] + vqadd.s16 q8, q8, q2 + vqadd.s16 q9, q9, q3 + bx lr +endfunc + +function inv_identity_8h_x4_neon, export=1 + movw r12, 
#(5793-4096)*8 + vdup.16 d0, r12 + vqrdmulh.s16 q1, q8, d0[0] + vqrdmulh.s16 q2, q9, d0[0] + vqrdmulh.s16 q3, q10, d0[0] + vqadd.s16 q8, q8, q1 + vqrdmulh.s16 q1, q11, d0[0] + vqadd.s16 q9, q9, q2 + vqadd.s16 q10, q10, q3 + vqadd.s16 q11, q11, q1 + bx lr +endfunc + +.macro identity_8x4_shift1 r0, r1, r2, r3, c +.irp i, \r0, \r1, \r2, \r3 + vqrdmulh.s16 q1, \i, \c + vrhadd.s16 \i, \i, q1 +.endr +.endm + +function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 + push {r4-r5,lr} + vmov.i16 q15, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q15}, [r2, :128]! + + vshr.s16 q8, q8, #2 + vshr.s16 q9, q9, #2 + + iwht4 + + vst1.16 {q15}, [r2, :128]! + transpose_4x4h q8, q9, d16, d17, d18, d19 + + iwht4 + + vld1.32 {d0[]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + vmov.i16 q15, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q15}, [r2, :128]! + + blx r4 + + vst1.16 {q15}, [r2, :128]! 
+ transpose_4x4h q8, q9, d16, d17, d18, d19 + + blx r5 + + vld1.32 {d0[]}, [r0, :32], r1 + vld1.32 {d0[1]}, [r0, :32], r1 + vld1.32 {d1[]}, [r0, :32], r1 + vld1.32 {d1[1]}, [r0, :32], r1 + vrshr.s16 q8, q8, #4 + vrshr.s16 q9, q9, #4 + +L(itx_4x4_end): + sub r0, r0, r1, lsl #2 + vaddw.u8 q8, q8, d0 + vqmovun.s16 d0, q8 + vaddw.u8 q9, q9, d1 + vst1.32 {d0[0]}, [r0, :32], r1 + vqmovun.s16 d1, q9 + vst1.32 {d0[1]}, [r0, :32], r1 + vst1.32 {d1[0]}, [r0, :32], r1 + vst1.32 {d1[1]}, [r0, :32], r1 + + pop {r4-r5,pc} +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 + push {r4-r5,lr} + +.ifc \txfm1\()_\txfm2, dct_dct + cmp r3, #0 + bne 1f + vmov.i16 d30, #0 + movw r12, #2896*8 + vld1.16 {d16[]}, [r2, :16] + vdup.16 d4, r12 + vst1.16 {d30[0]}, [r2, :16] + vqrdmulh.s16 d16, d16, d4[0] + vld1.32 {d0[0]}, [r0, :32], r1 + vqrdmulh.s16 d20, d16, d4[0] + vld1.32 {d0[1]}, [r0, :32], r1 + vrshr.s16 d16, d20, #4 + vrshr.s16 d17, d20, #4 + vld1.32 {d1[0]}, [r0, :32], r1 + vmov q9, q8 + vld1.32 {d1[1]}, [r0, :32], r1 + b L(itx_4x4_end) +1: +.endif + movrel_local r4, inv_\txfm1\()_4h_x4_neon + movrel_local r5, inv_\txfm2\()_4h_x4_neon + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + +.macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + idct_8h_x4 \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13 + + vmull_vmlsl_8h q2, q3, \r2, \r3, \r14, \r15, d1[0], d1[1] // -> t4a + vmull_vmlal_8h q4, q5, \r2, \r3, \r14, \r15, d1[1], d1[0] // -> t7a 
+ vmull_vmlsl_8h q6, q7, \r10, \r11, \r6, \r7, d1[2], d1[3] // -> t5a + vrshrn_8h \r2, \r3, q2, q3, #12 // t4a + vrshrn_8h \r14, \r15, q4, q5, #12 // t7a + vmull_vmlal_8h q2, q3, \r10, \r11, \r6, \r7, d1[3], d1[2] // -> t6a + vrshrn_8h \r6, \r7, q6, q7, #12 // t5a + vrshrn_8h \r10, \r11, q2, q3, #12 // t6a + + vqadd.s16 q2, \q1, \q3 // t4 + vqsub.s16 \q1, \q1, \q3 // t5a + vqadd.s16 q3, \q7, \q5 // t7 + vqsub.s16 \q3, \q7, \q5 // t6a + + vmull_vmlsl_8h q4, q5, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t5 + vmull_vmlal_8h q6, q7, \r6, \r7, \r2, \r3, d0[0], d0[0] // -> t6 + vrshrn_8h d8, d9, q4, q5, #12 // t5 + vrshrn_8h d10, d11, q6, q7, #12 // t6 + + vqsub.s16 \q7, \q0, q3 // out7 + vqadd.s16 \q0, \q0, q3 // out0 + vqadd.s16 \q1, \q2, q5 // out1 + vqsub.s16 q6, \q2, q5 // out6 + vqadd.s16 \q2, \q4, q4 // out2 + vqsub.s16 \q5, \q4, q4 // out5 + vqadd.s16 \q3, \q6, q2 // out3 + vqsub.s16 \q4, \q6, q2 // out4 + vmov \q6, q6 // out6 +.endm + +.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 + idct_4h_x4 \r0, \r2, \r4, \r6 + + vmull_vmlsl q1, \r1, \r7, d1[0], d1[1] // -> t4a + vmull_vmlal q2, \r1, \r7, d1[1], d1[0] // -> t7a + vmull_vmlsl q3, \r5, \r3, d1[2], d1[3] // -> t5a + vrshrn.i32 \r1, q1, #12 // t4a + vmull_vmlal q1, \r5, \r3, d1[3], d1[2] // -> t6a + vrshrn.i32 \r7, q2, #12 // t7a + vrshrn.i32 \r3, q3, #12 // t5a + vrshrn.i32 \r5, q1, #12 // t6a + + vqadd.s16 d2, \r1, \r3 // t4 + vqsub.s16 \r1, \r1, \r3 // t5a + vqadd.s16 d3, \r7, \r5 // t7 + vqsub.s16 \r3, \r7, \r5 // t6a + + vmull_vmlsl q2, \r3, \r1, d0[0], d0[0] // -> t5 + vmull_vmlal q3, \r3, \r1, d0[0], d0[0] // -> t6 + vrshrn.i32 d4, q2, #12 // t5 + vrshrn.i32 d5, q3, #12 // t6 + + vqsub.s16 \r7, \r0, d3 // out7 + vqadd.s16 \r0, \r0, d3 // out0 + vqadd.s16 \r1, \r2, d5 // out1 + vqsub.s16 d6, \r2, d5 // out6 + vqadd.s16 \r2, \r4, d4 // out2 + vqsub.s16 \r5, \r4, d4 // out5 + vqadd.s16 \r3, \r6, d2 // out3 + vqsub.s16 \r4, \r6, d2 // out4 + vmov \r6, d6 // out6 +.endm + +function inv_dct_8h_x8_neon, export=1 
+ movrel_local r12, idct_coeffs + vld1.16 {q0}, [r12, :128] + idct_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_dct_4h_x8_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {q0}, [r12, :128] + idct_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +.macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15 + movrel_local r12, iadst8_coeffs + vld1.16 {d0, d1, d2}, [r12, :64] + + vmull_vmlal_8h q2, q3, d30, d31, d16, d17, d0[0], d0[1] + vmull_vmlsl_8h q4, q5, d30, d31, d16, d17, d0[1], d0[0] + vmull_vmlal_8h q6, q7, d26, d27, d20, d21, d0[2], d0[3] + vrshrn_8h d16, d17, q2, q3, #12 // t0a + vrshrn_8h d30, d31, q4, q5, #12 // t1a + vmull_vmlsl_8h q2, q3, d26, d27, d20, d21, d0[3], d0[2] + vmull_vmlal_8h q4, q5, d22, d23, d24, d25, d1[0], d1[1] + vrshrn_8h d20, d21, q6, q7, #12 // t2a + vrshrn_8h d26, d27, q2, q3, #12 // t3a + vmull_vmlsl_8h q6, q7, d22, d23, d24, d25, d1[1], d1[0] + vmull_vmlal_8h q2, q3, d18, d19, d28, d29, d1[2], d1[3] + vrshrn_8h d24, d25, q4, q5, #12 // t4a + vrshrn_8h d22, d23, q6, q7, #12 // t5a + vmull_vmlsl_8h q4, q5, d18, d19, d28, d29, d1[3], d1[2] + vrshrn_8h d28, d29, q2, q3, #12 // t6a + vrshrn_8h d18, d19, q4, q5, #12 // t7a + + vqadd.s16 q2, q8, q12 // t0 + vqsub.s16 q3, q8, q12 // t4 + vqadd.s16 q4, q15, q11 // t1 + vqsub.s16 q5, q15, q11 // t5 + vqadd.s16 q6, q10, q14 // t2 + vqsub.s16 q7, q10, q14 // t6 + vqadd.s16 q10, q13, q9 // t3 + vqsub.s16 q11, q13, q9 // t7 + + vmull_vmlal_8h q8, q9, d6, d7, d10, d11, d2[3], d2[2] + vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[2], d2[3] + vmull_vmlsl_8h q14, q15, d22, d23, d14, d15, d2[3], d2[2] + + vrshrn_8h d6, d7, q8, q9, #12 // t4a + vrshrn_8h d10, d11, q12, q13, #12 // t5a + + vmull_vmlal_8h q8, q9, d22, d23, d14, d15, d2[2], d2[3] + + vrshrn_8h d14, d15, q14, q15, #12 // t6a + vrshrn_8h d22, d23, 
q8, q9, #12 // t7a + + vqadd.s16 \q0, q2, q6 // out0 + vqsub.s16 q2, q2, q6 // t2 + vqadd.s16 \q7, q4, q10 // out7 + vqsub.s16 q4, q4, q10 // t3 + vqneg.s16 \q7, \q7 // out7 + + vqadd.s16 \q1, q3, q7 // out1 + vqsub.s16 q3, q3, q7 // t6 + vqadd.s16 \q6, q5, q11 // out6 + vqsub.s16 q5, q5, q11 // t7 + vqneg.s16 \q1, \q1 // out1 + + vmull_vmlal_8h q10, q11, d4, d5, d8, d9, d2[0], d2[0] // -> out3 (q11 or q12) + vmull_vmlsl_8h q6, q7, d4, d5, d8, d9, d2[0], d2[0] // -> out4 (q12 or q11) + vmull_vmlsl_8h q12, q13, d6, d7, d10, d11, d2[0], d2[0] // -> out5 (q13 or q10) + vrshrn_8h d4, d5, q10, q11, #12 // out3 + vmull_vmlal_8h q10, q11, d6, d7, d10, d11, d2[0], d2[0] // -> out2 (q10 or q13) + vrshrn_8h d6, d7, q12, q13, #12 // out5 + vrshrn_8h \r4, \r5, q10, q11, #12 // out2 (q10 or q13) + vrshrn_8h \r8, \r9, q6, q7, #12 // out4 (q12 or q11) + + vqneg.s16 \q3, q2 // out3 + vqneg.s16 \q5, q3 // out5 +.endm + +.macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 + movrel_local r12, iadst8_coeffs + vld1.16 {d0, d1, d2}, [r12, :64] + + vmull_vmlal q2, d23, d16, d0[0], d0[1] + vmull_vmlsl q3, d23, d16, d0[1], d0[0] + vmull_vmlal q4, d21, d18, d0[2], d0[3] + vrshrn.i32 d16, q2, #12 // t0a + vrshrn.i32 d23, q3, #12 // t1a + vmull_vmlsl q5, d21, d18, d0[3], d0[2] + vmull_vmlal q6, d19, d20, d1[0], d1[1] + vrshrn.i32 d18, q4, #12 // t2a + vrshrn.i32 d21, q5, #12 // t3a + vmull_vmlsl q7, d19, d20, d1[1], d1[0] + vmull_vmlal q2, d17, d22, d1[2], d1[3] + vrshrn.i32 d20, q6, #12 // t4a + vrshrn.i32 d19, q7, #12 // t5a + vmull_vmlsl q3, d17, d22, d1[3], d1[2] + vrshrn.i32 d22, q2, #12 // t6a + vrshrn.i32 d17, q3, #12 // t7a + + vqadd.s16 d4, d16, d20 // t0 + vqsub.s16 d5, d16, d20 // t4 + vqadd.s16 d6, d23, d19 // t1 + vqsub.s16 d7, d23, d19 // t5 + vqadd.s16 d8, d18, d22 // t2 + vqsub.s16 d9, d18, d22 // t6 + vqadd.s16 d18, d21, d17 // t3 + vqsub.s16 d19, d21, d17 // t7 + + vmull_vmlal q8, d5, d7, d2[3], d2[2] + vmull_vmlsl q10, d5, d7, d2[2], d2[3] + vmull_vmlsl q11, d19, d9, d2[3], 
d2[2] + + vrshrn.i32 d5, q8, #12 // t4a + vrshrn.i32 d7, q10, #12 // t5a + + vmull_vmlal q8, d19, d9, d2[2], d2[3] + + vrshrn.i32 d9, q11, #12 // t6a + vrshrn.i32 d19, q8, #12 // t7a + + vqadd.s16 \r0, d4, d8 // out0 + vqsub.s16 d4, d4, d8 // t2 + vqadd.s16 \r7, d6, d18 // out7 + vqsub.s16 d6, d6, d18 // t3 + vqneg.s16 \r7, \r7 // out7 + + vqadd.s16 \r1, d5, d9 // out1 + vqsub.s16 d5, d5, d9 // t6 + vqadd.s16 \r6, d7, d19 // out6 + vqsub.s16 d7, d7, d19 // t7 + vqneg.s16 \r1, \r1 // out1 + + vmull_vmlal q9, d4, d6, d2[0], d2[0] // -> out3 (d19 or d20) + vmull_vmlsl q4, d4, d6, d2[0], d2[0] // -> out4 (d20 or d19) + vmull_vmlsl q10, d5, d7, d2[0], d2[0] // -> out5 (d21 or d18) + vrshrn.i32 d4, q9, #12 // out3 + vmull_vmlal q9, d5, d7, d2[0], d2[0] // -> out2 (d18 or d21) + vrshrn.i32 d5, q10, #12 // out5 + vrshrn.i32 \r2, q9, #12 // out2 (d18 or d21) + vrshrn.i32 \r4, q4, #12 // out4 (d20 or d19) + + vqneg.s16 \r3, d4 // out3 + vqneg.s16 \r5, d5 // out5 +.endm + +function inv_adst_8h_x8_neon, export=1 + iadst_8h_x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_flipadst_8h_x8_neon, export=1 + iadst_8h_x8 q15, q14, q13, q12, q11, q10, q9, q8, d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17 + bx lr +endfunc + +function inv_adst_4h_x8_neon, export=1 + iadst_4h_x8 d16, d17, d18, d19, d20, d21, d22, d23 + bx lr +endfunc + +function inv_flipadst_4h_x8_neon, export=1 + iadst_4h_x8 d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_8h_x8_neon, export=1 + vqshl.s16 q8, q8, #1 + vqshl.s16 q9, q9, #1 + vqshl.s16 q10, q10, #1 + vqshl.s16 q11, q11, #1 + vqshl.s16 q12, q12, #1 + vqshl.s16 q13, q13, #1 + vqshl.s16 q14, q14, #1 + vqshl.s16 q15, q15, #1 + bx lr +endfunc + +function inv_identity_4h_x8_neon, export=1 + vqshl.s16 q8, q8, #1 + vqshl.s16 q9, q9, #1 + vqshl.s16 q10, q10, #1 + vqshl.s16 q11, q11, #1 + bx lr 
+endfunc + +.macro def_fn_8x8_base variant +function inv_txfm_\variant\()add_8x8_neon + vmov.i16 q0, #0 + vmov.i16 q1, #0 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q12, q13}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128]! + vld1.16 {q14, q15}, [r2, :128] + vst1.16 {q0, q1}, [r2, :128] + +.ifc \variant, identity_ + // The identity shl #1 and downshift srshr #1 cancel out +.else + blx r4 + + vrshr.s16 q8, q8, #1 + vrshr.s16 q9, q9, #1 + vrshr.s16 q10, q10, #1 + vrshr.s16 q11, q11, #1 + vrshr.s16 q12, q12, #1 + vrshr.s16 q13, q13, #1 + vrshr.s16 q14, q14, #1 + vrshr.s16 q15, q15, #1 +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + blx r5 + + load_add_store_8x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc +.endm + +def_fn_8x8_base +def_fn_8x8_base identity_ + +.macro def_fn_8x8 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + push {r4-r5,r7,lr} + vpush {q4-q7} + movrel_local r5, inv_\txfm2\()_8h_x8_neon +.ifc \txfm1, identity + b inv_txfm_identity_add_8x8_neon +.else + movrel_local r4, inv_\txfm1\()_8h_x8_neon + b inv_txfm_add_8x8_neon +.endif +endfunc +.endm + +def_fn_8x8 dct, dct +def_fn_8x8 identity, identity +def_fn_8x8 dct, adst +def_fn_8x8 dct, flipadst +def_fn_8x8 dct, identity +def_fn_8x8 adst, dct +def_fn_8x8 adst, adst +def_fn_8x8 adst, flipadst +def_fn_8x8 flipadst, dct +def_fn_8x8 flipadst, adst +def_fn_8x8 flipadst, flipadst +def_fn_8x8 identity, dct +def_fn_8x8 adst, identity +def_fn_8x8 flipadst, identity +def_fn_8x8 identity, adst +def_fn_8x8 identity, flipadst + +function inv_txfm_add_8x4_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! 
+ vld1.16 {d20, d21, d22, d23}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + scale_input d0[0], q8, q9, q10, q11 + + blx r4 + + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + vswp d17, d20 + vswp d19, d21 + vswp d18, d20 + vswp d21, d22 + + blx r5 + + load_add_store_8x4 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc + +function inv_txfm_add_4x8_neon + vmov.i16 q14, #0 + vmov.i16 q15, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + vld1.16 {q8, q9}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128]! + vld1.16 {q10, q11}, [r2, :128] + vst1.16 {q14, q15}, [r2, :128] + + scale_input d0[0], q8, q9, q10, q11 + + blx r4 + + transpose_4x8h q8, q9, q10, q11 + vswp d17, d20 + vswp d19, d21 + vswp d17, d18 + vswp d19, d22 + + blx r5 + + load_add_store_4x8 r0, r7 + vpop {q4-q7} + pop {r4-r5,r7,pc} +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + push {r4-r5,r7,lr} + vpush {q4-q7} + movrel_local r4, inv_\txfm1\()_\h\()h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_\w\()h_x\h\()_neon + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct +def_fn_48 \w, \h, identity, identity +def_fn_48 \w, \h, dct, adst +def_fn_48 \w, \h, dct, flipadst +def_fn_48 \w, \h, dct, identity +def_fn_48 \w, \h, adst, dct +def_fn_48 \w, \h, adst, adst +def_fn_48 \w, \h, adst, flipadst +def_fn_48 \w, \h, flipadst, dct +def_fn_48 \w, \h, flipadst, adst +def_fn_48 \w, \h, flipadst, flipadst +def_fn_48 \w, \h, identity, dct +def_fn_48 \w, \h, adst, identity +def_fn_48 \w, \h, flipadst, identity +def_fn_48 \w, \h, identity, adst +def_fn_48 \w, \h, identity, flipadst +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + +function inv_dct_4h_x16_neon, export=1 + movrel_local r12, idct_coeffs + vld1.16 {q0, q1}, [r12, :128] + + vmull_vmlsl q2, d17, d31, d2[0], d2[1] // -> t8a + vmull_vmlal q3, d17, 
d31, d2[1], d2[0] // -> t15a + vmull_vmlsl q4, d25, d23, d2[2], d2[3] // -> t9a + vrshrn.i32 d17, q2, #12 // t8a + vrshrn.i32 d31, q3, #12 // t15a + vmull_vmlal q2, d25, d23, d2[3], d2[2] // -> t14a + vmull_vmlsl q3, d21, d27, d3[0], d3[1] // -> t10a + vrshrn.i32 d23, q4, #12 // t9a + vrshrn.i32 d25, q2, #12 // t14a + vmull_vmlal q4, d21, d27, d3[1], d3[0] // -> t13a + vmull_vmlsl q2, d29, d19, d3[2], d3[3] // -> t11a + vrshrn.i32 d21, q3, #12 // t10a + vrshrn.i32 d27, q4, #12 // t13a + vmull_vmlal q3, d29, d19, d3[3], d3[2] // -> t12a + vrshrn.i32 d19, q2, #12 // t11a + vrshrn.i32 d29, q3, #12 // t12a + + idct_4h_x8 d16, d18, d20, d22, d24, d26, d28, d30 + + vqsub.s16 d4, d17, d23 // t9 + vqadd.s16 d17, d17, d23 // t8 + vqsub.s16 d5, d31, d25 // t14 + vqadd.s16 d31, d31, d25 // t15 + vqsub.s16 d23, d19, d21 // t10 + vqadd.s16 d19, d19, d21 // t11 + vqadd.s16 d25, d29, d27 // t12 + vqsub.s16 d29, d29, d27 // t13 + + vmull_vmlsl q3, d5, d4, d0[2], d0[3] // -> t9a + vmull_vmlal q4, d5, d4, d0[3], d0[2] // -> t14a + vrshrn.i32 d21, q3, #12 // t9a + vrshrn.i32 d27, q4, #12 // t14a + + vmull_vmlsl q3, d29, d23, d0[2], d0[3] // -> t13a + vmull_vmlal q4, d29, d23, d0[3], d0[2] // -> t10a + vrshrn.i32 d29, q3, #12 // t13a + vneg.s32 q4, q4 + vrshrn.i32 d23, q4, #12 // t10a + + vqsub.s16 d4, d17, d19 // t11a + vqadd.s16 d17, d17, d19 // t8a + vqsub.s16 d5, d31, d25 // t12a + vqadd.s16 d31, d31, d25 // t15a + vqadd.s16 d19, d21, d23 // t9 + vqsub.s16 d21, d21, d23 // t10 + vqsub.s16 d25, d27, d29 // t13 + vqadd.s16 d27, d27, d29 // t14 + + vmull_vmlsl q3, d5, d4, d0[0], d0[0] // -> t11 + vmull_vmlal q4, d5, d4, d0[0], d0[0] // -> t12 + vmull_vmlsl q2, d25, d21, d0[0], d0[0] // -> t10a + + vrshrn.i32 d6, q3, #12 // t11 + vrshrn.i32 d7, q4, #12 // t12 + vmull_vmlal q4, d25, d21, d0[0], d0[0] // -> t13a + vrshrn.i32 d4, q2, #12 // t10a + vrshrn.i32 d5, q4, #12 // t13a + + vqadd.s16 d8, d16, d31 // out0 + vqsub.s16 d31, d16, d31 // out15 + vmov d16, d8 + vqadd.s16 d23, d30, d17 
// out7 + vqsub.s16 d9, d30, d17 // out8 + vqadd.s16 d17, d18, d27 // out1 + vqsub.s16 d30, d18, d27 // out14 + vqadd.s16 d18, d20, d5 // out2 + vqsub.s16 d29, d20, d5 // out13 + vqadd.s16 d5, d28, d19 // out6 + vqsub.s16 d25, d28, d19 // out9 + vqadd.s16 d19, d22, d7 // out3 + vqsub.s16 d28, d22, d7 // out12 + vqadd.s16 d20, d24, d6 // out4 + vqsub.s16 d27, d24, d6 // out11 + vqadd.s16 d21, d26, d4 // out5 + vqsub.s16 d26, d26, d4 // out10 + vmov d24, d9 + vmov d22, d5 + + bx lr +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel_local r12, iadst16_coeffs + vld1.16 {q0, q1}, [r12, :128] + movrel_local r12, idct_coeffs + + vmull_vmlal q2, d31, d16, d0[0], d0[1] // -> t0 + vmull_vmlsl q3, d31, d16, d0[1], d0[0] // -> t1 + vmull_vmlal q4, d29, d18, d0[2], d0[3] // -> t2 + vrshrn.i32 d16, q2, #12 // t0 + vrshrn.i32 d31, q3, #12 // t1 + vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t3 + vmull_vmlal q3, d27, d20, d1[0], d1[1] // -> t4 + vrshrn.i32 d18, q4, #12 // t2 + vrshrn.i32 d29, q2, #12 // t3 + vmull_vmlsl q4, d27, d20, d1[1], d1[0] // -> t5 + vmull_vmlal q2, d25, d22, d1[2], d1[3] // -> t6 + vrshrn.i32 d20, q3, #12 // t4 + vrshrn.i32 d27, q4, #12 // t5 + vmull_vmlsl q3, d25, d22, d1[3], d1[2] // -> t7 + vmull_vmlal q4, d23, d24, d2[0], d2[1] // -> t8 + vrshrn.i32 d22, q2, #12 // t6 + vrshrn.i32 d25, q3, #12 // t7 + vmull_vmlsl q2, d23, d24, d2[1], d2[0] // -> t9 + vmull_vmlal q3, d21, d26, d2[2], d2[3] // -> t10 + vrshrn.i32 d23, q4, #12 // t8 + vrshrn.i32 d24, q2, #12 // t9 + vmull_vmlsl q4, d21, d26, d2[3], d2[2] // -> t11 + vmull_vmlal q2, d19, d28, d3[0], d3[1] // -> t12 + vrshrn.i32 d21, q3, #12 // t10 + vrshrn.i32 d26, q4, #12 // t11 + vmull_vmlsl q3, d19, d28, d3[1], d3[0] // -> t13 + vmull_vmlal q4, d17, d30, d3[2], d3[3] // -> t14 + vrshrn.i32 d19, q2, #12 // t12 + vrshrn.i32 d28, q3, #12 // t13 + vmull_vmlsl q2, d17, d30, d3[3], d3[2] // -> t15 + vrshrn.i32 d17, q4, #12 // t14 + vrshrn.i32 d30, q2, #12 
// t15 + + vld1.16 {q0}, [r12, :128] + + vqsub.s16 d2, d16, d23 // t8a + vqadd.s16 d16, d16, d23 // t0a + vqsub.s16 d3, d31, d24 // t9a + vqadd.s16 d31, d31, d24 // t1a + vqadd.s16 d23, d18, d21 // t2a + vqsub.s16 d18, d18, d21 // t10a + vqadd.s16 d24, d29, d26 // t3a + vqsub.s16 d29, d29, d26 // t11a + vqadd.s16 d21, d20, d19 // t4a + vqsub.s16 d20, d20, d19 // t12a + vqadd.s16 d26, d27, d28 // t5a + vqsub.s16 d27, d27, d28 // t13a + vqadd.s16 d19, d22, d17 // t6a + vqsub.s16 d22, d22, d17 // t14a + vqadd.s16 d28, d25, d30 // t7a + vqsub.s16 d25, d25, d30 // t15a + + vmull_vmlal q2, d2, d3, d1[1], d1[0] // -> t8 + vmull_vmlsl q3, d2, d3, d1[0], d1[1] // -> t9 + vmull_vmlal q4, d18, d29, d1[3], d1[2] // -> t10 + vrshrn.i32 d17, q2, #12 // t8 + vrshrn.i32 d30, q3, #12 // t9 + vmull_vmlsl q2, d18, d29, d1[2], d1[3] // -> t11 + vmull_vmlsl q3, d27, d20, d1[1], d1[0] // -> t12 + vrshrn.i32 d18, q4, #12 // t10 + vrshrn.i32 d29, q2, #12 // t11 + vmull_vmlal q4, d27, d20, d1[0], d1[1] // -> t13 + vmull_vmlsl q2, d25, d22, d1[3], d1[2] // -> t14 + vrshrn.i32 d27, q3, #12 // t12 + vrshrn.i32 d20, q4, #12 // t13 + vmull_vmlal q3, d25, d22, d1[2], d1[3] // -> t15 + vrshrn.i32 d25, q2, #12 // t14 + vrshrn.i32 d22, q3, #12 // t15 + + vqsub.s16 d2, d16, d21 // t4 + vqadd.s16 d16, d16, d21 // t0 + vqsub.s16 d3, d31, d26 // t5 + vqadd.s16 d31, d31, d26 // t1 + vqadd.s16 d21, d23, d19 // t2 + vqsub.s16 d23, d23, d19 // t6 + vqadd.s16 d26, d24, d28 // t3 + vqsub.s16 d24, d24, d28 // t7 + vqadd.s16 d19, d17, d27 // t8a + vqsub.s16 d17, d17, d27 // t12a + vqadd.s16 d28, d30, d20 // t9a + vqsub.s16 d30, d30, d20 // t13a + vqadd.s16 d27, d18, d25 // t10a + vqsub.s16 d18, d18, d25 // t14a + vqadd.s16 d20, d29, d22 // t11a + vqsub.s16 d29, d29, d22 // t15a + + vmull_vmlal q2, d2, d3, d0[3], d0[2] // -> t4a + vmull_vmlsl q3, d2, d3, d0[2], d0[3] // -> t5a + vmull_vmlsl q4, d24, d23, d0[3], d0[2] // -> t6a + vrshrn.i32 d22, q2, #12 // t4a + vrshrn.i32 d25, q3, #12 // t5a + vmull_vmlal q2, 
d24, d23, d0[2], d0[3] // -> t7a + vmull_vmlal q3, d17, d30, d0[3], d0[2] // -> t12 + vrshrn.i32 d24, q4, #12 // t6a + vrshrn.i32 d23, q2, #12 // t7a + vmull_vmlsl q4, d17, d30, d0[2], d0[3] // -> t13 + vmull_vmlsl q2, d29, d18, d0[3], d0[2] // -> t14 + vrshrn.i32 d17, q3, #12 // t12 + vmull_vmlal q3, d29, d18, d0[2], d0[3] // -> t15 + vrshrn.i32 d29, q4, #12 // t13 + vrshrn.i32 d30, q2, #12 // t14 + vrshrn.i32 d18, q3, #12 // t15 + + vqsub.s16 d2, d16, d21 // t2a +.ifc \o0, d16 + vqadd.s16 \o0, d16, d21 // out0 + vqsub.s16 d21, d31, d26 // t3a + vqadd.s16 \o15,d31, d26 // out15 +.else + vqadd.s16 d4, d16, d21 // out0 + vqsub.s16 d21, d31, d26 // t3a + vqadd.s16 \o15,d31, d26 // out15 + vmov \o0, d4 +.endif + vqneg.s16 \o15, \o15 // out15 + + vqsub.s16 d3, d29, d18 // t15a + vqadd.s16 \o13,d29, d18 // out13 + vqadd.s16 \o2, d17, d30 // out2 + vqsub.s16 d26, d17, d30 // t14a + vqneg.s16 \o13,\o13 // out13 + + vqadd.s16 \o1, d19, d27 // out1 + vqsub.s16 d27, d19, d27 // t10 + vqadd.s16 \o14,d28, d20 // out14 + vqsub.s16 d20, d28, d20 // t11 + vqneg.s16 \o1, \o1 // out1 + + vqadd.s16 \o3, d22, d24 // out3 + vqsub.s16 d22, d22, d24 // t6 + vqadd.s16 \o12,d25, d23 // out12 + vqsub.s16 d23, d25, d23 // t7 + vqneg.s16 \o3, \o3 // out3 + + vmull_vmlsl q12, d2, d21, d0[0], d0[0] // -> out8 (d24 or d23) + vmull_vmlal q2, d2, d21, d0[0], d0[0] // -> out7 (d23 or d24) + vmull_vmlal q3, d26, d3, d0[0], d0[0] // -> out5 (d21 or d26) + + vrshrn.i32 d24, q12, #12 // out8 + vrshrn.i32 d4, q2, #12 // out7 + vrshrn.i32 d5, q3, #12 // out5 + vmull_vmlsl q4, d26, d3, d0[0], d0[0] // -> out10 (d26 or d21) + vmull_vmlal q1, d22, d23, d0[0], d0[0] // -> out4 (d20 or d27) + vrshrn.i32 d26, q4, #12 // out10 + + vmull_vmlsl q4, d22, d23, d0[0], d0[0] // -> out11 (d27 or d20) + vmull_vmlal q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25) + vmull_vmlsl q3, d27, d20, d0[0], d0[0] // -> out9 (d25 or d22) + + vrshrn.i32 \o4, q1, #12 // out4 + vrshrn.i32 d7, q3, #12 // out9 + vrshrn.i32 d6, q4, 
#12 // out11 + vrshrn.i32 \o6, q11, #12 // out6 + +.ifc \o8, d23 + vmov \o8, d24 + vmov \o10,d26 +.endif + + vqneg.s16 \o7, d4 // out7 + vqneg.s16 \o5, d5 // out5 + vqneg.s16 \o11,d6 // out11 + vqneg.s16 \o9, d7 // out9 +.endm + +function inv_adst_4h_x16_neon, export=1 + iadst_16 d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + bx lr +endfunc + +function inv_flipadst_4h_x16_neon, export=1 + iadst_16 d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 + bx lr +endfunc + +function inv_identity_4h_x16_neon, export=1 + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q1, \i, d0[0] + vqadd.s16 \i, \i, \i + vqadd.s16 \i, \i, q1 +.endr + bx lr +endfunc + +.macro identity_4x16_shift2 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vshr.s16 q2, q2, #1 + vrhadd.s16 \i, \i, q2 +.endr +.endm + +.macro identity_4x16_shift1 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vrshr.s16 q2, q2, #1 + vqadd.s16 \i, \i, q2 +.endr +.endm + +.macro identity_8x8_shift1 c + identity_4x16_shift1 \c +.endm + +.macro identity_8x8 c +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vqrdmulh.s16 q2, \i, \c + vqadd.s16 \i, \i, \i + vqadd.s16 \i, \i, q2 +.endr +.endm + +.macro def_horz_16 scale=0, identity=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x4_neon + push {lr} + vmov.i16 d7, #0 +.if \identity + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.endif +.if \scale + movw r12, #2896*8 + vdup.16 d1, r12 +.endif +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr +.if \scale + scale_input d1[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif +.if \identity +.if \shift == -2 + identity_4x16_shift2 d0[0] +.else + identity_4x16_shift1 d0[0] +.endif +.else + blx r4 +.endif +.if \shift > 0 +.irp i, q8, q9, q10, q11, q12, q13, q14, 
q15 + vrshr.s16 \i, \i, #\shift +.endr +.endif + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + +.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31 + vst1.16 {\i}, [r6, :64]! +.endr + + pop {pc} +endfunc +.endm + +def_horz_16 scale=0, identity=0, shift=2 +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity +def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity + +function inv_txfm_add_vert_4x16_neon + push {lr} +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + blx r5 + load_add_store_4x16 r6, r7 + pop {pc} +endfunc + +.macro sub_sp_align space +#if CONFIG_THUMB + mov r7, sp + and r7, r7, #15 +#else + and r7, sp, #15 +#endif + sub sp, sp, r7 + // Now the stack is aligned, store the amount of adjustment back + // on the stack, as we don't want to waste a register as frame + // pointer. + str r7, [sp, #-16]! +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. 
+ .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub r7, sp, #4096 + ldr r12, [r7] + sub r7, r7, #(\space - 4096) + mov sp, r7 +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +.macro add_sp_align space +.if \space >= 4096 + add sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + add sp, sp, #(\space)%4096 +.endif + ldr r7, [sp], #16 + // Add back the original stack adjustment + add sp, sp, r7 +.endm + +function inv_txfm_add_16x16_neon + sub_sp_align 512 + ldrh r11, [r10], #2 +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*16*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + blx r9 +.endr + b 3f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 512 + vpop {q4} + pop {r4-r11,pc} +endfunc + +const eob_16x16 + .short 10, 36, 78, 256 +endconst + +const eob_16x16_identity + .short 4, 8, 12, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + push {r4-r11,lr} + vpush {q4} +.ifc \txfm1, identity + movrel_local r9, inv_txfm_horz_identity_16x4_neon +.else + movrel_local r9, inv_txfm_horz_16x4_neon + movrel_local r4, inv_\txfm1\()_4h_x16_neon +.endif + movrel_local r5, inv_\txfm2\()_4h_x16_neon +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel_local r10, eob_16x16 +.else + movrel_local r10, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel_local r10, eob_16x16_identity +.else + movrel_local r10, eob_16x16 +.endif +.endif + b 
inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst +def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +.macro def_fn_416_base variant +function inv_txfm_\variant\()add_16x4_neon + +.ifc \variant, identity_ + vmov.i16 d4, #0 +.irp i, d16, d18, d20, d22 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr +.irp i, d17, d19, d21, d23 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr + movw r12, #2*(5793-4096)*8 + vdup.16 d0, r12 +.irp i, d24, d26, d28, d30 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr +.irp i, d25, d27, d29, d31 + vld1.16 {\i}, [r2, :64] + vst1.16 {d4}, [r2, :64]! +.endr + + identity_4x16_shift1 d0[0] +.else + vmov.i16 q2, #0 + vmov.i16 q3, #0 + vld1.16 {d16, d17, d18, d19}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d20, d21, d22, d23}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d24, d25, d26, d27}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! + vld1.16 {d28, d29, d30, d31}, [r2, :128] + vst1.16 {q2, q3}, [r2, :128]! 
+ + blx r4 + + vswp d17, d20 + vswp d19, d22 + vswp d18, d20 + vswp d19, d21 +.irp i, q8, q9, q10, q11 + vrshr.s16 \i, \i, #1 +.endr +.endif + transpose_4x8h q8, q9, q10, q11 + blx r5 + mov r6, r0 + load_add_store_8x4 r6, r7 + +.ifc \variant, identity_ + vmov q8, q12 + vmov q9, q13 + vmov q10, q14 + vmov q11, q15 +.else + vswp d25, d28 + vswp d27, d30 + vswp d26, d28 + vswp d27, d29 + vrshr.s16 q8, q12, #1 + vrshr.s16 q9, q13, #1 + vrshr.s16 q10, q14, #1 + vrshr.s16 q11, q15, #1 +.endif + transpose_4x8h q8, q9, q10, q11 + blx r5 + add r6, r0, #8 + load_add_store_8x4 r6, r7 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_\variant\()add_4x16_neon + vmov.i16 q2, #0 + + mov r11, #32 + cmp r3, r10 + blt 1f + + add r6, r2, #16 +.ifc \variant, identity_ +.irp i, q12, q13, q14, q15 + vld1.16 {\i}, [r6, :128] + vst1.16 {q2}, [r6, :128], r11 +.endr + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + identity_8x4_shift1 q12, q13, q14, q15, d0[0] +.else +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r6, :128] + vst1.16 {q2}, [r6, :128], r11 +.endr + blx r4 + vrshr.s16 q12, q8, #1 + vrshr.s16 q13, q9, #1 + vrshr.s16 q14, q10, #1 + vrshr.s16 q15, q11, #1 +.endif + transpose_4x8h q12, q13, q14, q15 + vswp d27, d29 + vswp d26, d28 + vswp d27, d30 + vswp d25, d28 + + b 2f +1: +.irp i, q12, q13, q14, q15 + vmov.i16 \i, #0 +.endr +2: + vmov.i16 q2, #0 +.irp i, q8, q9, q10, q11 + vld1.16 {\i}, [r2, :128] + vst1.16 {q2}, [r2, :128], r11 +.endr +.ifc \variant, identity_ + movw r12, #(5793-4096)*8 + vdup.16 d0, r12 + identity_8x4_shift1 q8, q9, q10, q11, d0[0] +.else + blx r4 +.irp i, q8, q9, q10, q11 + vrshr.s16 \i, \i, #1 +.endr +.endif + transpose_4x8h q8, q9, q10, q11 + vswp d19, d21 + vswp d18, d20 + vswp d19, d22 + vswp d17, d20 + + blx r5 + + load_add_store_4x16 r0, r6 + + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +def_fn_416_base +def_fn_416_base identity_ + +.macro def_fn_416 w, h, txfm1, txfm2, eob_half +function 
inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 4 + movrel_local r4, inv_\txfm1\()_8h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_4h_x\h\()_neon + mov r10, #\eob_half +.else + movrel_local r4, inv_\txfm1\()_4h_x\w\()_neon + movrel_local r5, inv_\txfm2\()_8h_x\h\()_neon +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct, 29 +def_fn_416 \w, \h, identity, identity, 29 +def_fn_416 \w, \h, dct, adst, 29 +def_fn_416 \w, \h, dct, flipadst, 29 +def_fn_416 \w, \h, dct, identity, 8 +def_fn_416 \w, \h, adst, dct, 29 +def_fn_416 \w, \h, adst, adst, 29 +def_fn_416 \w, \h, adst, flipadst, 29 +def_fn_416 \w, \h, flipadst, dct, 29 +def_fn_416 \w, \h, flipadst, adst, 29 +def_fn_416 \w, \h, flipadst, flipadst, 29 +def_fn_416 \w, \h, identity, dct, 32 +def_fn_416 \w, \h, adst, identity, 8 +def_fn_416 \w, \h, flipadst, identity, 8 +def_fn_416 \w, \h, identity, adst, 32 +def_fn_416 \w, \h, identity, flipadst, 32 +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + +.macro def_fn_816_base variant +function inv_txfm_\variant\()add_16x8_neon + sub_sp_align 256 + +.irp i, 0, 4 + add r6, sp, #(\i*16*2) +.if \i > 0 + cmp r3, r10 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #8*2 + blx r9 +.endr + b 2f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr +2: + +.irp i, 0, 8 + add r7, sp, #(\i*2) + mov r8, #32 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128], r8 +.endr + blx r5 + + add r6, r0, #(\i) + load_add_store_8x8 r6, r7 +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_\variant\()add_8x16_neon + sub_sp_align 256 + +.irp i, 0, 8 + add r6, sp, #(\i*8*2) +.if \i > 0 + cmp r3, r10 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + + vmov.i16 q2, #0 + movw r12, #2896*8 + vdup.16 d0, r12 + +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\j}, [r7, :128] + vst1.16 {q2}, [r7, :128], r8 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.ifc \variant, identity_ + // The identity shl #1 and downshift vrshr #1 cancel out +.else + blx r4 +.irp j, q8, q9, q10, q11, q12, q13, q14, q15 + vrshr.s16 \j, \j, #1 +.endr +.endif + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + vst1.16 {q8, q9}, [r6, :128]! + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r6, :128]! + vst1.16 {q14, q15}, [r6, :128]! +.endr + b 2f +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr +2: + +.irp i, 0, 4 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #16 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 256 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc +.endm + +def_fn_816_base +def_fn_816_base identity_ + +.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + push {r4-r11,lr} + vpush {q4-q7} +.if \w == 8 + movrel_local r4, inv_\txfm1\()_8h_x8_neon + movrel_local r5, inv_\txfm2\()_4h_x16_neon +.else +.ifc \txfm1, identity + movrel_local r9, inv_txfm_horz_scale_identity_16x4_neon +.else + movrel_local r4, inv_\txfm1\()_4h_x16_neon + movrel_local r9, inv_txfm_horz_scale_16x4_neon +.endif + movrel_local r5, inv_\txfm2\()_8h_x8_neon +.endif +.if \w == 8 + mov r10, #\eob_8x8 +.else + mov r10, #\eob_4x4 +.endif +.ifc \txfm1, identity + b inv_txfm_identity_add_\w\()x\h\()_neon +.else + b inv_txfm_add_\w\()x\h\()_neon +.endif +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct, 43, 10 +def_fn_816 \w, \h, identity, identity, 43, 10 +def_fn_816 \w, \h, dct, adst, 43, 10 +def_fn_816 \w, \h, dct, flipadst, 43, 10 +def_fn_816 \w, \h, dct, identity, 8, 4 +def_fn_816 \w, \h, adst, dct, 43, 10 +def_fn_816 \w, \h, adst, adst, 43, 10 +def_fn_816 \w, \h, adst, flipadst, 43, 10 +def_fn_816 \w, \h, flipadst, dct, 43, 10 +def_fn_816 \w, \h, flipadst, adst, 43, 10 +def_fn_816 \w, \h, flipadst, flipadst, 43, 10 +def_fn_816 \w, \h, identity, dct, 64, 4 +def_fn_816 \w, \h, adst, identity, 8, 4 +def_fn_816 \w, \h, flipadst, identity, 8, 4 +def_fn_816 \w, \h, identity, adst, 64, 4 +def_fn_816 \w, \h, identity, flipadst, 64, 4 +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_4h_x16_neon, export=1 + movrel_local r12, idct_coeffs, 2*16 + vld1.16 {q0, q1}, [r12, :128] + sub r12, r12, #2*16 + + vmull_vmlsl q2, d16, d31, d0[0], d0[1] // -> t16a + vmull_vmlal q3, d16, d31, d0[1], d0[0] 
// -> t31a + vmull_vmlsl q4, d24, d23, d0[2], d0[3] // -> t17a + vrshrn.i32 d16, q2, #12 // t16a + vrshrn.i32 d31, q3, #12 // t31a + vmull_vmlal q2, d24, d23, d0[3], d0[2] // -> t30a + vmull_vmlsl q3, d20, d27, d1[0], d1[1] // -> t18a + vrshrn.i32 d24, q4, #12 // t17a + vrshrn.i32 d23, q2, #12 // t30a + vmull_vmlal q4, d20, d27, d1[1], d1[0] // -> t29a + vmull_vmlsl q2, d28, d19, d1[2], d1[3] // -> t19a + vrshrn.i32 d20, q3, #12 // t18a + vrshrn.i32 d27, q4, #12 // t29a + vmull_vmlal q3, d28, d19, d1[3], d1[2] // -> t28a + vmull_vmlsl q4, d18, d29, d2[0], d2[1] // -> t20a + vrshrn.i32 d28, q2, #12 // t19a + vrshrn.i32 d19, q3, #12 // t28a + vmull_vmlal q2, d18, d29, d2[1], d2[0] // -> t27a + vmull_vmlsl q3, d26, d21, d2[2], d2[3] // -> t21a + vrshrn.i32 d18, q4, #12 // t20a + vrshrn.i32 d29, q2, #12 // t27a + vmull_vmlal q4, d26, d21, d2[3], d2[2] // -> t26a + vmull_vmlsl q2, d22, d25, d3[0], d3[1] // -> t22a + vrshrn.i32 d26, q3, #12 // t21a + vrshrn.i32 d21, q4, #12 // t26a + vmull_vmlal q3, d22, d25, d3[1], d3[0] // -> t25a + vmull_vmlsl q4, d30, d17, d3[2], d3[3] // -> t23a + vrshrn.i32 d22, q2, #12 // t22a + vrshrn.i32 d25, q3, #12 // t25a + vmull_vmlal q2, d30, d17, d3[3], d3[2] // -> t24a + vrshrn.i32 d30, q4, #12 // t23a + vrshrn.i32 d17, q2, #12 // t24a + + vld1.16 {q0}, [r12, :128] + + vqsub.s16 d2, d16, d24 // t17 + vqadd.s16 d16, d16, d24 // t16 + vqsub.s16 d3, d31, d23 // t30 + vqadd.s16 d31, d31, d23 // t31 + vqsub.s16 d24, d28, d20 // t18 + vqadd.s16 d28, d28, d20 // t19 + vqadd.s16 d23, d18, d26 // t20 + vqsub.s16 d18, d18, d26 // t21 + vqsub.s16 d20, d30, d22 // t22 + vqadd.s16 d30, d30, d22 // t23 + vqadd.s16 d26, d17, d25 // t24 + vqsub.s16 d17, d17, d25 // t25 + vqsub.s16 d22, d29, d21 // t26 + vqadd.s16 d29, d29, d21 // t27 + vqadd.s16 d25, d19, d27 // t28 + vqsub.s16 d19, d19, d27 // t29 + + vmull_vmlsl q2, d3, d2, d1[0], d1[1] // -> t17a + vmull_vmlal q3, d3, d2, d1[1], d1[0] // -> t30a + vmull_vmlal q4, d19, d24, d1[1], d1[0] // -> t18a + 
vrshrn.i32 d21, q2, #12 // t17a + vrshrn.i32 d27, q3, #12 // t30a + vneg.s32 q4, q4 // -> t18a + vmull_vmlsl q1, d19, d24, d1[0], d1[1] // -> t29a + vmull_vmlsl q2, d22, d18, d1[2], d1[3] // -> t21a + vrshrn.i32 d19, q4, #12 // t18a + vrshrn.i32 d24, q1, #12 // t29a + vmull_vmlal q3, d22, d18, d1[3], d1[2] // -> t26a + vmull_vmlal q4, d17, d20, d1[3], d1[2] // -> t22a + vrshrn.i32 d22, q2, #12 // t21a + vrshrn.i32 d18, q3, #12 // t26a + vneg.s32 q4, q4 // -> t22a + vmull_vmlsl q1, d17, d20, d1[2], d1[3] // -> t25a + vrshrn.i32 d17, q4, #12 // t22a + vrshrn.i32 d20, q1, #12 // t25a + + vqsub.s16 d2, d27, d24 // t29 + vqadd.s16 d27, d27, d24 // t30 + vqsub.s16 d3, d21, d19 // t18 + vqadd.s16 d21, d21, d19 // t17 + vqsub.s16 d24, d16, d28 // t19a + vqadd.s16 d16, d16, d28 // t16a + vqsub.s16 d19, d30, d23 // t20a + vqadd.s16 d30, d30, d23 // t23a + vqsub.s16 d28, d17, d22 // t21 + vqadd.s16 d17, d17, d22 // t22 + vqadd.s16 d23, d26, d29 // t24a + vqsub.s16 d26, d26, d29 // t27a + vqadd.s16 d22, d20, d18 // t25 + vqsub.s16 d20, d20, d18 // t26 + vqsub.s16 d29, d31, d25 // t28a + vqadd.s16 d31, d31, d25 // t31a + + vmull_vmlsl q2, d2, d3, d0[2], d0[3] // -> t18a + vmull_vmlal q3, d2, d3, d0[3], d0[2] // -> t29a + vmull_vmlsl q4, d29, d24, d0[2], d0[3] // -> t19 + vrshrn.i32 d18, q2, #12 // t18a + vrshrn.i32 d25, q3, #12 // t29a + vmull_vmlal q1, d29, d24, d0[3], d0[2] // -> t28 + vmull_vmlal q2, d26, d19, d0[3], d0[2] // -> t20 + vrshrn.i32 d29, q4, #12 // t19 + vrshrn.i32 d24, q1, #12 // t28 + vneg.s32 q2, q2 // -> t20 + vmull_vmlsl q3, d26, d19, d0[2], d0[3] // -> t27 + vmull_vmlal q4, d20, d28, d0[3], d0[2] // -> t21a + vrshrn.i32 d26, q2, #12 // t20 + vrshrn.i32 d19, q3, #12 // t27 + vneg.s32 q4, q4 // -> t21a + vmull_vmlsl q1, d20, d28, d0[2], d0[3] // -> t26a + vrshrn.i32 d20, q4, #12 // t21a + vrshrn.i32 d28, q1, #12 // t26a + + vqsub.s16 d2, d16, d30 // t23 + vqadd.s16 d16, d16, d30 // t16 = out16 + vqsub.s16 d3, d31, d23 // t24 + vqadd.s16 d31, d31, d23 // t31 
= out31 + vqsub.s16 d23, d21, d17 // t22a + vqadd.s16 d17, d21, d17 // t17a = out17 + vqadd.s16 d30, d27, d22 // t30a = out30 + vqsub.s16 d21, d27, d22 // t25a + vqsub.s16 d27, d18, d20 // t21 + vqadd.s16 d18, d18, d20 // t18 = out18 + vqadd.s16 d4, d29, d26 // t19a = out19 + vqsub.s16 d26, d29, d26 // t20a + vqadd.s16 d29, d25, d28 // t29 = out29 + vqsub.s16 d25, d25, d28 // t26 + vqadd.s16 d28, d24, d19 // t28a = out28 + vqsub.s16 d24, d24, d19 // t27a + vmov d19, d4 // out19 + + vmull_vmlsl q2, d24, d26, d0[0], d0[0] // -> t20 + vmull_vmlal q3, d24, d26, d0[0], d0[0] // -> t27 + vrshrn.i32 d20, q2, #12 // t20 + vrshrn.i32 d22, q3, #12 // t27 + + vmull_vmlal q2, d25, d27, d0[0], d0[0] // -> t26a + vmull_vmlsl q3, d25, d27, d0[0], d0[0] // -> t21a + vmov d27, d22 // t27 + vrshrn.i32 d26, q2, #12 // t26a + + vmull_vmlsl q12, d21, d23, d0[0], d0[0] // -> t22 + vmull_vmlal q2, d21, d23, d0[0], d0[0] // -> t25 + vrshrn.i32 d21, q3, #12 // t21a + vrshrn.i32 d22, q12, #12 // t22 + vrshrn.i32 d25, q2, #12 // t25 + + vmull_vmlsl q2, d3, d2, d0[0], d0[0] // -> t23a + vmull_vmlal q3, d3, d2, d0[0], d0[0] // -> t24a + vrshrn.i32 d23, q2, #12 // t23a + vrshrn.i32 d24, q3, #12 // t24a + + bx lr +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x4_neon + push {lr} + vmov.i16 d7, #0 + lsl r8, r8, #1 +.if \scale + movw r12, #2896*8 + vdup.16 d0, r12 +.endif + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 +.if \scale + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct_4h_x16_neon + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q12, q13, d24, d25, d26, d27 + transpose_4x4h q14, q15, d28, d29, d30, d31 + +.macro store1 r0, r1, r2, r3 + vst1.16 {\r0}, [r6, :64]! + vst1.16 {\r1}, [r6, :64]! 
+ vst1.16 {\r2}, [r6, :64]! + vst1.16 {\r3}, [r6, :64]! + add r6, r6, #32 +.endm + store1 d16, d20, d24, d28 + store1 d17, d21, d25, d29 + store1 d18, d22, d26, d30 + store1 d19, d23, d27, d31 +.purgem store1 + sub r6, r6, #64*4 + + vmov.i16 d7, #0 +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64] + vst1.16 {d7}, [r7, :64], r8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in d0[1] + scale_input d0[1], q8, q9, q10, q11, q12, q13, q14, q15 +.endif + bl inv_dct32_odd_4h_x16_neon + transpose_4x4h q15, q14, d31, d30, d29, d28 + transpose_4x4h q13, q12, d27, d26, d25, d24 + transpose_4x4h q11, q10, d23, d22, d21, d20 + transpose_4x4h q9, q8, d19, d18, d17, d16 +.macro store2 r0, r1, r2, r3, shift + vld1.16 {q0, q1}, [r6, :128] + vqsub.s16 d7, d0, \r0 + vqadd.s16 d0, d0, \r0 + vqsub.s16 d6, d1, \r1 + vqadd.s16 d1, d1, \r1 + vqsub.s16 d5, d2, \r2 + vqadd.s16 d2, d2, \r2 + vqsub.s16 d4, d3, \r3 + vqadd.s16 d3, d3, \r3 + vrev64.16 q2, q2 + vrev64.16 q3, q3 + vrshr.s16 q0, q0, #\shift + vrshr.s16 q1, q1, #\shift + vrshr.s16 q2, q2, #\shift + vrshr.s16 q3, q3, #\shift + vst1.16 {q0, q1}, [r6, :128]! + vst1.16 {q2, q3}, [r6, :128]! 
+.endm + + store2 d31, d27, d23, d19, \shift + store2 d30, d26, d22, d18, \shift + store2 d29, d25, d21, d17, \shift + store2 d28, d24, d20, d16, \shift +.purgem store2 + pop {pc} +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_4x32_neon + push {r10-r11,lr} + lsl r8, r8, #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + + bl inv_dct_4h_x16_neon + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vst1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + add r7, r7, r8, lsr #1 + +.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 + vld1.16 {\i}, [r7, :64], r8 +.endr + sub r7, r7, r8, lsl #4 + sub r7, r7, r8, lsr #1 + bl inv_dct32_odd_4h_x16_neon + + neg r9, r8 + mov r10, r6 +.macro combine r0, r1, r2, r3, op, stride + vld1.16 {d4}, [r7, :64], \stride + vld1.32 {d2[0]}, [r10, :32], r1 + vld1.16 {d5}, [r7, :64], \stride + vld1.32 {d2[1]}, [r10, :32], r1 + \op\().s16 d4, d4, \r0 + vld1.16 {d6}, [r7, :64], \stride + vld1.32 {d3[0]}, [r10, :32], r1 + \op\().s16 d5, d5, \r1 + vld1.32 {d3[1]}, [r10, :32], r1 + vrshr.s16 q2, q2, #4 + \op\().s16 d6, d6, \r2 + vld1.16 {d7}, [r7, :64], \stride + vaddw.u8 q2, q2, d2 + \op\().s16 d7, d7, \r3 + vqmovun.s16 d2, q2 + vrshr.s16 q3, q3, #4 + vst1.32 {d2[0]}, [r6, :32], r1 + vaddw.u8 q3, q3, d3 + vst1.32 {d2[1]}, [r6, :32], r1 + vqmovun.s16 d3, q3 + vst1.32 {d3[0]}, [r6, :32], r1 + vst1.32 {d3[1]}, [r6, :32], r1 +.endm + combine d31, d30, d29, d28, vqadd, r8 + combine d27, d26, d25, d24, vqadd, r8 + combine d23, d22, d21, d20, vqadd, r8 + combine d19, d18, d17, d16, vqadd, r8 + sub r7, r7, r8 + combine d16, d17, d18, d19, vqsub, r9 + combine d20, d21, d22, d23, vqsub, r9 + combine d24, d25, d26, d27, vqsub, r9 + combine d28, d29, d30, d31, vqsub, r9 +.purgem combine + + pop 
{r10-r11,pc} +endfunc + +const eob_32x32 + .short 10, 36, 78, 136, 210, 300, 406, 1024 +endconst + +const eob_16x32 + .short 10, 36, 78, 151, 215, 279, 343, 512 +endconst + +const eob_16x32_shortside + .short 10, 36, 78, 512 +endconst + +const eob_8x32 + // Contrary to the others, this one is only ever used in increments of 8x8 + .short 43, 107, 171, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 + push {r4-r7,lr} + vmov.i16 q0, #0 + movrel_local r5, eob_32x32, 2 + + mov r6, #2*32 +1: + mov r12, #0 + movrel_local r4, eob_32x32, 2 +2: + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r6 +.endr + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + load_add_store_8x8 r0, r7, shiftbits=2 + ldrh lr, [r4], #4 + sub r0, r0, r1, lsl #3 + cmp r3, lr + add r0, r0, #8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12 + add r0, r0, r1, lsl #3 + mls r2, r6, r12, r2 + add r2, r2, #2*8 + b 1b +9: + pop {r4-r7,pc} +endfunc + +.macro shift_8_regs op, shift +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + push {r4-r7,lr} + movw r6, #2896*8 + movw r7, #2*(5793-4096)*8 + vdup.i16 d0, r6 + movrel_local r5, eob_16x32\hshort, 2 + vmov.16 d0[1], r7 + + mov r6, #2*\h +1: + mov r12, #0 + movrel_local r4, eob_16x32\wshort, 2 +2: + vmov.i16 q1, #0 + add r12, r12, #8 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q1}, [r2, :128], r6 +.endr + scale_input d0[0], q8, q9, q10, q11, q12, q13, q14, q15 + +.if \w == 16 + // 16x32 + identity_8x8_shift1 d0[1] +.else + // 32x16 + shift_8_regs vqshl.s16, 1 + identity_8x8 d0[1] +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + +.if \w == 16 
+ load_add_store_8x8 r0, r7, shiftbits=2 +.else + load_add_store_8x8 r0, r7, shiftbits=4 +.endif + ldrh lr, [r4], #4 + sub r0, r0, r1, lsl #3 + cmp r3, lr + add r0, r0, #8 + bge 2b + + ldrh lr, [r5], #4 + cmp r3, lr + blt 9f + + sub r0, r0, r12 + add r0, r0, r1, lsl #3 + mls r2, r6, r12, r2 + add r2, r2, #2*8 + b 1b +9: + pop {r4-r7,pc} +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 + push {r4-r5,lr} + vmov.i16 q0, #0 + movrel_local r4, eob_8x32 + + mov r12, #2*\h +1: + ldrh lr, [r4], #2 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r12 +.endr + +.if \w == 8 + // 8x32 + shift_8_regs vrshr.s16, 1 +.endif + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + cmp r3, lr +.if \w == 8 + load_add_store_8x8 r0, r5, shiftbits=2 +.else + load_add_store_8x8 r0, r5, shiftbits=3 +.endif + + blt 9f +.if \w == 8 + sub r2, r2, r12, lsl #3 + add r2, r2, #2*8 +.else + sub r0, r0, r1, lsl #3 + add r0, r0, #8 +.endif + b 1b + +9: + pop {r4-r5,pc} +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 + idct_dc 32, 32, 2 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 2048 + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, sp, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 2048 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 + idct_dc 16, 32, 1 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r4, inv_dct_4h_x16_neon + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, sp, #(\i*16*2) + add r7, r2, #(\i*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #2*32 + bl inv_txfm_horz_scale_16x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #16*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 1024 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 + idct_dc 32, 16, 1 + + push {r4-r11,lr} + vpush {q4} + sub_sp_align 1024 + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + movrel_local r5, inv_dct_4h_x16_neon + +.irp i, 0, 4, 8, 12 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.if \i < 12 + ldrh r11, [r10], #2 +.endif +.endif + mov r8, #2*16 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #32*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 1024 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 + idct_dc 8, 32, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + + movrel_local r10, eob_8x32 + + mov r8, #2*32 + mov r9, #32 + mov r6, sp +1: + vmov.i16 q0, #0 +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r2, :128] + vst1.16 {q0}, [r2, :128], r8 +.endr + ldrh r11, [r10], #2 + sub r2, r2, r8, lsl #3 + sub r9, r9, #8 + add r2, r2, #2*8 + + bl inv_dct_8h_x8_neon + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vrshr.s16 \i, \i, #2 +.endr + + transpose_8x8h q8, q9, q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30 + + vst1.16 {q8, q9}, [r6, :128]! + cmp r3, r11 + vst1.16 {q10, q11}, [r6, :128]! + vst1.16 {q12, q13}, [r6, :128]! + vst1.16 {q14, q15}, [r6, :128]! + + bge 1b + cmp r9, #0 + beq 3f + + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r9, r9, #8 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4 + add r6, r0, #(\i) + add r7, sp, #(\i*2) + mov r8, #8*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 + idct_dc 32, 8, 2 + + push {r4-r11,lr} + vpush {q4-q7} + sub_sp_align 512 + +.irp i, 0, 4 + add r6, sp, #(\i*32*2) + add r7, r2, #(\i*2) +.if \i > 0 + cmp r3, #10 + blt 1f +.endif + mov r8, #8*2 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 2f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + +2: + mov r8, #2*32 + mov r9, #0 +1: + add r6, r0, r9 + add r7, sp, r9, lsl #1 // #(\i*2) + +.irp i, q8, q9, q10, q11, q12, q13, q14, q15 + vld1.16 {\i}, [r7, :128], r8 +.endr + add r9, r9, #8 + + bl inv_dct_8h_x8_neon + + cmp r9, #32 + + load_add_store_8x8 r6, r7 + + blt 1b + + add_sp_align 512 + vpop {q4-q7} + pop {r4-r11,pc} +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + vld1.16 {d0, d1, d2}, [r12, :64]! + + vqrdmulh.s16 d23, d16, d0[1] // t63a + vqrdmulh.s16 d16, d16, d0[0] // t32a + vqrdmulh.s16 d22, d17, d0[2] // t62a + vqrdmulh.s16 d17, d17, d0[3] // t33a + vqrdmulh.s16 d21, d18, d1[1] // t61a + vqrdmulh.s16 d18, d18, d1[0] // t34a + vqrdmulh.s16 d20, d19, d1[2] // t60a + vqrdmulh.s16 d19, d19, d1[3] // t35a + + vqadd.s16 d24, d16, d17 // t32 + vqsub.s16 d25, d16, d17 // t33 + vqsub.s16 d26, d19, d18 // t34 + vqadd.s16 d27, d19, d18 // t35 + vqadd.s16 d28, d20, d21 // t60 + vqsub.s16 d29, d20, d21 // t61 + vqsub.s16 d30, d23, d22 // t62 + vqadd.s16 d31, d23, d22 // t63 + + vmull_vmlal q2, d29, d26, d2[0], d2[1] // -> t34a + vmull_vmlsl q3, d29, d26, d2[1], d2[0] // -> t61a + vneg.s32 q2, q2 // t34a + vmull_vmlsl q4, d30, d25, d2[1], d2[0] // -> t33a + vrshrn.i32 d26, q2, #12 // t34a + vmull_vmlal q2, d30, d25, d2[0], d2[1] // -> t62a + vrshrn.i32 d29, q3, #12 // t61a + vrshrn.i32 d25, q4, #12 // t33a + vrshrn.i32 d30, q2, #12 // t62a + + vqadd.s16 d16, d24, d27 // t32a + vqsub.s16 d19, d24, d27 // t35a + vqadd.s16 d17, d25, d26 // t33 + vqsub.s16 d18, d25, d26 // t34 + vqsub.s16 d20, d31, d28 // t60a + vqadd.s16 d23, d31, d28 // t63a + vqsub.s16 d21, d30, d29 // t61 + vqadd.s16 d22, d30, d29 // t62 + + vmull_vmlal q2, d21, d18, d2[2], d2[3] // -> t61a + vmull_vmlsl q3, d21, d18, d2[3], d2[2] // -> t34a + vmull_vmlal q4, d20, d19, d2[2], d2[3] // -> t60 + 
vrshrn.i32 d21, q2, #12 // t61a + vrshrn.i32 d18, q3, #12 // t34a + vmull_vmlsl q2, d20, d19, d2[3], d2[2] // -> t35 + vrshrn.i32 d20, q4, #12 // t60 + vrshrn.i32 d19, q2, #12 // t35 + + vst1.16 {d16, d17, d18, d19}, [r6, :128]! + vst1.16 {d20, d21, d22, d23}, [r6, :128]! + + bx lr +endfunc + +function inv_dct64_step2_neon + movrel_local r12, idct_coeffs + vld1.16 {d0}, [r12, :64] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + vldr d16, [r6, #2*4*0] // t32a + vldr d17, [r9, #2*4*8] // t39a + vldr d18, [r9, #2*4*0] // t63a + vldr d19, [r6, #2*4*8] // t56a + vldr d20, [r6, #2*4*16] // t40a + vldr d21, [r9, #2*4*24] // t47a + vldr d22, [r9, #2*4*16] // t55a + vldr d23, [r6, #2*4*24] // t48a + + vqadd.s16 d24, d16, d17 // t32 + vqsub.s16 d25, d16, d17 // t39 + vqadd.s16 d26, d18, d19 // t63 + vqsub.s16 d27, d18, d19 // t56 + vqsub.s16 d28, d21, d20 // t40 + vqadd.s16 d29, d21, d20 // t47 + vqadd.s16 d30, d23, d22 // t48 + vqsub.s16 d31, d23, d22 // t55 + + vmull_vmlal q2, d27, d25, d0[3], d0[2] // -> t56a + vmull_vmlsl q3, d27, d25, d0[2], d0[3] // -> t39a + vmull_vmlal q4, d31, d28, d0[3], d0[2] // -> t40a + vrshrn.i32 d25, q2, #12 // t56a + vrshrn.i32 d27, q3, #12 // t39a + vneg.s32 q4, q4 // t40a + vmull_vmlsl q2, d31, d28, d0[2], d0[3] // -> t55a + vrshrn.i32 d31, q4, #12 // t40a + vrshrn.i32 d28, q2, #12 // t55a + + vqadd.s16 d16, d24, d29 // t32a + vqsub.s16 d19, d24, d29 // t47a + vqadd.s16 d17, d27, d31 // t39 + vqsub.s16 d18, d27, d31 // t40 + vqsub.s16 d20, d26, d30 // t48a + vqadd.s16 d23, d26, d30 // t63a + vqsub.s16 d21, d25, d28 // t55 + vqadd.s16 d22, d25, d28 // t56 + + vmull_vmlsl q2, d21, d18, d0[0], d0[0] // -> t40a + vmull_vmlal q3, d21, d18, d0[0], d0[0] // -> t55a + vmull_vmlsl q4, d20, d19, d0[0], d0[0] // -> t47 + vrshrn.i32 d18, q2, #12 // t40a + vrshrn.i32 d21, q3, #12 // t55a + vmull_vmlal q2, d20, d19, d0[0], d0[0] // -> t48 + vrshrn.i32 d19, q4, 
#12 // t47 + vrshrn.i32 d20, q2, #12 // t48 + + vstr d16, [r6, #2*4*0] // t32a + vstr d17, [r9, #2*4*0] // t39 + vstr d18, [r6, #2*4*8] // t40a + vstr d19, [r9, #2*4*8] // t47 + vstr d20, [r6, #2*4*16] // t48 + vstr d21, [r9, #2*4*16] // t55a + vstr d22, [r6, #2*4*24] // t56 + vstr d23, [r9, #2*4*24] // t63a + + add r6, r6, #2*4 + sub r9, r9, #2*4 + cmp r6, r9 + blt 1b + bx lr +endfunc + +.macro load8 src, strd, zero, clear +.irp i, d16, d17, d18, d19, d20, d21, d22, d23 +.if \clear + vld1.16 {\i}, [\src, :64] + vst1.16 {\zero}, [\src, :64], \strd +.else + vld1.16 {\i}, [\src, :64], \strd +.endif +.endr +.endm + +.macro store16 dst + vst1.16 {q8, q9}, [\dst, :128]! + vst1.16 {q10, q11}, [\dst, :128]! + vst1.16 {q12, q13}, [\dst, :128]! + vst1.16 {q14, q15}, [\dst, :128]! +.endm + +.macro clear_upper8 +.irp i, q12, q13, q14, q15 + vmov.i16 \i, #0 +.endr +.endm + +.macro vmov_if reg, val, cond +.if \cond + vmov.i16 \reg, \val +.endif +.endm + +.macro movdup_if reg, gpr, val, cond +.if \cond + movw \gpr, \val + vdup.16 \reg, \gpr +.endif +.endm + +.macro vst1_if regs, dst, dstalign, cond +.if \cond + vst1.16 \regs, \dst, \dstalign +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_4h_x64_neon, export=1 + mov r6, sp + + push {r10-r11,lr} + + lsl r8, r8, #2 + + movdup_if d0, r12, #2896*8, \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + add r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct_4h_x16_neon + + store16 r6 + + movdup_if d0, r12, #2896*8, \scale + vmov_if d7, #0, \clear + load8 r7, r8, d7, \clear + clear_upper8 + sub r7, r7, r8, lsl #3 + lsr r8, r8, #1 + sub r7, r7, r8, lsr #1 + scale_if \scale, d0[0], q8, q9, q10, q11 + + bl inv_dct32_odd_4h_x16_neon + + add r10, r6, #8*15 + sub r6, r6, #8*16 + + 
mov r9, #-8 + +.macro store_addsub r0, r1, r2, r3 + vld1.16 {d2}, [r6, :64]! + vld1.16 {d3}, [r6, :64]! + vqadd.s16 d6, d2, \r0 + vqsub.s16 \r0, d2, \r0 + vld1.16 {d4}, [r6, :64]! + vqadd.s16 d7, d3, \r1 + vqsub.s16 \r1, d3, \r1 + vld1.16 {d5}, [r6, :64]! + vqadd.s16 d2, d4, \r2 + sub r6, r6, #8*4 + vqsub.s16 \r2, d4, \r2 + vst1.16 {d6}, [r6, :64]! + vst1.16 {\r0}, [r10, :64], r9 + vqadd.s16 d3, d5, \r3 + vqsub.s16 \r3, d5, \r3 + vst1.16 {d7}, [r6, :64]! + vst1.16 {\r1}, [r10, :64], r9 + vst1.16 {d2}, [r6, :64]! + vst1.16 {\r2}, [r10, :64], r9 + vst1.16 {d3}, [r6, :64]! + vst1.16 {\r3}, [r10, :64], r9 +.endm + store_addsub d31, d30, d29, d28 + store_addsub d27, d26, d25, d24 + store_addsub d23, d22, d21, d20 + store_addsub d19, d18, d17, d16 +.purgem store_addsub + + add r6, r6, #2*4*16 + + movrel_local r12, idct64_coeffs + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + add r9, r7, r8, lsl #4 // offset 16 + add r10, r7, r8, lsl #3 // offset 8 + sub r9, r9, r8 // offset 15 + sub r11, r10, r8 // offset 7 + vld1.16 {d16}, [r7, :64] // in1 (offset 0) + vld1.16 {d17}, [r9, :64] // in31 (offset 15) + vld1.16 {d18}, [r10, :64] // in17 (offset 8) + vld1.16 {d19}, [r11, :64] // in15 (offset 7) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + add r7, r7, r8, lsl #2 // offset 4 + sub r9, r9, r8, lsl #2 // offset 11 + sub r10, r7, r8 // offset 3 + add r11, r9, r8 // offset 12 + vld1.16 {d16}, [r10, :64] // in7 (offset 3) + vld1.16 {d17}, [r11, :64] // in25 (offset 12) + vld1.16 {d18}, [r9, :64] // in23 (offset 11) + vld1.16 {d19}, [r7, :64] // in9 (offset 4) + vst1_if {d7}, [r7, :64], \clear + vst1_if {d7}, [r9, :64], \clear + vst1_if {d7}, [r10, :64], \clear + vst1_if {d7}, [r11, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + 
movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + sub r10, r10, r8, lsl #1 // offset 1 + sub r9, r9, r8, lsl #1 // offset 9 + add r10, r10, r8 // offset 2 + add r9, r9, r8 // offset 10 + add r7, r7, r8 // offset 5 + add r11, r11, r8 // offset 13 + vld1.16 d16, [r10, :64] // in5 (offset 2) + vld1.16 d17, [r11, :64] // in27 (offset 13) + vld1.16 d18, [r9, :64] // in21 (offset 10) + vld1.16 d19, [r7, :64] // in11 (offset 5) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + movdup_if d0, lr, #2896*8, \scale + vmov_if d7, #0, \clear + sub r10, r10, r8 // offset 1 + sub r9, r9, r8 // offset 9 + add r11, r11, r8 // offset 14 + add r7, r7, r8 // offset 6 + vld1.16 d16, [r10, :64] // in3 (offset 1) + vld1.16 d17, [r11, :64] // in29 (offset 14) + vld1.16 d18, [r9, :64] // in19 (offset 9) + vld1.16 d19, [r7, :64] // in13 (offset 6) + vst1_if d7, [r10, :64], \clear + vst1_if d7, [r11, :64], \clear + vst1_if d7, [r9, :64], \clear + vst1_if d7, [r7, :64], \clear + scale_if \scale, d0[0], q8, q9 + bl inv_dct64_step1_neon + + sub r6, r6, #2*4*32 + add r9, r6, #2*4*7 + + bl inv_dct64_step2_neon + + pop {r10-r11,pc} +endfunc +.endm + +def_dct64_func +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + +function inv_txfm_horz_dct_64x4_neon + vdup.16 q3, r9 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, #2*56 + + push {r10-r11,lr} + + mov r10, #2*64 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + transpose_4x4h q8, q9, d16, d17, d18, d19 + transpose_4x4h q15, q14, d31, d30, d29, d28 + transpose_4x4h q10, q11, d20, d21, d22, d23 + transpose_4x4h q13, q12, d27, d26, d25, d24 + +.macro store_addsub src0, src1, src2, src3 + vqsub.s16 d3, \src0, \src1 + vqsub.s16 d2, \src2, \src3 + vqadd.s16 d0, \src0, \src1 + vqadd.s16 d1, \src2, \src3 + vrshl.s16 q1, q1, q3 + vrshl.s16 q0, q0, q3 + vrev64.16 q1, q1 + vst1.16 {q0}, [r6, :128], r10 + vst1.16 {q1}, [r9, :128], r10 +.endm + store_addsub d16, d31, d20, d27 + store_addsub d17, d30, d21, d26 + store_addsub d18, d29, d22, d25 + store_addsub d19, d28, d23, d24 +.purgem store_addsub + sub r6, r6, r10, lsl #2 + sub r9, r9, r10, lsl #2 + add r6, r6, #16 + sub r9, r9, #16 + + cmp r7, r8 + blt 1b + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_vert_dct_4x64_neon + lsl r8, r8, #1 + + mov r7, sp + add r8, sp, #2*4*(64 - 4) + add r9, r6, r1, lsl #6 + sub r9, r9, r1 + + push {r10-r11,lr} + + neg r10, r1 + mov r11, #-2*4*4 + +1: + vld1.16 {d16, d17, d18, d19}, [r7, :128]! + vld1.16 {d28, d29, d30, d31}, [r8, :128], r11 + vld1.16 {d20, d21, d22, d23}, [r7, :128]! 
+ vld1.16 {d24, d25, d26, d27}, [r8, :128], r11 + +.macro add_dest_addsub src0, src1, src2, src3 + vld1.32 {d0[0]}, [r6, :32], r1 + vld1.32 {d1[0]}, [r9, :32], r10 + vqadd.s16 d4, \src0, \src1 + vld1.32 {d0[1]}, [r6, :32] + vqadd.s16 d5, \src2, \src3 + vld1.32 {d1[1]}, [r9, :32] + vqsub.s16 d6, \src0, \src1 + vqsub.s16 d7, \src2, \src3 + sub r6, r6, r1 + sub r9, r9, r10 + vrshr.s16 q2, q2, #4 + vrshr.s16 q3, q3, #4 + vaddw.u8 q2, q2, d0 + vaddw.u8 q3, q3, d1 + vqmovun.s16 d0, q2 + vqmovun.s16 d1, q3 + vst1.32 {d0[0]}, [r6, :32], r1 + vst1.32 {d1[0]}, [r9, :32], r10 + vst1.32 {d0[1]}, [r6, :32], r1 + vst1.32 {d1[1]}, [r9, :32], r10 +.endm + add_dest_addsub d16, d31, d17, d30 + add_dest_addsub d18, d29, d19, d28 + add_dest_addsub d20, d27, d21, d26 + add_dest_addsub d22, d25, d23, d24 +.purgem add_dest_addsub + cmp r7, r8 + blt 1b + + pop {r10-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 + idct_dc 64, 64, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_clear_4h_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 + idct_dc 64, 32, 1 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*64*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_clear_scale_4h_x64_neon + add r6, r5, #(\i*64*2) + mov r9, #-1 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i) + add r7, r5, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_dct_4x32_neon +.endr + + add_sp_align 64*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 + idct_dc 32, 64, 1 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 32*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_32x32 + ldrh r11, [r10], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*32*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f + ldrh r11, [r10], #2 +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r7, r5, #(\i*2) + mov r8, #32*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 32*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 + idct_dc 64, 16, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 64*16*2+64*4*2 + add r4, sp, #64*4*2 + + movrel_local r10, eob_16x32 + +.irp i, 0, 4, 8, 12 + add r6, r4, #(\i*64*2) +.if \i > 0 + mov r8, #(16 - \i) + cmp r3, r11 + blt 1f +.endif + add r7, r2, #(\i*2) + mov r8, #16*2 + bl inv_txfm_dct_clear_4h_x64_neon + add r6, r4, #(\i*64*2) + mov r9, #-2 // shift + bl inv_txfm_horz_dct_64x4_neon +.if \i < 8 + ldrh r11, [r10], #2 +.endif +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #2 +.rept 8 + vst1.16 {q2, q3}, [r6, :128]! +.endr + bgt 2b + +3: + movrel_local r5, inv_dct_4h_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 + add r6, r0, #(\i) + add r7, r4, #(\i*2) + mov r8, #64*2 + bl inv_txfm_add_vert_4x16_neon +.endr + + add_sp_align 64*16*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc + +function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 + idct_dc 16, 64, 2 + + push {r4-r11,lr} + vpush {q4} + + sub_sp_align 16*32*2+64*4*2 + add r5, sp, #64*4*2 + + movrel_local r10, eob_16x32 + ldrh r11, [r10], #2 + + movrel_local r4, inv_dct_4h_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add r6, r5, #(\i*16*2) +.if \i > 0 + mov r8, #(32 - \i) + cmp r3, r11 + blt 1f + ldrh r11, [r10], #2 +.endif + add r7, r2, #(\i*2) + mov r8, #32*2 + bl inv_txfm_horz_16x4_neon +.endr + b 3f + +1: + vmov.i16 q2, #0 + vmov.i16 q3, #0 +2: + subs r8, r8, #4 +.rept 4 + vst1.16 {q2, q3}, [r6, :128]! 
+.endr + bgt 2b + +3: +.irp i, 0, 4, 8, 12 + add r7, r5, #(\i*2) + mov r8, #16*2 + bl inv_txfm_dct_4h_x64_neon + add r6, r0, #(\i) + bl inv_txfm_add_vert_dct_4x64_neon +.endr + + add_sp_align 16*32*2+64*4*2 + vpop {q4} + pop {r4-r11,pc} +endfunc diff --git a/ffmpeg/JNI/dav1d/src/arm/32/mc.S b/ffmpeg/JNI/dav1d/src/arm/32/mc.S index 36f6c2e2b..47631c071 100644 --- a/ffmpeg/JNI/dav1d/src/arm/32/mc.S +++ b/ffmpeg/JNI/dav1d/src/arm/32/mc.S @@ -3168,3 +3168,184 @@ endfunc warp , 11 warp t, 7 + +// void dav1d_emu_edge_8bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *ref, const ptrdiff_t ref_stride) +function emu_edge_8bpc_neon, export=1 + push {r4-r11,lr} + ldrd r4, r5, [sp, #36] + ldrd r6, r7, [sp, #44] + ldrd r8, r9, [sp, #52] + + // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + // ref += iclip(x, 0, iw - 1) + sub r12, r3, #1 // ih - 1 + cmp r5, r3 + sub lr, r2, #1 // iw - 1 + it lt + movlt r12, r5 // min(y, ih - 1) + cmp r4, r2 + bic r12, r12, r12, asr #31 // max(min(y, ih - 1), 0) + it lt + movlt lr, r4 // min(x, iw - 1) + bic lr, lr, lr, asr #31 // max(min(x, iw - 1), 0) + mla r8, r12, r9, r8 // ref += iclip() * stride + add r8, r8, lr // ref += iclip() + + // bottom_ext = iclip(y + bh - ih, 0, bh - 1) + // top_ext = iclip(-y, 0, bh - 1) + add r10, r5, r1 // y + bh + neg r5, r5 // -y + sub r10, r10, r3 // y + bh - ih + sub r12, r1, #1 // bh - 1 + cmp r10, r1 + bic r5, r5, r5, asr #31 // max(-y, 0) + it ge + movge r10, r12 // min(y + bh - ih, bh-1) + cmp r5, r1 + bic r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0) + it ge + movge r5, r12 // min(max(-y, 0), bh-1) + + // right_ext = iclip(x + bw - iw, 0, bw - 1) + // left_ext = iclip(-x, 0, bw - 1) + add r11, r4, r0 // x + bw + neg r4, r4 // -x + sub r11, r11, r2 // x + bw - iw + sub lr, r0, #1 // bw - 1 + cmp r11, r0 + bic r4, r4, r4, asr #31 // max(-x, 0) + 
it ge + movge r11, lr // min(x + bw - iw, bw-1) + cmp r4, r0 + bic r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0) + it ge + movge r4, lr // min(max(-x, 0), bw - 1) + + // center_h = bh - top_ext - bottom_ext + // dst += top_ext * PXSTRIDE(dst_stride) + // center_w = bw - left_ext - right_ext + sub r1, r1, r5 // bh - top_ext + mla r6, r5, r7, r6 + sub r2, r0, r4 // bw - left_ext + sub r1, r1, r10 // center_h = bh - top_ext - bottom_ext + sub r2, r2, r11 // center_w = bw - left_ext - right_ext + + mov r0, r6 // backup of dst + +.macro v_loop need_left, need_right +0: +.if \need_left + vld1.8 {d0[]}, [r8] + mov r12, r6 // out = dst + mov r3, r4 + vmov d1, d0 +1: + subs r3, r3, #16 + vst1.8 {q0}, [r12, :128]! + bgt 1b +.endif + mov lr, r8 + add r12, r6, r4 // out = dst + left_ext + mov r3, r2 +1: + vld1.8 {q0, q1}, [lr]! + subs r3, r3, #32 +.if \need_left + vst1.8 {q0, q1}, [r12]! +.else + vst1.8 {q0, q1}, [r12, :128]! +.endif + bgt 1b +.if \need_right + add r3, r8, r2 // in + center_w + sub r3, r3, #1 // in + center_w - 1 + add r12, r6, r4 // dst + left_ext + vld1.8 {d0[]}, [r3] + add r12, r12, r2 // out = dst + left_ext + center_w + mov r3, r11 + vmov d1, d0 +1: + subs r3, r3, #16 + vst1.8 {q0}, [r12]! + bgt 1b +.endif + + subs r1, r1, #1 // center_h-- + add r6, r6, r7 + add r8, r8, r9 + bgt 0b +.endm + + cmp r4, #0 + beq 2f + // need_left + cmp r11, #0 + beq 3f + // need_left + need_right + v_loop 1, 1 + b 5f + +2: + // !need_left + cmp r11, #0 + beq 4f + // !need_left + need_right + v_loop 0, 1 + b 5f + +3: + // need_left + !need_right + v_loop 1, 0 + b 5f + +4: + // !need_left + !need_right + v_loop 0, 0 + +5: + cmp r10, #0 + // Storing the original dst in r0 overwrote bw, recalculate it here + add r2, r2, r4 // center_w + left_ext + add r2, r2, r11 // bw = center_w + left_ext + right_ext + + beq 3f + // need_bottom + sub r8, r6, r7 // ref = dst - stride + mov r4, r2 +1: + vld1.8 {q0, q1}, [r8, :128]! 
+ mov r3, r10 +2: + subs r3, r3, #1 + vst1.8 {q0, q1}, [r6, :128], r7 + bgt 2b + mls r6, r7, r10, r6 // dst -= bottom_ext * stride + subs r4, r4, #32 // bw -= 32 + add r6, r6, #32 // dst += 32 + bgt 1b + +3: + cmp r5, #0 + beq 3f + // need_top + mls r6, r7, r5, r0 // dst = stored_dst - top_ext * stride +1: + vld1.8 {q0, q1}, [r0, :128]! + mov r3, r5 +2: + subs r3, r3, #1 + vst1.8 {q0, q1}, [r6, :128], r7 + bgt 2b + mls r6, r7, r5, r6 // dst -= top_ext * stride + subs r2, r2, #32 // bw -= 32 + add r6, r6, #32 // dst += 32 + bgt 1b + +3: + pop {r4-r11,pc} +endfunc diff --git a/ffmpeg/JNI/dav1d/src/arm/32/msac.S b/ffmpeg/JNI/dav1d/src/arm/32/msac.S new file mode 100644 index 000000000..b06e109dd --- /dev/null +++ b/ffmpeg/JNI/dav1d/src/arm/32/msac.S @@ -0,0 +1,575 @@ +/* + * Copyright © 2019, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +#define BUF_POS 0 +#define BUF_END 4 +#define DIF 8 +#define RNG 12 +#define CNT 16 +#define ALLOW_UPDATE_CDF 20 + +const coeffs + .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 + .short 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +endconst + +const bits, align=4 + .short 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80 + .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000 +endconst + +.macro vld1_align_n d0, q0, q1, src, n +.if \n == 4 + vld1.16 {\d0}, [\src, :64] +.elseif \n == 8 + vld1.16 {\q0}, [\src, :128] +.else + vld1.16 {\q0, \q1}, [\src, :128] +.endif +.endm + +.macro vld1_n d0, q0, q1, src, n +.if \n == 4 + vld1.16 {\d0}, [\src] +.elseif \n == 8 + vld1.16 {\q0}, [\src] +.else + vld1.16 {\q0, \q1}, [\src] +.endif +.endm + +.macro vst1_align_n d0, q0, q1, src, n +.if \n == 4 + vst1.16 {\d0}, [\src, :64] +.elseif \n == 8 + vst1.16 {\q0}, [\src, :128] +.else + vst1.16 {\q0, \q1}, [\src, :128] +.endif +.endm + +.macro vst1_n d0, q0, q1, src, n +.if \n == 4 + vst1.16 {\d0}, [\src] +.elseif \n == 8 + vst1.16 {\q0}, [\src] +.else + vst1.16 {\q0, \q1}, [\src] +.endif +.endm + +.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vshr.u16 \d0, \s0, \s3 +.else + vshr.u16 \d1, \s1, \s4 +.if \n == 16 + vshr.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vadd.i16 \d0, \s0, \s3 
+.else + vadd.i16 \d1, \s1, \s4 +.if \n == 16 + vadd.i16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vsub.i16 \d0, \s0, \s3 +.else + vsub.i16 \d1, \s1, \s4 +.if \n == 16 + vsub.i16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vand \d0, \s0, \s3 +.else + vand \d1, \s1, \s4 +.if \n == 16 + vand \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vcge.u16 \d0, \s0, \s3 +.else + vcge.u16 \d1, \s1, \s4 +.if \n == 16 + vcge.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vrhadd.u16 \d0, \s0, \s3 +.else + vrhadd.u16 \d1, \s1, \s4 +.if \n == 16 + vrhadd.u16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vshl.s16 \d0, \s0, \s3 +.else + vshl.s16 \d1, \s1, \s4 +.if \n == 16 + vshl.s16 \d2, \s2, \s5 +.endif +.endif +.endm + +.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n +.if \n == 4 + vqdmulh.s16 \d0, \s0, \s3 +.else + vqdmulh.s16 \d1, \s1, \s4 +.if \n == 16 + vqdmulh.s16 \d2, \s2, \s5 +.endif +.endif +.endm + +// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf, +// size_t n_symbols); + +function msac_decode_symbol_adapt4_neon, export=1 +.macro decode_update n + push {r4-r10,lr} + sub sp, sp, #48 + add r8, r0, #RNG + + vld1_align_n d0, q0, q1, r1, \n // cdf + vld1.16 {d16[]}, [r8, :16] // rng + movrel_local r9, coeffs, 30 + vmov.i16 d30, #0x7f00 // 0x7f00 + sub r9, r9, r2, lsl #1 + vmvn.i16 q14, #0x3f // 0xffc0 + add r8, sp, #14 + vand d22, d16, d30 // rng & 0x7f00 + vst1.16 {d16[0]}, [r8, :16] // store original u = s->rng + vand_n d4, q2, q3, d0, q0, q1, d28, q14, q14, \n // cdf & 0xffc0 +.if \n > 4 + vmov d23, d22 +.endif + + vld1_n d16, q8, q9, r9, \n // EC_MIN_PROB * (n_symbols - ret) + vqdmulh_n d20, q10, q11, d4, q2, q3, d22, q11, q11, 
\n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add r8, r0, #DIF + 2 + + vadd_n d16, q8, q9, d4, q2, q3, d16, q8, q9, \n // v = cdf + EC_MIN_PROB * (n_symbols - ret) +.if \n == 4 + vmov.i16 d17, #0 +.endif + vadd_n d16, q8, q9, d20, q10, q11, d16, q8, q9, \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + + add r9, sp, #16 + vld1.16 {d20[]}, [r8, :16] // dif >> (EC_WIN_SIZE - 16) + movrel_local r8, bits + vst1_n q8, q8, q9, r9, \n // store v values to allow indexed access + + vmov d21, d20 + vld1_align_n q12, q12, q13, r8, \n +.if \n == 16 + vmov q11, q10 +.endif + + vcge_n q2, q2, q3, q10, q10, q11, q8, q8, q9, \n // c >= v + + vand_n q10, q10, q11, q2, q2, q3, q12, q12, q13, \n // One bit per halfword set in the mask +.if \n == 16 + vadd.i16 q10, q10, q11 +.endif + vadd.i16 d20, d20, d21 // Aggregate mask bits + ldr r4, [r0, #ALLOW_UPDATE_CDF] + vpadd.i16 d20, d20, d20 + lsl r10, r2, #1 + vpadd.i16 d20, d20, d20 + vmov.u16 r3, d20[0] + cmp r4, #0 + rbit r3, r3 + clz lr, r3 // ret + + beq L(renorm) + // update_cdf + ldrh r3, [r1, r10] // count = cdf[n_symbols] + vmov.i8 q10, #0xff +.if \n == 16 + mov r4, #-5 +.else + mvn r12, r2 + mov r4, #-4 + cmn r12, #3 // set C if n_symbols <= 2 +.endif + vrhadd_n d16, q8, q9, d20, q10, q10, d4, q2, q3, \n // i >= val ? -1 : 32768 +.if \n == 16 + sub r4, r4, r3, lsr #4 // -((count >> 4) + 5) +.else + lsr r12, r3, #4 // count >> 4 + sbc r4, r4, r12 // -((count >> 4) + (n_symbols > 2) + 4) +.endif + vsub_n d16, q8, q9, d16, q8, q9, d0, q0, q1, \n // (32768 - cdf[i]) or (-1 - cdf[i]) +.if \n == 4 + vdup.16 d20, r4 // -rate +.else + vdup.16 q10, r4 // -rate +.endif + + sub r3, r3, r3, lsr #5 // count - (count == 32) + vsub_n d0, q0, q1, d0, q0, q1, d4, q2, q3, \n // cdf + (i >= val ? 
1 : 0) + vshl_n d16, q8, q9, d16, q8, q9, d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate + add r3, r3, #1 // count + (count < 32) + vadd_n d0, q0, q1, d0, q0, q1, d16, q8, q9, \n // cdf + (32768 - cdf[i]) >> rate + vst1_align_n d0, q0, q1, r1, \n + strh r3, [r1, r10] +.endm + + decode_update 4 + +L(renorm): + add r8, sp, #16 + add r8, r8, lr, lsl #1 + ldrh r3, [r8] // v + ldrh r4, [r8, #-2] // u + ldr r6, [r0, #CNT] + ldr r7, [r0, #DIF] + sub r4, r4, r3 // rng = u - v + clz r5, r4 // clz(rng) + eor r5, r5, #16 // d = clz(rng) ^ 16 + mvn r7, r7 // ~dif + add r7, r7, r3, lsl #16 // ~dif + (v << 16) +L(renorm2): + lsl r4, r4, r5 // rng << d + subs r6, r6, r5 // cnt -= d + lsl r7, r7, r5 // (~dif + (v << 16)) << d + str r4, [r0, #RNG] + mvn r7, r7 // ~dif + bhs 9f + + // refill + ldr r3, [r0, #BUF_POS] // BUF_POS + ldr r4, [r0, #BUF_END] // BUF_END + add r5, r3, #4 + cmp r5, r4 + bgt 2f + + ldr r3, [r3] // next_bits + add r8, r6, #23 // shift_bits = cnt + 23 + add r6, r6, #16 // cnt += 16 + rev r3, r3 // next_bits = bswap(next_bits) + sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 + and r8, r8, #24 // shift_bits &= 24 + lsr r3, r3, r8 // next_bits >>= shift_bits + sub r8, r8, r6 // shift_bits -= 16 + cnt + str r5, [r0, #BUF_POS] + lsl r3, r3, r8 // next_bits <<= shift_bits + rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits + eor r7, r7, r3 // dif ^= next_bits + b 9f + +2: // refill_eob + rsb r5, r6, #8 // c = 8 - cnt +3: + cmp r3, r4 + bge 4f + ldrb r8, [r3], #1 + lsl r8, r8, r5 + eor r7, r7, r8 + subs r5, r5, #8 + bge 3b + +4: // refill_eob_end + str r3, [r0, #BUF_POS] + rsb r6, r5, #8 // cnt = 8 - c + +9: + str r6, [r0, #CNT] + str r7, [r0, #DIF] + + mov r0, lr + add sp, sp, #48 + + pop {r4-r10,pc} +endfunc + +function msac_decode_symbol_adapt8_neon, export=1 + decode_update 8 + b L(renorm) +endfunc + +function msac_decode_symbol_adapt16_neon, export=1 + decode_update 16 + b L(renorm) +endfunc + +function msac_decode_hi_tok_neon, export=1 + push {r4-r10,lr} + 
vld1.16 {d0}, [r1, :64] // cdf + add r4, r0, #RNG + vmov.i16 d31, #0x7f00 // 0x7f00 + movrel_local r5, coeffs, 30-2*3 + vmvn.i16 d30, #0x3f // 0xffc0 + ldrh r9, [r1, #6] // count = cdf[n_symbols] + vld1.16 {d1[]}, [r4, :16] // rng + movrel_local r4, bits + vld1.16 {d29}, [r5] // EC_MIN_PROB * (n_symbols - ret) + add r5, r0, #DIF + 2 + vld1.16 {q8}, [r4, :128] + mov r2, #-24 + vand d20, d0, d30 // cdf & 0xffc0 + ldr r10, [r0, #ALLOW_UPDATE_CDF] + vld1.16 {d2[]}, [r5, :16] // dif >> (EC_WIN_SIZE - 16) + sub sp, sp, #48 + ldr r6, [r0, #CNT] + ldr r7, [r0, #DIF] + vmov d3, d2 +1: + vand d23, d1, d31 // rng & 0x7f00 + vqdmulh.s16 d18, d20, d23 // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add r12, sp, #14 + vadd.i16 d6, d20, d29 // v = cdf + EC_MIN_PROB * (n_symbols - ret) + vadd.i16 d6, d18, d6 // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + vmov.i16 d7, #0 + vst1.16 {d1[0]}, [r12, :16] // store original u = s->rng + add r12, sp, #16 + vcge.u16 q2, q1, q3 // c >= v + vst1.16 {q3}, [r12] // store v values to allow indexed access + vand q9, q2, q8 // One bit per halfword set in the mask + + vadd.i16 d18, d18, d19 // Aggregate mask bits + vpadd.i16 d18, d18, d18 + vpadd.i16 d18, d18, d18 + vmov.u16 r3, d18[0] + cmp r10, #0 + add r2, r2, #5 + rbit r3, r3 + add r8, sp, #16 + clz lr, r3 // ret + + beq 2f + // update_cdf + vmov.i8 d22, #0xff + mov r4, #-5 + vrhadd.u16 d6, d22, d4 // i >= val ? -1 : 32768 + sub r4, r4, r9, lsr #4 // -((count >> 4) + 5) + vsub.i16 d6, d6, d0 // (32768 - cdf[i]) or (-1 - cdf[i]) + vdup.16 d18, r4 // -rate + + sub r9, r9, r9, lsr #5 // count - (count == 32) + vsub.i16 d0, d0, d4 // cdf + (i >= val ? 
1 : 0) + vshl.s16 d6, d6, d18 // ({32768,-1} - cdf[i]) >> rate + add r9, r9, #1 // count + (count < 32) + vadd.i16 d0, d0, d6 // cdf + (32768 - cdf[i]) >> rate + vst1.16 {d0}, [r1, :64] + vand d20, d0, d30 // cdf & 0xffc0 + strh r9, [r1, #6] + +2: + add r8, r8, lr, lsl #1 + ldrh r3, [r8] // v + ldrh r4, [r8, #-2] // u + sub r4, r4, r3 // rng = u - v + clz r5, r4 // clz(rng) + eor r5, r5, #16 // d = clz(rng) ^ 16 + mvn r7, r7 // ~dif + add r7, r7, r3, lsl #16 // ~dif + (v << 16) + lsl r4, r4, r5 // rng << d + subs r6, r6, r5 // cnt -= d + lsl r7, r7, r5 // (~dif + (v << 16)) << d + str r4, [r0, #RNG] + vdup.16 d1, r4 + mvn r7, r7 // ~dif + bhs 9f + + // refill + ldr r3, [r0, #BUF_POS] // BUF_POS + ldr r4, [r0, #BUF_END] // BUF_END + add r5, r3, #4 + cmp r5, r4 + bgt 2f + + ldr r3, [r3] // next_bits + add r8, r6, #23 // shift_bits = cnt + 23 + add r6, r6, #16 // cnt += 16 + rev r3, r3 // next_bits = bswap(next_bits) + sub r5, r5, r8, lsr #3 // buf_pos -= shift_bits >> 3 + and r8, r8, #24 // shift_bits &= 24 + lsr r3, r3, r8 // next_bits >>= shift_bits + sub r8, r8, r6 // shift_bits -= 16 + cnt + str r5, [r0, #BUF_POS] + lsl r3, r3, r8 // next_bits <<= shift_bits + rsb r6, r8, #16 // cnt = cnt + 32 - shift_bits + eor r7, r7, r3 // dif ^= next_bits + b 9f + +2: // refill_eob + rsb r5, r6, #8 // c = 8 - cnt +3: + cmp r3, r4 + bge 4f + ldrb r8, [r3], #1 + lsl r8, r8, r5 + eor r7, r7, r8 + subs r5, r5, #8 + bge 3b + +4: // refill_eob_end + str r3, [r0, #BUF_POS] + rsb r6, r5, #8 // cnt = 8 - c + +9: + lsl lr, lr, #1 + sub lr, lr, #5 + lsr r12, r7, #16 + adds r2, r2, lr // carry = tok_br < 3 || tok == 15 + vdup.16 q1, r12 + bcc 1b // loop if !carry + add r2, r2, #30 + str r6, [r0, #CNT] + add sp, sp, #48 + str r7, [r0, #DIF] + lsr r0, r2, #1 + pop {r4-r10,pc} +endfunc + +function msac_decode_bool_equi_neon, export=1 + push {r4-r10,lr} + ldr r5, [r0, #RNG] + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + bic r4, r5, #0xff // r &= 0xff00 + add r4, r4, #8 + mov 
r2, #0 + subs r8, r7, r4, lsl #15 // dif - vw + lsr r4, r4, #1 // v + sub r5, r5, r4 // r - v + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + b L(renorm2) +endfunc + +function msac_decode_bool_neon, export=1 + push {r4-r10,lr} + ldr r5, [r0, #RNG] + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + lsr r4, r5, #8 // r >> 8 + bic r1, r1, #0x3f // f &= ~63 + mul r4, r4, r1 + mov r2, #0 + lsr r4, r4, #7 + add r4, r4, #4 // v + subs r8, r7, r4, lsl #16 // dif - vw + sub r5, r5, r4 // r - v + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + b L(renorm2) +endfunc + +function msac_decode_bool_adapt_neon, export=1 + push {r4-r10,lr} + ldr r9, [r1] // cdf[0-1] + ldr r5, [r0, #RNG] + movw lr, #0xffc0 + ldr r6, [r0, #CNT] + sub sp, sp, #48 + ldr r7, [r0, #DIF] + lsr r4, r5, #8 // r >> 8 + and r2, r9, lr // f &= ~63 + mul r4, r4, r2 + mov r2, #0 + lsr r4, r4, #7 + add r4, r4, #4 // v + subs r8, r7, r4, lsl #16 // dif - vw + sub r5, r5, r4 // r - v + ldr r10, [r0, #ALLOW_UPDATE_CDF] + itee lo + movlo r2, #1 + movhs r4, r5 // if (ret) v = r - v; + movhs r7, r8 // if (ret) dif = dif - vw; + + cmp r10, #0 + clz r5, r4 // clz(rng) + mvn r7, r7 // ~dif + eor r5, r5, #16 // d = clz(rng) ^ 16 + mov lr, r2 + + beq L(renorm2) + + lsr r2, r9, #16 // count = cdf[1] + uxth r9, r9 // cdf[0] + + sub r3, r2, r2, lsr #5 // count - (count >= 32) + lsr r2, r2, #4 // count >> 4 + add r10, r3, #1 // count + (count < 32) + add r2, r2, #4 // rate = (count >> 4) | 4 + + sub r9, r9, lr // cdf[0] -= bit + sub r3, r9, lr, lsl #15 // {cdf[0], cdf[0] - 32769} + asr r3, r3, r2 // {cdf[0], cdf[0] - 32769} >> rate + sub r9, r9, r3 // cdf[0] + + strh r9, [r1] + strh r10, [r1, #2] + + b 
L(renorm2) +endfunc diff --git a/ffmpeg/JNI/dav1d/src/arm/32/util.S b/ffmpeg/JNI/dav1d/src/arm/32/util.S index ea4afc38d..6af0158e0 100644 --- a/ffmpeg/JNI/dav1d/src/arm/32/util.S +++ b/ffmpeg/JNI/dav1d/src/arm/32/util.S @@ -84,6 +84,23 @@ vtrn.8 \r6, \r7 .endm +.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7 + vswp \d0, \d4 + vswp \d1, \d5 + vswp \d2, \d6 + vswp \d3, \d7 + + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + vtrn.32 \r4, \r6 + vtrn.32 \r5, \r7 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 + vtrn.16 \r4, \r5 + vtrn.16 \r6, \r7 +.endm + .macro transpose_4x8b q0, q1, r0, r1, r2, r3 vtrn.16 \q0, \q1 @@ -91,4 +108,19 @@ vtrn.8 \r2, \r3 .endm +.macro transpose_4x4h q0, q1, r0, r1, r2, r3 + vtrn.32 \q0, \q1 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 +.endm + +.macro transpose_4x8h r0, r1, r2, r3 + vtrn.32 \r0, \r2 + vtrn.32 \r1, \r3 + + vtrn.16 \r0, \r1 + vtrn.16 \r2, \r3 +.endm + #endif /* DAV1D_SRC_ARM_32_UTIL_S */ diff --git a/ffmpeg/JNI/dav1d/src/arm/64/ipred.S b/ffmpeg/JNI/dav1d/src/arm/64/ipred.S index 9513212b3..e53665a20 100644 --- a/ffmpeg/JNI/dav1d/src/arm/64/ipred.S +++ b/ffmpeg/JNI/dav1d/src/arm/64/ipred.S @@ -28,11 +28,11 @@ #include "src/arm/asm.S" #include "util.S" -// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_dc_128_neon, export=1 +// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_128_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_dc_128_tbl) sub w3, w3, #25 @@ -97,11 +97,11 @@ L(ipred_dc_128_tbl): .hword L(ipred_dc_128_tbl) - 4b endfunc -// void ipred_v_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int 
max_width, const int max_height); -function ipred_v_neon, export=1 +// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_v_tbl) sub w3, w3, #25 @@ -132,7 +132,7 @@ function ipred_v_neon, export=1 b.gt 8b ret 160: - ld1 {v0.16b}, [x2], #16 + ld1 {v0.16b}, [x2] 16: st1 {v0.16b}, [x0], x1 st1 {v0.16b}, [x6], x1 @@ -170,11 +170,11 @@ L(ipred_v_tbl): .hword L(ipred_v_tbl) - 40b endfunc -// void ipred_h_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_h_neon, export=1 +// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_h_tbl) sub w3, w3, #25 @@ -251,11 +251,11 @@ L(ipred_h_tbl): .hword L(ipred_h_tbl) - 4b endfunc -// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_dc_top_neon, export=1 +// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_8bpc_neon, export=1 clz w3, w3 adr x5, L(ipred_dc_top_tbl) sub w3, w3, #25 @@ -351,11 +351,11 @@ L(ipred_dc_top_tbl): .hword L(ipred_dc_top_tbl) - 40b endfunc -// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function 
ipred_dc_left_neon, export=1 +// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_8bpc_neon, export=1 sub x2, x2, w4, uxtw clz w3, w3 clz w7, w4 @@ -472,11 +472,11 @@ L(ipred_dc_left_tbl): .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) endfunc -// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_dc_neon, export=1 +// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_8bpc_neon, export=1 sub x2, x2, w4, uxtw add w7, w3, w4 // width + height clz w3, w3 @@ -608,7 +608,7 @@ L(ipred_dc_w32): cmp w4, #32 add v0.4h, v0.4h, v1.4h add v0.4h, v0.4h, v2.4h - ushl v0.4h, v0.4h, v17.4h + ushl v4.4h, v0.4h, v17.4h b.eq 1f // h = 8/16/64 cmp w4, #8 @@ -616,10 +616,10 @@ L(ipred_dc_w32): mov w17, #(0x5556/2) csel w16, w16, w17, eq dup v16.4h, w16 - sqdmulh v0.4h, v0.4h, v16.4h + sqdmulh v4.4h, v4.4h, v16.4h 1: - dup v0.16b, v0.b[0] - dup v1.16b, v0.b[0] + dup v0.16b, v4.b[0] + dup v1.16b, v4.b[0] 2: st1 {v0.16b, v1.16b}, [x0], x1 st1 {v0.16b, v1.16b}, [x6], x1 @@ -640,10 +640,6 @@ L(ipred_dc_h64): add v0.4h, v0.4h, v2.4h br x3 L(ipred_dc_w64): - mov v1.16b, v0.16b - mov v2.16b, v0.16b - mov v3.16b, v0.16b -2: add x2, x2, #1 ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] add v0.4h, v0.4h, v16.4h @@ -656,19 +652,19 @@ L(ipred_dc_w64): cmp w4, #64 add v0.4h, v0.4h, v1.4h add v0.4h, v0.4h, v3.4h - ushl v0.4h, v0.4h, v17.4h + ushl v4.4h, v0.4h, v17.4h b.eq 1f // h = 16/32 mov w16, #(0x5556/2) movk w16, #(0x3334/2), lsl #16 lsr w16, w16, w4 dup v16.4h, w16 - sqdmulh v0.4h, v0.4h, v16.4h + sqdmulh v4.4h, v4.4h, 
v16.4h 1: - dup v0.16b, v0.b[0] - dup v1.16b, v0.b[0] - dup v2.16b, v0.b[0] - dup v3.16b, v0.b[0] + dup v0.16b, v4.b[0] + dup v1.16b, v4.b[0] + dup v2.16b, v4.b[0] + dup v3.16b, v4.b[0] 2: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1 @@ -691,11 +687,11 @@ L(ipred_dc_tbl): .hword L(ipred_dc_tbl) - L(ipred_dc_w4) endfunc -// void ipred_paeth_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_paeth_neon, export=1 +// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_8bpc_neon, export=1 clz w9, w3 adr x5, L(ipred_paeth_tbl) sub w9, w9, #25 @@ -868,11 +864,11 @@ L(ipred_paeth_tbl): .hword L(ipred_paeth_tbl) - 40b endfunc -// void ipred_smooth_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_smooth_neon, export=1 +// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_8bpc_neon, export=1 movrel x10, X(sm_weights) add x11, x10, w4, uxtw add x10, x10, w3, uxtw @@ -1046,11 +1042,11 @@ L(ipred_smooth_tbl): .hword L(ipred_smooth_tbl) - 40b endfunc -// void ipred_smooth_v_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_smooth_v_neon, export=1 +// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int 
a, +// const int max_width, const int max_height); +function ipred_smooth_v_8bpc_neon, export=1 movrel x7, X(sm_weights) add x7, x7, w4, uxtw clz w9, w3 @@ -1184,11 +1180,11 @@ L(ipred_smooth_v_tbl): .hword L(ipred_smooth_v_tbl) - 40b endfunc -// void ipred_smooth_h_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int a, -// const int max_width, const int max_height); -function ipred_smooth_h_neon, export=1 +// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_h_8bpc_neon, export=1 movrel x8, X(sm_weights) add x8, x8, w3, uxtw clz w9, w3 @@ -1327,11 +1323,11 @@ L(ipred_smooth_h_tbl): .hword L(ipred_smooth_h_tbl) - 40b endfunc -// void ipred_filter_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, const int filt_idx, -// const int max_width, const int max_height); -function ipred_filter_neon, export=1 +// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height); +function ipred_filter_8bpc_neon, export=1 and w5, w5, #511 movrel x6, X(filter_intra_taps) lsl w5, w5, #6 @@ -1487,10 +1483,10 @@ L(ipred_filter_tbl): .hword L(ipred_filter_tbl) - 40b endfunc -// void pal_pred_neon(pixel *dst, const ptrdiff_t stride, -// const uint16_t *const pal, const uint8_t *idx, -// const int w, const int h); -function pal_pred_neon, export=1 +// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_8bpc_neon, export=1 ld1 {v0.8h}, [x2] clz w9, w4 adr x6, L(pal_pred_tbl) @@ -1578,11 +1574,11 @@ L(pal_pred_tbl): .hword L(pal_pred_tbl) - 4b 
endfunc -// void ipred_cfl_128_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, -// const int16_t *ac, const int alpha); -function ipred_cfl_128_neon, export=1 +// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_128_8bpc_neon, export=1 clz w9, w3 adr x7, L(ipred_cfl_128_tbl) sub w9, w9, #26 @@ -1699,11 +1695,11 @@ L(ipred_cfl_splat_tbl): .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) endfunc -// void ipred_cfl_top_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, -// const int16_t *ac, const int alpha); -function ipred_cfl_top_neon, export=1 +// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_top_8bpc_neon, export=1 clz w9, w3 adr x7, L(ipred_cfl_top_tbl) sub w9, w9, #26 @@ -1717,19 +1713,19 @@ function ipred_cfl_top_neon, export=1 4: ld1r {v0.2s}, [x2] uaddlv h0, v0.8b - urshr v0.8h, v0.8h, #3 + urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w4) 8: ld1 {v0.8b}, [x2] uaddlv h0, v0.8b - urshr v0.8h, v0.8h, #3 + urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w8) 16: ld1 {v0.16b}, [x2] uaddlv h0, v0.16b - urshr v0.8h, v0.8h, #4 + urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) 32: @@ -1737,7 +1733,7 @@ function ipred_cfl_top_neon, export=1 uaddlv h2, v2.16b uaddlv h3, v3.16b add v2.4h, v2.4h, v3.4h - urshr v2.8h, v2.8h, #5 + urshr v2.4h, v2.4h, #5 dup v0.8h, v2.h[0] b L(ipred_cfl_splat_w16) @@ -1748,11 +1744,11 @@ L(ipred_cfl_top_tbl): .hword L(ipred_cfl_top_tbl) - 4b endfunc -// void ipred_cfl_left_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const 
int height, -// const int16_t *ac, const int alpha); -function ipred_cfl_left_neon, export=1 +// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_left_8bpc_neon, export=1 sub x2, x2, w4, uxtw clz w9, w3 clz w8, w4 @@ -1772,21 +1768,21 @@ function ipred_cfl_left_neon, export=1 L(ipred_cfl_left_h4): ld1r {v0.2s}, [x2] uaddlv h0, v0.8b - urshr v0.8h, v0.8h, #3 + urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h8): ld1 {v0.8b}, [x2] uaddlv h0, v0.8b - urshr v0.8h, v0.8h, #3 + urshr v0.4h, v0.4h, #3 dup v0.8h, v0.h[0] br x9 L(ipred_cfl_left_h16): ld1 {v0.16b}, [x2] uaddlv h0, v0.16b - urshr v0.8h, v0.8h, #4 + urshr v0.4h, v0.4h, #4 dup v0.8h, v0.h[0] br x9 @@ -1795,7 +1791,7 @@ L(ipred_cfl_left_h32): uaddlv h2, v2.16b uaddlv h3, v3.16b add v2.4h, v2.4h, v3.4h - urshr v2.8h, v2.8h, #5 + urshr v2.4h, v2.4h, #5 dup v0.8h, v2.h[0] br x9 @@ -1806,11 +1802,11 @@ L(ipred_cfl_left_tbl): .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) endfunc -// void ipred_cfl_neon(pixel *dst, const ptrdiff_t stride, -// const pixel *const topleft, -// const int width, const int height, -// const int16_t *ac, const int alpha); -function ipred_cfl_neon, export=1 +// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha); +function ipred_cfl_8bpc_neon, export=1 sub x2, x2, w4, uxtw add w8, w3, w4 // width + height dup v1.8h, w6 // alpha @@ -1946,15 +1942,19 @@ L(ipred_cfl_tbl): .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) endfunc -// void cfl_ac_420_neon(int16_t *const ac, const pixel *const ypx, -// const ptrdiff_t stride, const int w_pad, -// const int h_pad, const int cw, const int ch); -function ipred_cfl_ac_420_neon, export=1 +// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const 
ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_420_tbl) sub w8, w8, #27 ldrh w8, [x7, w8, uxtw #1] + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) @@ -1962,14 +1962,10 @@ function ipred_cfl_ac_420_neon, export=1 clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz - movi v16.4s, #1 add x10, x1, x2 + dup v31.4s, w9 lsl x2, x2, #1 - dup v17.4s, w9 - sshl v16.4s, v16.4s, v17.4s // 1 << log2sz - neg v17.4s, v17.4s // -log2sz - ushr v16.4s, v16.4s, #1 // 1 << (log2sz - 1) - mov w9, w6 + neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_420_w4): @@ -1984,6 +1980,7 @@ L(ipred_cfl_ac_420_w4): shl v0.8h, v0.8h, #1 subs w8, w8, #2 st1 {v0.8h}, [x0], #16 + add v16.8h, v16.8h, v0.8h b.gt 1b trn2 v1.2d, v0.2d, v0.2d trn2 v0.2d, v0.2d, v0.2d @@ -1992,29 +1989,19 @@ L(ipred_cfl_ac_420_w4_hpad): 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h b.gt 2b 3: - sub x0, x0, w6, uxtw #3 - // Sum the produced ac values - subs w6, w6, #4 - ld1 {v0.8h, v1.8h}, [x0], #32 - b.le 5f -4: - ld1 {v2.8h, v3.8h}, [x0], #32 - subs w6, w6, #4 - add v0.8h, v0.8h, v2.8h - add v1.8h, v1.8h, v3.8h - b.gt 4b -5: - add v0.8h, v0.8h, v1.8h + // Aggregate the sums + add v0.8h, v16.8h, v17.8h uaddlv s0, v0.8h // sum - sub x0, x0, w9, uxtw #3 - add v0.2s, v0.2s, v16.2s // sum += 1 << (log2sz - 1) - ushl v4.2s, v0.2s, v17.2s // sum >>= log2sz + sub x0, x0, w6, uxtw #3 + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] 6: // Subtract dc from ac ld1 {v0.8h, v1.8h}, [x0] - subs w9, w9, #4 + subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h st1 {v0.8h, v1.8h}, [x0], #32 @@ -2038,6 +2025,8 @@ L(ipred_cfl_ac_420_w8): 
shl v1.8h, v2.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h}, [x0], #32 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h b.gt 1b mov v0.16b, v1.16b b L(ipred_cfl_ac_420_w8_hpad) @@ -2057,6 +2046,10 @@ L(ipred_cfl_ac_420_w8_wpad): trn2 v2.2d, v0.2d, v0.2d subs w8, w8, #2 st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + add v16.4h, v16.4h, v0.4h + add v17.4h, v17.4h, v1.4h + add v18.4h, v18.4h, v2.4h + add v19.4h, v19.4h, v3.4h b.gt 1b trn1 v0.2d, v2.2d, v3.2d trn1 v1.2d, v2.2d, v3.2d @@ -2066,38 +2059,28 @@ L(ipred_cfl_ac_420_w8_hpad): 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h}, [x0], #32 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 + add v18.8h, v18.8h, v0.8h + add v19.8h, v19.8h, v1.8h b.gt 2b 3: L(ipred_cfl_ac_420_w8_calc_subtract_dc): - sub x0, x0, w6, uxtw #4 - // Sum the produced ac values - subs w6, w6, #4 - ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 - b.le 5f -4: - ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 - subs w6, w6, #4 - add v0.8h, v0.8h, v4.8h - add v1.8h, v1.8h, v5.8h - add v2.8h, v2.8h, v6.8h - add v3.8h, v3.8h, v7.8h - b.gt 4b -5: - add v0.8h, v0.8h, v1.8h - add v2.8h, v2.8h, v3.8h + // Aggregate the sums + add v0.8h, v16.8h, v17.8h + add v2.8h, v18.8h, v19.8h uaddlp v0.4s, v0.8h uaddlp v2.4s, v2.8h add v0.4s, v0.4s, v2.4s addv s0, v0.4s // sum - sub x0, x0, w9, uxtw #4 - add v0.2s, v0.2s, v16.2s // sum += 1 << (log2sz - 1) - ushl v4.2s, v0.2s, v17.2s // sum >>= log2sz + sub x0, x0, w6, uxtw #4 + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz dup v4.8h, v4.h[0] 6: // Subtract dc from ac ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] - subs w9, w9, #4 + subs w6, w6, #4 sub v0.8h, v0.8h, v4.8h sub v1.8h, v1.8h, v4.8h sub v2.8h, v2.8h, v4.8h @@ -2136,6 +2119,10 @@ L(ipred_cfl_ac_420_w16_wpad0): shl v3.8h, v5.8h, #1 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, 
v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b @@ -2173,6 +2160,10 @@ L(ipred_cfl_ac_420_w16_wpad1): trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b @@ -2196,6 +2187,10 @@ L(ipred_cfl_ac_420_w16_wpad2): dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b @@ -2221,6 +2216,10 @@ L(ipred_cfl_ac_420_w16_wpad3): trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b @@ -2231,7 +2230,15 @@ L(ipred_cfl_ac_420_w16_hpad): 2: // Vertical padding (h_pad > 0) subs w4, w4, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h b.gt 2b 3: @@ -2253,15 +2260,19 @@ L(ipred_cfl_ac_420_w16_tbl): .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) endfunc -// void cfl_ac_422_neon(int16_t *const ac, const pixel *const ypx, -// const ptrdiff_t stride, const int w_pad, -// const int h_pad, const int cw, const int ch); -function ipred_cfl_ac_422_neon, export=1 +// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 adr x7, L(ipred_cfl_ac_422_tbl) sub w8, w8, #27 ldrh 
w8, [x7, w8, uxtw #1] + movi v16.8h, #0 + movi v17.8h, #0 + movi v18.8h, #0 + movi v19.8h, #0 sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) @@ -2269,14 +2280,10 @@ function ipred_cfl_ac_422_neon, export=1 clz w9, w9 // ctz(width) clz w10, w10 // ctz(height) add w9, w9, w10 // log2sz - movi v16.4s, #1 add x10, x1, x2 + dup v31.4s, w9 lsl x2, x2, #1 - dup v17.4s, w9 - sshl v16.4s, v16.4s, v17.4s // 1 << log2sz - neg v17.4s, v17.4s // -log2sz - ushr v16.4s, v16.4s, #1 // 1 << (log2sz - 1) - mov w9, w6 + neg v31.4s, v31.4s // -log2sz br x7 L(ipred_cfl_ac_422_w4): @@ -2290,6 +2297,8 @@ L(ipred_cfl_ac_422_w4): shl v0.8h, v0.8h, #2 shl v1.8h, v1.8h, #2 subs w8, w8, #4 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h st1 {v0.8h, v1.8h}, [x0], #32 b.gt 1b trn2 v0.2d, v1.2d, v1.2d @@ -2313,6 +2322,10 @@ L(ipred_cfl_ac_422_w8): shl v3.8h, v3.8h, #2 subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b @@ -2338,6 +2351,10 @@ L(ipred_cfl_ac_422_w8_wpad): trn1 v2.2d, v2.2d, v6.2d subs w8, w8, #4 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v3.16b mov v1.16b, v3.16b @@ -2363,6 +2380,10 @@ L(ipred_cfl_ac_422_w16_wpad0): shl v3.8h, v3.8h, #2 subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b @@ -2388,6 +2409,10 @@ L(ipred_cfl_ac_422_w16_wpad1): trn1 v3.2d, v3.2d, v5.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b @@ 
-2405,6 +2430,10 @@ L(ipred_cfl_ac_422_w16_wpad2): dup v3.8h, v2.h[7] subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b @@ -2424,6 +2453,10 @@ L(ipred_cfl_ac_422_w16_wpad3): trn1 v2.2d, v2.2d, v3.2d subs w8, w8, #2 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + add v16.8h, v16.8h, v0.8h + add v17.8h, v17.8h, v1.8h + add v18.8h, v18.8h, v2.8h + add v19.8h, v19.8h, v3.8h b.gt 1b mov v0.16b, v2.16b mov v1.16b, v3.16b diff --git a/ffmpeg/JNI/dav1d/src/arm/64/ipred16.S b/ffmpeg/JNI/dav1d/src/arm/64/ipred16.S new file mode 100644 index 000000000..5c139490f --- /dev/null +++ b/ffmpeg/JNI/dav1d/src/arm/64/ipred16.S @@ -0,0 +1,2834 @@ +/* + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2019, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + +// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height, +// const int bitdepth_max); +function ipred_dc_128_16bpc_neon, export=1 + ldr w8, [sp] + clz w3, w3 + adr x5, L(ipred_dc_128_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + dup v0.8h, w8 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + urshr v0.8h, v0.8h, #1 + br x5 +4: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +8: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + mov v1.16b, v0.16b +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + sub x1, x1, #64 +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, 
[x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_128_tbl): + .hword L(ipred_dc_128_tbl) - 640b + .hword L(ipred_dc_128_tbl) - 320b + .hword L(ipred_dc_128_tbl) - 160b + .hword L(ipred_dc_128_tbl) - 8b + .hword L(ipred_dc_128_tbl) - 4b +endfunc + +// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_v_16bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_v_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + add x2, x2, #2 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + ld1 {v0.4h}, [x2] +4: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +80: + ld1 {v0.8h}, [x2] +8: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + ld1 {v0.8h, v1.8h}, [x2] +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] +32: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + sub x1, x1, #64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + st1 {v4.8h, v5.8h, v6.8h, 
v7.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_v_tbl): + .hword L(ipred_v_tbl) - 640b + .hword L(ipred_v_tbl) - 320b + .hword L(ipred_v_tbl) - 160b + .hword L(ipred_v_tbl) - 80b + .hword L(ipred_v_tbl) - 40b +endfunc + +// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_h_16bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_h_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + sub x2, x2, #8 + sub x5, x5, w3, uxtw + mov x7, #-8 + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +4: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + st1 {v3.4h}, [x0], x1 + st1 {v2.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v1.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +8: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +16: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 16b + ret +32: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + stp q3, q3, [x0, #32] + stp q2, q2, [x6, #32] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + stp q1, q1, [x0, #32] + stp q0, q0, [x6, #32] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 32b + ret +64: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 + str q3, [x0, #16] + str q2, [x6, #16] + stp q3, q3, [x0, #32] + stp q2, q2, [x6, #32] + stp q3, q3, [x0, #64] + stp q2, q2, [x6, #64] 
+ stp q3, q3, [x0, #96] + stp q2, q2, [x6, #96] + st1 {v3.8h}, [x0], x1 + st1 {v2.8h}, [x6], x1 + subs w4, w4, #4 + str q1, [x0, #16] + str q0, [x6, #16] + stp q1, q1, [x0, #32] + stp q0, q0, [x6, #32] + stp q1, q1, [x0, #64] + stp q0, q0, [x6, #64] + stp q1, q1, [x0, #96] + stp q0, q0, [x6, #96] + st1 {v1.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_h_tbl): + .hword L(ipred_h_tbl) - 64b + .hword L(ipred_h_tbl) - 32b + .hword L(ipred_h_tbl) - 16b + .hword L(ipred_h_tbl) - 8b + .hword L(ipred_h_tbl) - 4b +endfunc + +// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_top_16bpc_neon, export=1 + clz w3, w3 + adr x5, L(ipred_dc_top_tbl) + sub w3, w3, #25 + ldrh w3, [x5, w3, uxtw #1] + add x2, x2, #2 + sub x5, x5, w3, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.4h, v0.h[0] +4: + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 4b + ret +80: + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] +8: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 8b + ret +160: + ld1 {v0.8h, v1.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addv h0, v0.8h + urshr v2.4h, v0.4h, #4 + dup v0.8h, v2.h[0] + dup v1.8h, v2.h[0] +16: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 16b + ret +320: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #5 + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +32: + st1 {v0.8h, v1.8h, v2.8h, 
v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 32b + ret +640: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #6 + sub x1, x1, #64 + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +64: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 64b + ret + +L(ipred_dc_top_tbl): + .hword L(ipred_dc_top_tbl) - 640b + .hword L(ipred_dc_top_tbl) - 320b + .hword L(ipred_dc_top_tbl) - 160b + .hword L(ipred_dc_top_tbl) - 80b + .hword L(ipred_dc_top_tbl) - 40b +endfunc + +// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_left_16bpc_neon, export=1 + sub x2, x2, w4, uxtw #1 + clz w3, w3 + clz w7, w4 + adr x5, L(ipred_dc_left_tbl) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w7, w7, #25 + ldrh w3, [x5, w3, uxtw #1] + ldrh w7, [x5, w7, uxtw #1] + sub x3, x5, w3, uxtw + sub x5, x5, w7, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 + +L(ipred_dc_left_h4): + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + br x3 +L(ipred_dc_left_w4): + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 
{v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt L(ipred_dc_left_w4) + ret + +L(ipred_dc_left_h8): + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + br x3 +L(ipred_dc_left_w8): + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt L(ipred_dc_left_w8) + ret + +L(ipred_dc_left_h16): + ld1 {v0.8h, v1.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addv h0, v0.8h + urshr v2.4h, v0.4h, #4 + dup v0.8h, v2.h[0] + dup v1.8h, v2.h[0] + br x3 +L(ipred_dc_left_w16): + mov v1.16b, v0.16b +1: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_h32): + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + uaddlp v0.4s, v0.8h + addv s0, v0.4s + rshrn v4.4h, v0.4s, #5 + dup v0.8h, v4.h[0] + br x3 +L(ipred_dc_left_w32): + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b +1: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_h64): + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + uaddlv s0, v0.8h + rshrn v4.4h, v0.4s, #6 + dup v0.8h, v4.h[0] + br x3 +L(ipred_dc_left_w64): + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + sub x1, x1, #64 +1: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, 
[x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 1b + ret + +L(ipred_dc_left_tbl): + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) + .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) +endfunc + +// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_dc_16bpc_neon, export=1 + sub x2, x2, w4, uxtw #1 + add w7, w3, w4 // width + height + clz w3, w3 + clz w6, w4 + dup v16.4s, w7 // width + height + adr x5, L(ipred_dc_tbl) + rbit w7, w7 // rbit(width + height) + sub w3, w3, #20 // 25 leading bits, minus table offset 5 + sub w6, w6, #25 + clz w7, w7 // ctz(width + height) + ldrh w3, [x5, w3, uxtw #1] + ldrh w6, [x5, w6, uxtw #1] + neg w7, w7 // -ctz(width + height) + sub x3, x5, w3, uxtw + sub x5, x5, w6, uxtw + ushr v16.4s, v16.4s, #1 // (width + height) >> 1 + dup v17.4s, w7 // -ctz(width + height) + add x6, x0, x1 + lsl x1, x1, #1 + br x5 + +L(ipred_dc_h4): + ld1 {v0.4h}, [x2], #8 + uaddlv s0, v0.4h + br x3 +L(ipred_dc_w4): + add x2, x2, #2 + ld1 {v1.4h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s1, v1.4h + cmp w4, #4 + add v0.2s, v0.2s, v1.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #16 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.4h, v0.h[0] +2: + st1 {v0.4h}, [x0], x1 + 
st1 {v0.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.4h}, [x0], x1 + st1 {v0.4h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h8): + ld1 {v0.8h}, [x2], #16 + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w8): + add x2, x2, #2 + ld1 {v1.8h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s1, v1.8h + cmp w4, #8 + add v0.2s, v0.2s, v1.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] +2: + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h}, [x0], x1 + st1 {v0.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h16): + ld1 {v0.8h, v1.8h}, [x2], #32 + addp v0.8h, v0.8h, v1.8h + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w16): + add x2, x2, #2 + ld1 {v1.8h, v2.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + uaddlv s1, v1.8h + cmp w4, #16 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/8/32/64 + tst w4, #(32+16+8) // 16 added to make a consecutive bitmask + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], x1 + st1 {v0.8h, v1.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h32): + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v0.8h, v0.8h, v2.8h + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w32): + add x2, x2, #2 + ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + addp v3.8h, v3.8h, v4.8h + addp v1.8h, v1.8h, v3.8h + uaddlv s1, v1.8h + cmp w4, #32 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16/64 + cmp w4, #8 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, 
v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_h64): + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 + addp v0.8h, v0.8h, v1.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + addp v0.8h, v0.8h, v2.8h + addp v4.8h, v4.8h, v6.8h + addp v0.8h, v0.8h, v4.8h + uaddlv s0, v0.8h + br x3 +L(ipred_dc_w64): + add x2, x2, #2 + ld1 {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64 + add v0.2s, v0.2s, v16.2s + addp v1.8h, v1.8h, v2.8h + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x2] + addp v3.8h, v3.8h, v4.8h + addp v20.8h, v20.8h, v21.8h + addp v22.8h, v22.8h, v23.8h + addp v1.8h, v1.8h, v3.8h + addp v20.8h, v20.8h, v22.8h + addp v1.8h, v1.8h, v20.8h + uaddlv s1, v1.8h + cmp w4, #64 + add v0.2s, v0.2s, v1.2s + ushl v4.2s, v0.2s, v17.2s + b.eq 1f + // h = 16/32 + cmp w4, #16 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v4.2s, v4.2s, v16.2s + ushr v4.2s, v4.2s, #17 +1: + sub x1, x1, #64 + dup v0.8h, v4.h[0] + dup v1.8h, v4.h[0] + dup v2.8h, v4.h[0] + dup v3.8h, v4.h[0] +2: + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1 + b.gt 2b + ret + +L(ipred_dc_tbl): + .hword L(ipred_dc_tbl) - L(ipred_dc_h64) + .hword L(ipred_dc_tbl) - L(ipred_dc_h32) + .hword L(ipred_dc_tbl) - L(ipred_dc_h16) + .hword L(ipred_dc_tbl) - L(ipred_dc_h8) + .hword 
L(ipred_dc_tbl) - L(ipred_dc_h4) + .hword L(ipred_dc_tbl) - L(ipred_dc_w64) + .hword L(ipred_dc_tbl) - L(ipred_dc_w32) + .hword L(ipred_dc_tbl) - L(ipred_dc_w16) + .hword L(ipred_dc_tbl) - L(ipred_dc_w8) + .hword L(ipred_dc_tbl) - L(ipred_dc_w4) +endfunc + +// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_paeth_16bpc_neon, export=1 + clz w9, w3 + adr x5, L(ipred_paeth_tbl) + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x2] + add x8, x2, #2 + sub x2, x2, #8 + sub x5, x5, w9, uxtw + mov x7, #-8 + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + ld1r {v5.2d}, [x8] + sub v6.8h, v5.8h, v4.8h // top - topleft +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 + zip1 v0.2d, v0.2d, v1.2d + zip1 v2.2d, v2.2d, v3.2d + add v16.8h, v6.8h, v0.8h // base + add v17.8h, v6.8h, v2.8h + sabd v20.8h, v5.8h, v16.8h // tdiff + sabd v21.8h, v5.8h, v17.8h + sabd v22.8h, v4.8h, v16.8h // tldiff + sabd v23.8h, v4.8h, v17.8h + sabd v16.8h, v0.8h, v16.8h // ldiff + sabd v17.8h, v2.8h, v17.8h + umin v18.8h, v20.8h, v22.8h // min(tdiff, tldiff) + umin v19.8h, v21.8h, v23.8h + cmge v20.8h, v22.8h, v20.8h // tldiff >= tdiff + cmge v21.8h, v23.8h, v21.8h + cmge v16.8h, v18.8h, v16.8h // min(tdiff, tldiff) >= ldiff + cmge v17.8h, v19.8h, v17.8h + bsl v21.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bsl v20.16b, v5.16b, v4.16b + bit v21.16b, v2.16b, v17.16b // ldiff <= min ? left : ... 
+ bit v20.16b, v0.16b, v16.16b + st1 {v21.d}[1], [x0], x1 + st1 {v21.d}[0], [x6], x1 + subs w4, w4, #4 + st1 {v20.d}[1], [x0], x1 + st1 {v20.d}[0], [x6], x1 + b.gt 4b + ret +80: +160: +320: +640: + ld1 {v5.8h}, [x8], #16 + mov w9, w3 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 +1: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 +2: + sub v6.8h, v5.8h, v4.8h // top - topleft + add v16.8h, v6.8h, v0.8h // base + add v17.8h, v6.8h, v1.8h + add v18.8h, v6.8h, v2.8h + add v19.8h, v6.8h, v3.8h + sabd v20.8h, v5.8h, v16.8h // tdiff + sabd v21.8h, v5.8h, v17.8h + sabd v22.8h, v5.8h, v18.8h + sabd v23.8h, v5.8h, v19.8h + sabd v24.8h, v4.8h, v16.8h // tldiff + sabd v25.8h, v4.8h, v17.8h + sabd v26.8h, v4.8h, v18.8h + sabd v27.8h, v4.8h, v19.8h + sabd v16.8h, v0.8h, v16.8h // ldiff + sabd v17.8h, v1.8h, v17.8h + sabd v18.8h, v2.8h, v18.8h + sabd v19.8h, v3.8h, v19.8h + umin v28.8h, v20.8h, v24.8h // min(tdiff, tldiff) + umin v29.8h, v21.8h, v25.8h + umin v30.8h, v22.8h, v26.8h + umin v31.8h, v23.8h, v27.8h + cmge v20.8h, v24.8h, v20.8h // tldiff >= tdiff + cmge v21.8h, v25.8h, v21.8h + cmge v22.8h, v26.8h, v22.8h + cmge v23.8h, v27.8h, v23.8h + cmge v16.8h, v28.8h, v16.8h // min(tdiff, tldiff) >= ldiff + cmge v17.8h, v29.8h, v17.8h + cmge v18.8h, v30.8h, v18.8h + cmge v19.8h, v31.8h, v19.8h + bsl v23.16b, v5.16b, v4.16b // tdiff <= tldiff ? top : topleft + bsl v22.16b, v5.16b, v4.16b + bsl v21.16b, v5.16b, v4.16b + bsl v20.16b, v5.16b, v4.16b + bit v23.16b, v3.16b, v19.16b // ldiff <= min ? left : ... 
+ bit v22.16b, v2.16b, v18.16b + bit v21.16b, v1.16b, v17.16b + bit v20.16b, v0.16b, v16.16b + st1 {v23.8h}, [x0], #16 + st1 {v22.8h}, [x6], #16 + subs w3, w3, #8 + st1 {v21.8h}, [x5], #16 + st1 {v20.8h}, [x10], #16 + b.le 8f + ld1 {v5.8h}, [x8], #16 + b 2b +8: + subs w4, w4, #4 + b.le 9f + // End of horizontal loop, move pointers to next four rows + sub x8, x8, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + // Load the top row as early as possible + ld1 {v5.8h}, [x8], #16 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_paeth_tbl): + .hword L(ipred_paeth_tbl) - 640b + .hword L(ipred_paeth_tbl) - 320b + .hword L(ipred_paeth_tbl) - 160b + .hword L(ipred_paeth_tbl) - 80b + .hword L(ipred_paeth_tbl) - 40b +endfunc + +// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_16bpc_neon, export=1 + movrel x10, X(sm_weights) + add x11, x10, w4, uxtw + add x10, x10, w3, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_tbl) + sub x12, x2, w4, uxtw #1 + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x12] // bottom + add x8, x2, #2 + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + sub x2, x2, #8 + mov x7, #-8 + ld1r {v6.2d}, [x8] // top + ld1r {v7.2s}, [x10] // weights_hor + dup v5.8h, v6.h[3] // right + sub v6.8h, v6.8h, v4.8h // top-bottom + uxtl v7.8h, v7.8b // weights_hor + add v31.4h, v4.4h, v5.4h // bottom+right +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + zip1 v1.2d, v1.2d, v0.2d // left, flipped + zip1 v0.2d, v3.2d, v2.2d + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + sub v0.8h, v0.8h, v5.8h // left-right + 
sub v1.8h, v1.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v18.8h, v18.8b + smlal v20.4s, v0.4h, v7.4h // += (left-right)*weights_hor + smlal2 v21.4s, v0.8h, v7.8h + smlal v22.4s, v1.4h, v7.4h + smlal2 v23.4s, v1.8h, v7.8h + smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v6.8h, v16.8h + smlal v22.4s, v6.4h, v18.4h + smlal2 v23.4s, v6.8h, v18.8h + rshrn v20.4h, v20.4s, #9 + rshrn v21.4h, v21.4s, #9 + rshrn v22.4h, v22.4s, #9 + rshrn v23.4h, v23.4s, #9 + st1 {v20.4h}, [x0], x1 + st1 {v21.4h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.4h}, [x0], x1 + st1 {v23.4h}, [x6], x1 + b.gt 4b + ret +80: + sub x2, x2, #8 + mov x7, #-8 + ld1 {v6.8h}, [x8] // top + ld1 {v7.8b}, [x10] // weights_hor + dup v5.8h, v6.h[7] // right + sub v6.8h, v6.8h, v4.8h // top-bottom + uxtl v7.8h, v7.8b // weights_hor + add v31.4h, v4.4h, v5.4h // bottom+right +8: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x11], #4 // weights_ver + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + ushll v24.4s, v31.4h, #8 + ushll v25.4s, v31.4h, #8 + ushll v26.4s, v31.4h, #8 + ushll v27.4s, v31.4h, #8 + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + sub v2.8h, v2.8h, v5.8h + sub v3.8h, v3.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b + uxtl v18.8h, v18.8b + uxtl v19.8h, v19.8b + smlal v20.4s, v3.4h, v7.4h // += (left-right)*weights_hor + smlal2 v21.4s, v3.8h, v7.8h // (left flipped) + smlal v22.4s, v2.4h, v7.4h + smlal2 v23.4s, v2.8h, v7.8h + smlal v24.4s, v1.4h, v7.4h + smlal2 v25.4s, v1.8h, v7.8h + smlal v26.4s, v0.4h, v7.4h + smlal2 v27.4s, v0.8h, v7.8h + smlal v20.4s, v6.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v6.8h, v16.8h + smlal v22.4s, v6.4h, v17.4h + smlal2 v23.4s, v6.8h, v17.8h + smlal v24.4s, v6.4h, v18.4h + smlal2 v25.4s, v6.8h, v18.8h + smlal v26.4s, v6.4h, v19.4h + smlal2 
v27.4s, v6.8h, v19.8h + rshrn v20.4h, v20.4s, #9 + rshrn2 v20.8h, v21.4s, #9 + rshrn v21.4h, v22.4s, #9 + rshrn2 v21.8h, v23.4s, #9 + rshrn v22.4h, v24.4s, #9 + rshrn2 v22.8h, v25.4s, #9 + rshrn v23.4h, v26.4s, #9 + rshrn2 v23.8h, v27.4s, #9 + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + add x12, x2, w3, uxtw #1 + sub x1, x1, w3, uxtw #1 + ld1r {v5.8h}, [x12] // right + sub x2, x2, #4 + mov x7, #-4 + mov w9, w3 + add v31.4h, v4.4h, v5.4h // bottom+right + +1: + ld2r {v0.8h, v1.8h}, [x2], x7 // left + ld2r {v16.8b, v17.8b}, [x11], #2 // weights_ver + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + uxtl v16.8h, v16.8b // weights_ver + uxtl v17.8h, v17.8b +2: + ld1 {v7.16b}, [x10], #16 // weights_hor + ld1 {v2.8h, v3.8h}, [x8], #32 // top + ushll v20.4s, v31.4h, #8 // (bottom+right)*256 + ushll v21.4s, v31.4h, #8 + ushll v22.4s, v31.4h, #8 + ushll v23.4s, v31.4h, #8 + ushll v24.4s, v31.4h, #8 + ushll v25.4s, v31.4h, #8 + ushll v26.4s, v31.4h, #8 + ushll v27.4s, v31.4h, #8 + uxtl v6.8h, v7.8b // weights_hor + uxtl2 v7.8h, v7.16b + sub v2.8h, v2.8h, v4.8h // top-bottom + sub v3.8h, v3.8h, v4.8h + smlal v20.4s, v1.4h, v6.4h // += (left-right)*weights_hor + smlal2 v21.4s, v1.8h, v6.8h // (left flipped) + smlal v22.4s, v1.4h, v7.4h + smlal2 v23.4s, v1.8h, v7.8h + smlal v24.4s, v0.4h, v6.4h + smlal2 v25.4s, v0.8h, v6.8h + smlal v26.4s, v0.4h, v7.4h + smlal2 v27.4s, v0.8h, v7.8h + smlal v20.4s, v2.4h, v16.4h // += (top-bottom)*weights_ver + smlal2 v21.4s, v2.8h, v16.8h + smlal v22.4s, v3.4h, v16.4h + smlal2 v23.4s, v3.8h, v16.8h + smlal v24.4s, v2.4h, v17.4h + smlal2 v25.4s, v2.8h, v17.8h + smlal v26.4s, v3.4h, v17.4h + smlal2 v27.4s, v3.8h, v17.8h + rshrn v20.4h, v20.4s, #9 + rshrn2 v20.8h, v21.4s, #9 + rshrn v21.4h, v22.4s, #9 + rshrn2 v21.8h, v23.4s, #9 + rshrn v22.4h, v24.4s, #9 + rshrn2 v22.8h, v25.4s, #9 + rshrn v23.4h, v26.4s, #9 + rshrn2 
v23.8h, v27.4s, #9 + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + b.gt 2b + subs w4, w4, #2 + b.le 9f + sub x8, x8, w9, uxtw #1 + sub x10, x10, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_tbl): + .hword L(ipred_smooth_tbl) - 640b + .hword L(ipred_smooth_tbl) - 320b + .hword L(ipred_smooth_tbl) - 160b + .hword L(ipred_smooth_tbl) - 80b + .hword L(ipred_smooth_tbl) - 40b +endfunc + +// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// const int max_width, const int max_height); +function ipred_smooth_v_16bpc_neon, export=1 + movrel x7, X(sm_weights) + add x7, x7, w4, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_v_tbl) + sub x8, x2, w4, uxtw #1 + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v4.8h}, [x8] // bottom + add x2, x2, #2 + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + ld1r {v6.2d}, [x2] // top + sub v6.8h, v6.8h, v4.8h // top-bottom +4: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + zip1 v16.2s, v16.2s, v17.2s // weights_ver + zip1 v18.2s, v18.2s, v19.2s + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v18.8h, v18.8b, #7 + sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v6.8h, v18.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + st1 {v20.d}[0], [x0], x1 + st1 {v20.d}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.d}[0], [x0], x1 + st1 {v21.d}[1], [x6], x1 + b.gt 4b + ret +80: + ld1 {v6.8h}, [x2] // top + sub v6.8h, v6.8h, v4.8h // top-bottom +8: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v17.8h, v17.8b, #7 + ushll v18.8h, v18.8b, #7 + ushll v19.8h, v19.8b, #7 + sqrdmulh v20.8h, v6.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v6.8h, v17.8h + sqrdmulh v22.8h, v6.8h, 
v18.8h + sqrdmulh v23.8h, v6.8h, v19.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + add v22.8h, v22.8h, v4.8h + add v23.8h, v23.8h, v4.8h + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + // Set up pointers for four rows in parallel; x0, x6, x5, x8 + add x5, x0, x1 + add x8, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld4r {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver + ushll v16.8h, v16.8b, #7 // weights_ver << 7 + ushll v17.8h, v17.8b, #7 + ushll v18.8h, v18.8b, #7 + ushll v19.8h, v19.8b, #7 +2: + ld1 {v2.8h, v3.8h}, [x2], #32 // top + sub v2.8h, v2.8h, v4.8h // top-bottom + sub v3.8h, v3.8h, v4.8h + sqrdmulh v20.8h, v2.8h, v16.8h // ((top-bottom)*weights_ver + 128) >> 8 + sqrdmulh v21.8h, v3.8h, v16.8h + sqrdmulh v22.8h, v2.8h, v17.8h + sqrdmulh v23.8h, v3.8h, v17.8h + sqrdmulh v24.8h, v2.8h, v18.8h + sqrdmulh v25.8h, v3.8h, v18.8h + sqrdmulh v26.8h, v2.8h, v19.8h + sqrdmulh v27.8h, v3.8h, v19.8h + add v20.8h, v20.8h, v4.8h + add v21.8h, v21.8h, v4.8h + add v22.8h, v22.8h, v4.8h + add v23.8h, v23.8h, v4.8h + add v24.8h, v24.8h, v4.8h + add v25.8h, v25.8h, v4.8h + add v26.8h, v26.8h, v4.8h + add v27.8h, v27.8h, v4.8h + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + st1 {v24.8h, v25.8h}, [x5], #32 + st1 {v26.8h, v27.8h}, [x8], #32 + b.gt 2b + subs w4, w4, #4 + b.le 9f + sub x2, x2, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x8, x8, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_v_tbl): + .hword L(ipred_smooth_v_tbl) - 640b + .hword L(ipred_smooth_v_tbl) - 320b + .hword L(ipred_smooth_v_tbl) - 160b + .hword L(ipred_smooth_v_tbl) - 80b + .hword L(ipred_smooth_v_tbl) - 40b +endfunc + +// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int a, +// 
const int max_width, const int max_height); +function ipred_smooth_h_16bpc_neon, export=1 + movrel x8, X(sm_weights) + add x8, x8, w3, uxtw + clz w9, w3 + adr x5, L(ipred_smooth_h_tbl) + add x12, x2, w3, uxtw #1 + sub w9, w9, #25 + ldrh w9, [x5, w9, uxtw #1] + ld1r {v5.8h}, [x12] // right + sub x5, x5, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + br x5 +40: + ld1r {v7.2s}, [x8] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + ushll v7.8h, v7.8b, #7 // weights_hor << 7 +4: + ld4r {v0.4h, v1.4h, v2.4h, v3.4h}, [x2], x7 // left + zip1 v1.2d, v1.2d, v0.2d // left, flipped + zip1 v0.2d, v3.2d, v2.2d + sub v0.8h, v0.8h, v5.8h // left-right + sub v1.8h, v1.8h, v5.8h + sqrdmulh v20.8h, v0.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v1.8h, v7.8h + add v20.8h, v20.8h, v5.8h + add v21.8h, v21.8h, v5.8h + st1 {v20.d}[0], [x0], x1 + st1 {v20.d}[1], [x6], x1 + subs w4, w4, #4 + st1 {v21.d}[0], [x0], x1 + st1 {v21.d}[1], [x6], x1 + b.gt 4b + ret +80: + ld1 {v7.8b}, [x8] // weights_hor + sub x2, x2, #8 + mov x7, #-8 + ushll v7.8h, v7.8b, #7 // weights_hor << 7 +8: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + sub v3.8h, v3.8h, v5.8h // left-right + sub v2.8h, v2.8h, v5.8h + sub v1.8h, v1.8h, v5.8h + sub v0.8h, v0.8h, v5.8h + sqrdmulh v20.8h, v3.8h, v7.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v2.8h, v7.8h // (left flipped) + sqrdmulh v22.8h, v1.8h, v7.8h + sqrdmulh v23.8h, v0.8h, v7.8h + add v20.8h, v20.8h, v5.8h + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v5.8h + add v23.8h, v23.8h, v5.8h + st1 {v20.8h}, [x0], x1 + st1 {v21.8h}, [x6], x1 + subs w4, w4, #4 + st1 {v22.8h}, [x0], x1 + st1 {v23.8h}, [x6], x1 + b.gt 8b + ret +160: +320: +640: + sub x2, x2, #8 + mov x7, #-8 + // Set up pointers for four rows in parallel; x0, x6, x5, x10 + add x5, x0, x1 + add x10, x6, x1 + lsl x1, x1, #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld4r {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], x7 // left + sub v0.8h, v0.8h, v5.8h // left-right 
+ sub v1.8h, v1.8h, v5.8h + sub v2.8h, v2.8h, v5.8h + sub v3.8h, v3.8h, v5.8h +2: + ld1 {v7.16b}, [x8], #16 // weights_hor + ushll v6.8h, v7.8b, #7 // weights_hor << 7 + ushll2 v7.8h, v7.16b, #7 + sqrdmulh v20.8h, v3.8h, v6.8h // ((left-right)*weights_hor + 128) >> 8 + sqrdmulh v21.8h, v3.8h, v7.8h // (left flipped) + sqrdmulh v22.8h, v2.8h, v6.8h + sqrdmulh v23.8h, v2.8h, v7.8h + sqrdmulh v24.8h, v1.8h, v6.8h + sqrdmulh v25.8h, v1.8h, v7.8h + sqrdmulh v26.8h, v0.8h, v6.8h + sqrdmulh v27.8h, v0.8h, v7.8h + add v20.8h, v20.8h, v5.8h + add v21.8h, v21.8h, v5.8h + add v22.8h, v22.8h, v5.8h + add v23.8h, v23.8h, v5.8h + add v24.8h, v24.8h, v5.8h + add v25.8h, v25.8h, v5.8h + add v26.8h, v26.8h, v5.8h + add v27.8h, v27.8h, v5.8h + subs w3, w3, #16 + st1 {v20.8h, v21.8h}, [x0], #32 + st1 {v22.8h, v23.8h}, [x6], #32 + st1 {v24.8h, v25.8h}, [x5], #32 + st1 {v26.8h, v27.8h}, [x10], #32 + b.gt 2b + subs w4, w4, #4 + b.le 9f + sub x8, x8, w9, uxtw + add x0, x0, x1 + add x6, x6, x1 + add x5, x5, x1 + add x10, x10, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_smooth_h_tbl): + .hword L(ipred_smooth_h_tbl) - 640b + .hword L(ipred_smooth_h_tbl) - 320b + .hword L(ipred_smooth_h_tbl) - 160b + .hword L(ipred_smooth_h_tbl) - 80b + .hword L(ipred_smooth_h_tbl) - 40b +endfunc + +// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, const int filt_idx, +// const int max_width, const int max_height, +// const int bitdepth_max); +.macro filter_fn bpc +function ipred_filter_\bpc\()bpc_neon + and w5, w5, #511 + movrel x6, X(filter_intra_taps) + lsl w5, w5, #6 + add x6, x6, w5, uxtw + ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 + clz w9, w3 + adr x5, L(ipred_filter\bpc\()_tbl) + ld1 {v20.8b, v21.8b, v22.8b}, [x6] + sub w9, w9, #26 + ldrh w9, [x5, w9, uxtw #1] + sxtl v16.8h, v16.8b + sxtl v17.8h, v17.8b + sub x5, x5, w9, uxtw + sxtl v18.8h, v18.8b + sxtl v19.8h, v19.8b + add x6, x0, x1 + lsl x1, x1, #1 + 
sxtl v20.8h, v20.8b + sxtl v21.8h, v21.8b + sxtl v22.8h, v22.8b + dup v31.8h, w8 + movi v30.8h, #0 + br x5 +40: + ldur d0, [x2, #2] // top (0-3) + sub x2, x2, #4 + mov x7, #-4 +4: + ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) +.if \bpc == 10 + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + srshr v2.8h, v2.8h, #4 + smax v2.8h, v2.8h, v30.8h +.else + smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) + smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) + smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) + smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) + smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) + smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) + smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) + smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) + smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + sqrshrun v2.4h, v2.4s, #4 + sqrshrun2 v2.8h, v3.4s, #4 +.endif + smin v2.8h, v2.8h, v31.8h + subs w4, w4, #2 + st1 {v2.d}[0], [x0], x1 + uxtl v0.8h, v2.8b + ext v0.16b, v2.16b, v2.16b, #8 // move top from [4-7] to [0-3] + st1 {v2.d}[1], [x6], x1 + b.gt 4b + ret +80: + ldur q0, [x2, #2] // top (0-7) + sub x2, x2, #4 + mov x7, #-4 +8: + ld1 {v1.4h}, [x2], x7 // left (0-1) + topleft (2) +.if \bpc == 10 + mul v2.8h, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + mla v2.8h, v18.8h, v0.h[1] // 
p2(top[1]) * filter(2) + mla v2.8h, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + mla v2.8h, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + mla v2.8h, v16.8h, v1.h[2] // p0(topleft) * filter(0) + mla v2.8h, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + mla v2.8h, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + mul v3.8h, v17.8h, v0.h[4] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v0.h[5] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v0.h[6] // p3(top[2]) * filter(3) + srshr v2.8h, v2.8h, #4 + smax v2.8h, v2.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + mla v3.8h, v20.8h, v0.h[7] // p4(top[3]) * filter(4) + mla v3.8h, v16.8h, v0.h[3] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v2.h[3] // p5(left[0]) * filter(5) + mla v3.8h, v22.8h, v2.h[7] // p6(left[1]) * filter(6) + srshr v3.8h, v3.8h, #4 + smax v3.8h, v3.8h, v30.8h +.else + smull v2.4s, v17.4h, v0.h[0] // p1(top[0]) * filter(1) + smlal v2.4s, v18.4h, v0.h[1] // p2(top[1]) * filter(2) + smlal v2.4s, v19.4h, v0.h[2] // p3(top[2]) * filter(3) + smlal v2.4s, v20.4h, v0.h[3] // p4(top[3]) * filter(4) + smlal v2.4s, v16.4h, v1.h[2] // p0(topleft) * filter(0) + smlal v2.4s, v21.4h, v1.h[1] // p5(left[0]) * filter(5) + smlal v2.4s, v22.4h, v1.h[0] // p6(left[1]) * filter(6) + smull2 v3.4s, v17.8h, v0.h[0] // p1(top[0]) * filter(1) + smlal2 v3.4s, v18.8h, v0.h[1] // p2(top[1]) * filter(2) + smlal2 v3.4s, v19.8h, v0.h[2] // p3(top[2]) * filter(3) + smlal2 v3.4s, v20.8h, v0.h[3] // p4(top[3]) * filter(4) + smlal2 v3.4s, v16.8h, v1.h[2] // p0(topleft) * filter(0) + smlal2 v3.4s, v21.8h, v1.h[1] // p5(left[0]) * filter(5) + smlal2 v3.4s, v22.8h, v1.h[0] // p6(left[1]) * filter(6) + smull v4.4s, v17.4h, v0.h[4] // p1(top[0]) * filter(1) + smlal v4.4s, v18.4h, v0.h[5] // p2(top[1]) * filter(2) + smlal v4.4s, v19.4h, v0.h[6] // p3(top[2]) * filter(3) + sqrshrun v2.4h, v2.4s, #4 + sqrshrun2 v2.8h, v3.4s, #4 + smin v2.8h, v2.8h, v31.8h + smlal v4.4s, v20.4h, v0.h[7] // p4(top[3]) * filter(4) + smlal v4.4s, v16.4h, v0.h[3] // p0(topleft) 
* filter(0) + smlal v4.4s, v21.4h, v2.h[3] // p5(left[0]) * filter(5) + smlal v4.4s, v22.4h, v2.h[7] // p6(left[1]) * filter(6) + smull2 v5.4s, v17.8h, v0.h[4] // p1(top[0]) * filter(1) + smlal2 v5.4s, v18.8h, v0.h[5] // p2(top[1]) * filter(2) + smlal2 v5.4s, v19.8h, v0.h[6] // p3(top[2]) * filter(3) + smlal2 v5.4s, v20.8h, v0.h[7] // p4(top[3]) * filter(4) + smlal2 v5.4s, v16.8h, v0.h[3] // p0(topleft) * filter(0) + smlal2 v5.4s, v21.8h, v2.h[3] // p5(left[0]) * filter(5) + smlal2 v5.4s, v22.8h, v2.h[7] // p6(left[1]) * filter(6) + sqrshrun v3.4h, v4.4s, #4 + sqrshrun2 v3.8h, v5.4s, #4 +.endif + smin v3.8h, v3.8h, v31.8h + subs w4, w4, #2 + st2 {v2.d, v3.d}[0], [x0], x1 + zip2 v0.2d, v2.2d, v3.2d + st2 {v2.d, v3.d}[1], [x6], x1 + b.gt 8b + ret +160: +320: + add x8, x2, #2 + sub x2, x2, #4 + mov x7, #-4 + sub x1, x1, w3, uxtw #1 + mov w9, w3 + +1: + ld1 {v0.4h}, [x2], x7 // left (0-1) + topleft (2) +2: + ld1 {v1.8h, v2.8h}, [x8], #32 // top(0-15) +.if \bpc == 10 + mul v3.8h, v16.8h, v0.h[2] // p0(topleft) * filter(0) + mla v3.8h, v21.8h, v0.h[1] // p5(left[0]) * filter(5) + mla v3.8h, v22.8h, v0.h[0] // p6(left[1]) * filter(6) + mla v3.8h, v17.8h, v1.h[0] // p1(top[0]) * filter(1) + mla v3.8h, v18.8h, v1.h[1] // p2(top[1]) * filter(2) + mla v3.8h, v19.8h, v1.h[2] // p3(top[2]) * filter(3) + mla v3.8h, v20.8h, v1.h[3] // p4(top[3]) * filter(4) + + mul v4.8h, v17.8h, v1.h[4] // p1(top[0]) * filter(1) + mla v4.8h, v18.8h, v1.h[5] // p2(top[1]) * filter(2) + mla v4.8h, v19.8h, v1.h[6] // p3(top[2]) * filter(3) + srshr v3.8h, v3.8h, #4 + smax v3.8h, v3.8h, v30.8h + smin v3.8h, v3.8h, v31.8h + mla v4.8h, v20.8h, v1.h[7] // p4(top[3]) * filter(4) + mla v4.8h, v16.8h, v1.h[3] // p0(topleft) * filter(0) + mla v4.8h, v21.8h, v3.h[3] // p5(left[0]) * filter(5) + mla v4.8h, v22.8h, v3.h[7] // p6(left[1]) * filter(6) + + mul v5.8h, v17.8h, v2.h[0] // p1(top[0]) * filter(1) + mla v5.8h, v18.8h, v2.h[1] // p2(top[1]) * filter(2) + mla v5.8h, v19.8h, v2.h[2] // p3(top[2]) * 
filter(3) + srshr v4.8h, v4.8h, #4 + smax v4.8h, v4.8h, v30.8h + smin v4.8h, v4.8h, v31.8h + mla v5.8h, v20.8h, v2.h[3] // p4(top[3]) * filter(4) + mla v5.8h, v16.8h, v1.h[7] // p0(topleft) * filter(0) + mla v5.8h, v21.8h, v4.h[3] // p5(left[0]) * filter(5) + mla v5.8h, v22.8h, v4.h[7] // p6(left[1]) * filter(6) + + mul v6.8h, v17.8h, v2.h[4] // p1(top[0]) * filter(1) + mla v6.8h, v18.8h, v2.h[5] // p2(top[1]) * filter(2) + mla v6.8h, v19.8h, v2.h[6] // p3(top[2]) * filter(3) + srshr v5.8h, v5.8h, #4 + smax v5.8h, v5.8h, v30.8h + smin v5.8h, v5.8h, v31.8h + mla v6.8h, v20.8h, v2.h[7] // p4(top[3]) * filter(4) + mla v6.8h, v16.8h, v2.h[3] // p0(topleft) * filter(0) + mla v6.8h, v21.8h, v5.h[3] // p5(left[0]) * filter(5) + mla v6.8h, v22.8h, v5.h[7] // p6(left[1]) * filter(6) + + subs w3, w3, #16 + srshr v6.8h, v6.8h, #4 + smax v6.8h, v6.8h, v30.8h +.else + smull v3.4s, v16.4h, v0.h[2] // p0(topleft) * filter(0) + smlal v3.4s, v21.4h, v0.h[1] // p5(left[0]) * filter(5) + smlal v3.4s, v22.4h, v0.h[0] // p6(left[1]) * filter(6) + smlal v3.4s, v17.4h, v1.h[0] // p1(top[0]) * filter(1) + smlal v3.4s, v18.4h, v1.h[1] // p2(top[1]) * filter(2) + smlal v3.4s, v19.4h, v1.h[2] // p3(top[2]) * filter(3) + smlal v3.4s, v20.4h, v1.h[3] // p4(top[3]) * filter(4) + smull2 v4.4s, v16.8h, v0.h[2] // p0(topleft) * filter(0) + smlal2 v4.4s, v21.8h, v0.h[1] // p5(left[0]) * filter(5) + smlal2 v4.4s, v22.8h, v0.h[0] // p6(left[1]) * filter(6) + smlal2 v4.4s, v17.8h, v1.h[0] // p1(top[0]) * filter(1) + smlal2 v4.4s, v18.8h, v1.h[1] // p2(top[1]) * filter(2) + smlal2 v4.4s, v19.8h, v1.h[2] // p3(top[2]) * filter(3) + smlal2 v4.4s, v20.8h, v1.h[3] // p4(top[3]) * filter(4) + + smull v5.4s, v17.4h, v1.h[4] // p1(top[0]) * filter(1) + smlal v5.4s, v18.4h, v1.h[5] // p2(top[1]) * filter(2) + smlal v5.4s, v19.4h, v1.h[6] // p3(top[2]) * filter(3) + sqrshrun v3.4h, v3.4s, #4 + sqrshrun2 v3.8h, v4.4s, #4 + smin v3.8h, v3.8h, v31.8h + smlal v5.4s, v20.4h, v1.h[7] // p4(top[3]) * filter(4) + smlal 
v5.4s, v16.4h, v1.h[3] // p0(topleft) * filter(0) + smlal v5.4s, v21.4h, v3.h[3] // p5(left[0]) * filter(5) + smlal v5.4s, v22.4h, v3.h[7] // p6(left[1]) * filter(6) + smull2 v6.4s, v17.8h, v1.h[4] // p1(top[0]) * filter(1) + smlal2 v6.4s, v18.8h, v1.h[5] // p2(top[1]) * filter(2) + smlal2 v6.4s, v19.8h, v1.h[6] // p3(top[2]) * filter(3) + smlal2 v6.4s, v20.8h, v1.h[7] // p4(top[3]) * filter(4) + smlal2 v6.4s, v16.8h, v1.h[3] // p0(topleft) * filter(0) + smlal2 v6.4s, v21.8h, v3.h[3] // p5(left[0]) * filter(5) + smlal2 v6.4s, v22.8h, v3.h[7] // p6(left[1]) * filter(6) + + smull v24.4s, v17.4h, v2.h[0] // p1(top[0]) * filter(1) + smlal v24.4s, v18.4h, v2.h[1] // p2(top[1]) * filter(2) + smlal v24.4s, v19.4h, v2.h[2] // p3(top[2]) * filter(3) + sqrshrun v4.4h, v5.4s, #4 + sqrshrun2 v4.8h, v6.4s, #4 + smin v4.8h, v4.8h, v31.8h + smlal v24.4s, v20.4h, v2.h[3] // p4(top[3]) * filter(4) + smlal v24.4s, v16.4h, v1.h[7] // p0(topleft) * filter(0) + smlal v24.4s, v21.4h, v4.h[3] // p5(left[0]) * filter(5) + smlal v24.4s, v22.4h, v4.h[7] // p6(left[1]) * filter(6) + smull2 v25.4s, v17.8h, v2.h[0] // p1(top[0]) * filter(1) + smlal2 v25.4s, v18.8h, v2.h[1] // p2(top[1]) * filter(2) + smlal2 v25.4s, v19.8h, v2.h[2] // p3(top[2]) * filter(3) + smlal2 v25.4s, v20.8h, v2.h[3] // p4(top[3]) * filter(4) + smlal2 v25.4s, v16.8h, v1.h[7] // p0(topleft) * filter(0) + smlal2 v25.4s, v21.8h, v4.h[3] // p5(left[0]) * filter(5) + smlal2 v25.4s, v22.8h, v4.h[7] // p6(left[1]) * filter(6) + + smull v26.4s, v17.4h, v2.h[4] // p1(top[0]) * filter(1) + smlal v26.4s, v18.4h, v2.h[5] // p2(top[1]) * filter(2) + smlal v26.4s, v19.4h, v2.h[6] // p3(top[2]) * filter(3) + sqrshrun v5.4h, v24.4s, #4 + sqrshrun2 v5.8h, v25.4s, #4 + smin v5.8h, v5.8h, v31.8h + smlal v26.4s, v20.4h, v2.h[7] // p4(top[3]) * filter(4) + smlal v26.4s, v16.4h, v2.h[3] // p0(topleft) * filter(0) + smlal v26.4s, v21.4h, v5.h[3] // p5(left[0]) * filter(5) + smlal v26.4s, v22.4h, v5.h[7] // p6(left[1]) * filter(6) + smull2 
v27.4s, v17.8h, v2.h[4] // p1(top[0]) * filter(1) + smlal2 v27.4s, v18.8h, v2.h[5] // p2(top[1]) * filter(2) + smlal2 v27.4s, v19.8h, v2.h[6] // p3(top[2]) * filter(3) + smlal2 v27.4s, v20.8h, v2.h[7] // p4(top[3]) * filter(4) + smlal2 v27.4s, v16.8h, v2.h[3] // p0(topleft) * filter(0) + smlal2 v27.4s, v21.8h, v5.h[3] // p5(left[0]) * filter(5) + smlal2 v27.4s, v22.8h, v5.h[7] // p6(left[1]) * filter(6) + + subs w3, w3, #16 + sqrshrun v6.4h, v26.4s, #4 + sqrshrun2 v6.8h, v27.4s, #4 +.endif + smin v6.8h, v6.8h, v31.8h + + ins v0.h[2], v2.h[7] + st4 {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32 + ins v0.h[0], v6.h[7] + st4 {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32 + ins v0.h[1], v6.h[3] + b.gt 2b + subs w4, w4, #2 + b.le 9f + sub x8, x6, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b 1b +9: + ret + +L(ipred_filter\bpc\()_tbl): + .hword L(ipred_filter\bpc\()_tbl) - 320b + .hword L(ipred_filter\bpc\()_tbl) - 160b + .hword L(ipred_filter\bpc\()_tbl) - 80b + .hword L(ipred_filter\bpc\()_tbl) - 40b +endfunc +.endm + +filter_fn 10 +filter_fn 12 + +function ipred_filter_16bpc_neon, export=1 + ldr w8, [sp] + cmp w8, 0x3ff + b.le ipred_filter_10bpc_neon + b ipred_filter_12bpc_neon +endfunc + +// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const uint16_t *const pal, const uint8_t *idx, +// const int w, const int h); +function pal_pred_16bpc_neon, export=1 + ld1 {v30.8h}, [x2] + clz w9, w4 + adr x6, L(pal_pred_tbl) + sub w9, w9, #25 + ldrh w9, [x6, w9, uxtw #1] + movi v31.8h, #1, lsl #8 + sub x6, x6, w9, uxtw + br x6 +40: + add x2, x0, x1 + lsl x1, x1, #1 +4: + ld1 {v1.16b}, [x3], #16 + subs w5, w5, #4 + // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ... 
+ add v1.16b, v1.16b, v1.16b + zip1 v0.16b, v1.16b, v1.16b + zip2 v1.16b, v1.16b, v1.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + st1 {v0.d}[0], [x0], x1 + tbl v1.16b, {v30.16b}, v1.16b + st1 {v0.d}[1], [x2], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x2], x1 + b.gt 4b + ret +80: + add x2, x0, x1 + lsl x1, x1, #1 +8: + ld1 {v2.16b, v3.16b}, [x3], #32 + subs w5, w5, #4 + add v2.16b, v2.16b, v2.16b + add v3.16b, v3.16b, v3.16b + zip1 v0.16b, v2.16b, v2.16b + zip2 v1.16b, v2.16b, v2.16b + zip1 v2.16b, v3.16b, v3.16b + zip2 v3.16b, v3.16b, v3.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + tbl v1.16b, {v30.16b}, v1.16b + st1 {v0.8h}, [x0], x1 + tbl v2.16b, {v30.16b}, v2.16b + st1 {v1.8h}, [x2], x1 + tbl v3.16b, {v30.16b}, v3.16b + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x2], x1 + b.gt 8b + ret +160: + add x2, x0, x1 + lsl x1, x1, #1 +16: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #4 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h}, [x0], x1 + tbl v6.16b, {v30.16b}, v6.16b + st1 {v2.8h, v3.8h}, [x2], x1 + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h}, [x0], x1 + st1 {v6.8h, 
v7.8h}, [x2], x1 + b.gt 16b + ret +320: + add x2, x0, x1 + lsl x1, x1, #1 +32: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #2 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + tbl v6.16b, {v30.16b}, v6.16b + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 + b.gt 32b + ret +640: + add x2, x0, #64 +64: + ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 + subs w5, w5, #1 + add v4.16b, v4.16b, v4.16b + add v5.16b, v5.16b, v5.16b + add v6.16b, v6.16b, v6.16b + add v7.16b, v7.16b, v7.16b + zip1 v0.16b, v4.16b, v4.16b + zip2 v1.16b, v4.16b, v4.16b + zip1 v2.16b, v5.16b, v5.16b + zip2 v3.16b, v5.16b, v5.16b + zip1 v4.16b, v6.16b, v6.16b + zip2 v5.16b, v6.16b, v6.16b + zip1 v6.16b, v7.16b, v7.16b + zip2 v7.16b, v7.16b, v7.16b + add v0.8h, v0.8h, v31.8h + add v1.8h, v1.8h, v31.8h + add v2.8h, v2.8h, v31.8h + add v3.8h, v3.8h, v31.8h + add v4.8h, v4.8h, v31.8h + tbl v0.16b, {v30.16b}, v0.16b + add v5.8h, v5.8h, v31.8h + tbl v1.16b, {v30.16b}, v1.16b + add v6.8h, v6.8h, v31.8h + tbl v2.16b, {v30.16b}, v2.16b + add v7.8h, v7.8h, v31.8h + tbl v3.16b, {v30.16b}, v3.16b + tbl v4.16b, {v30.16b}, v4.16b + tbl v5.16b, {v30.16b}, v5.16b + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + tbl 
v6.16b, {v30.16b}, v6.16b + tbl v7.16b, {v30.16b}, v7.16b + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 + b.gt 64b + ret + +L(pal_pred_tbl): + .hword L(pal_pred_tbl) - 640b + .hword L(pal_pred_tbl) - 320b + .hword L(pal_pred_tbl) - 160b + .hword L(pal_pred_tbl) - 80b + .hword L(pal_pred_tbl) - 40b +endfunc + +// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_128_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + clz w9, w3 + adr x7, L(ipred_cfl_128_tbl) + sub w9, w9, #26 + ldrh w9, [x7, w9, uxtw #1] + urshr v0.8h, v31.8h, #1 + dup v1.8h, w6 // alpha + sub x7, x7, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 +L(ipred_cfl_splat_w4): + ld1 {v4.8h, v5.8h}, [x5], #32 + subs w4, w4, #4 + smull v2.4s, v4.4h, v1.4h // diff = ac * alpha + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + sshr v16.4s, v2.4s, #31 // sign = diff >> 31 + sshr v17.4s, v3.4s, #31 + sshr v18.4s, v4.4s, #31 + sshr v19.4s, v5.4s, #31 + add v2.4s, v2.4s, v16.4s // diff + sign + add v3.4s, v3.4s, v17.4s + add v4.4s, v4.4s, v18.4s + add v5.4s, v5.4s, v19.4s + rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v2.d}[0], [x0], x1 + st1 {v2.d}[1], [x6], x1 + st1 {v3.d}[0], [x0], x1 + st1 {v3.d}[1], [x6], x1 + b.gt L(ipred_cfl_splat_w4) + ret +L(ipred_cfl_splat_w8): + ld1 {v4.8h, v5.8h}, [x5], #32 + subs w4, w4, #2 + smull v2.4s, v4.4h, v1.4h // diff = ac * alpha + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + sshr v16.4s, v2.4s, #31 // sign = diff >> 
31 + sshr v17.4s, v3.4s, #31 + sshr v18.4s, v4.4s, #31 + sshr v19.4s, v5.4s, #31 + add v2.4s, v2.4s, v16.4s // diff + sign + add v3.4s, v3.4s, v17.4s + add v4.4s, v4.4s, v18.4s + add v5.4s, v5.4s, v19.4s + rshrn v2.4h, v2.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v2.8h, v3.4s, #6 + rshrn v3.4h, v4.4s, #6 + rshrn2 v3.8h, v5.4s, #6 + add v2.8h, v2.8h, v0.8h // dc + apply_sign() + add v3.8h, v3.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x6], x1 + b.gt L(ipred_cfl_splat_w8) + ret +L(ipred_cfl_splat_w16): + add x7, x5, w3, uxtw #1 + sub x1, x1, w3, uxtw #1 + mov w9, w3 +1: + ld1 {v2.8h, v3.8h}, [x5], #32 + ld1 {v4.8h, v5.8h}, [x7], #32 + subs w3, w3, #16 + smull v16.4s, v2.4h, v1.4h // diff = ac * alpha + smull2 v17.4s, v2.8h, v1.8h + smull v18.4s, v3.4h, v1.4h + smull2 v19.4s, v3.8h, v1.8h + smull v2.4s, v4.4h, v1.4h + smull2 v3.4s, v4.8h, v1.8h + smull v4.4s, v5.4h, v1.4h + smull2 v5.4s, v5.8h, v1.8h + sshr v20.4s, v16.4s, #31 // sign = diff >> 31 + sshr v21.4s, v17.4s, #31 + sshr v22.4s, v18.4s, #31 + sshr v23.4s, v19.4s, #31 + sshr v24.4s, v2.4s, #31 + sshr v25.4s, v3.4s, #31 + sshr v26.4s, v4.4s, #31 + sshr v27.4s, v5.4s, #31 + add v16.4s, v16.4s, v20.4s // diff + sign + add v17.4s, v17.4s, v21.4s + add v18.4s, v18.4s, v22.4s + add v19.4s, v19.4s, v23.4s + add v2.4s, v2.4s, v24.4s + add v3.4s, v3.4s, v25.4s + add v4.4s, v4.4s, v26.4s + add v5.4s, v5.4s, v27.4s + rshrn v16.4h, v16.4s, #6 // (diff + sign + 32) >> 6 = apply_sign() + rshrn2 v16.8h, v17.4s, #6 + rshrn v17.4h, v18.4s, #6 + rshrn2 v17.8h, v19.4s, #6 + rshrn v6.4h, v2.4s, #6 + rshrn2 v6.8h, v3.4s, #6 + rshrn v7.4h, v4.4s, #6 + rshrn2 v7.8h, v5.4s, #6 + add v2.8h, v16.8h, v0.8h // dc + apply_sign() + add v3.8h, v17.8h, v0.8h + add v4.8h, v6.8h, v0.8h + add v5.8h, v7.8h, v0.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smax v4.8h, v4.8h, v30.8h + smax v5.8h, 
v5.8h, v30.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + smin v4.8h, v4.8h, v31.8h + smin v5.8h, v5.8h, v31.8h + st1 {v2.8h, v3.8h}, [x0], #32 + st1 {v4.8h, v5.8h}, [x6], #32 + b.gt 1b + subs w4, w4, #2 + add x5, x5, w9, uxtw #1 + add x7, x7, w9, uxtw #1 + add x0, x0, x1 + add x6, x6, x1 + mov w3, w9 + b.gt 1b + ret + +L(ipred_cfl_128_tbl): +L(ipred_cfl_splat_tbl): + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) + .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) +endfunc + +// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_top_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + clz w9, w3 + adr x7, L(ipred_cfl_top_tbl) + sub w9, w9, #26 + ldrh w9, [x7, w9, uxtw #1] + dup v1.8h, w6 // alpha + add x2, x2, #2 + sub x7, x7, w9, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 +4: + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) +8: + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) +16: + ld1 {v2.8h, v3.8h}, [x2] + addp v0.8h, v2.8h, v3.8h + addv h0, v0.8h + urshr v0.4h, v0.4h, #4 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) +32: + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + uaddlv s0, v0.8h + rshrn v0.4h, v0.4s, #5 + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_top_tbl): + .hword L(ipred_cfl_top_tbl) - 32b + .hword L(ipred_cfl_top_tbl) - 16b + .hword L(ipred_cfl_top_tbl) - 8b + .hword L(ipred_cfl_top_tbl) - 4b +endfunc + +// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// 
const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_left_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + sub x2, x2, w4, uxtw #1 + clz w9, w3 + clz w8, w4 + adr x10, L(ipred_cfl_splat_tbl) + adr x7, L(ipred_cfl_left_tbl) + sub w9, w9, #26 + sub w8, w8, #26 + ldrh w9, [x10, w9, uxtw #1] + ldrh w8, [x7, w8, uxtw #1] + dup v1.8h, w6 // alpha + sub x9, x10, w9, uxtw + sub x7, x7, w8, uxtw + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 + +L(ipred_cfl_left_h4): + ld1 {v0.4h}, [x2] + addv h0, v0.4h + urshr v0.4h, v0.4h, #2 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h8): + ld1 {v0.8h}, [x2] + addv h0, v0.8h + urshr v0.4h, v0.4h, #3 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h16): + ld1 {v2.8h, v3.8h}, [x2] + addp v0.8h, v2.8h, v3.8h + addv h0, v0.8h + urshr v0.4h, v0.4h, #4 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_h32): + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + uaddlv s0, v0.8h + rshrn v0.4h, v0.4s, #5 + dup v0.8h, v0.h[0] + br x9 + +L(ipred_cfl_left_tbl): + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) + .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) +endfunc + +// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride, +// const pixel *const topleft, +// const int width, const int height, +// const int16_t *ac, const int alpha, +// const int bitdepth_max); +function ipred_cfl_16bpc_neon, export=1 + dup v31.8h, w7 // bitdepth_max + sub x2, x2, w4, uxtw #1 + add w8, w3, w4 // width + height + dup v1.8h, w6 // alpha + clz w9, w3 + clz w6, w4 + dup v16.4s, w8 // width + height + adr x7, L(ipred_cfl_tbl) + rbit w8, w8 // rbit(width + height) + sub w9, w9, #22 // 22 leading bits, minus table offset 4 + sub w6, w6, #26 + clz w8, w8 // ctz(width + height) + ldrh 
w9, [x7, w9, uxtw #1] + ldrh w6, [x7, w6, uxtw #1] + neg w8, w8 // -ctz(width + height) + sub x9, x7, w9, uxtw + sub x7, x7, w6, uxtw + ushr v16.4s, v16.4s, #1 // (width + height) >> 1 + dup v17.4s, w8 // -ctz(width + height) + add x6, x0, x1 + lsl x1, x1, #1 + movi v30.8h, #0 + br x7 + +L(ipred_cfl_h4): + ld1 {v0.4h}, [x2], #8 + uaddlv s0, v0.4h + br x9 +L(ipred_cfl_w4): + add x2, x2, #2 + ld1 {v2.4h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s2, v2.4h + cmp w4, #4 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #16 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w4) + +L(ipred_cfl_h8): + ld1 {v0.8h}, [x2], #16 + uaddlv s0, v0.8h + br x9 +L(ipred_cfl_w8): + add x2, x2, #2 + ld1 {v2.8h}, [x2] + add v0.2s, v0.2s, v16.2s + uaddlv s2, v2.8h + cmp w4, #8 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/16/32 + cmp w4, #32 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w8) + +L(ipred_cfl_h16): + ld1 {v2.8h, v3.8h}, [x2], #32 + addp v0.8h, v2.8h, v3.8h + uaddlv s0, v0.8h + br x9 +L(ipred_cfl_w16): + add x2, x2, #2 + ld1 {v2.8h, v3.8h}, [x2] + add v0.2s, v0.2s, v16.2s + addp v2.8h, v2.8h, v3.8h + uaddlv s2, v2.8h + cmp w4, #16 + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 4/8/32 + tst w4, #(32+16+8) // 16 added to make a consecutive bitmask + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_h32): + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v0.8h, v2.8h, v4.8h + uaddlv s0, v0.8h + br x9 +L(ipred_cfl_w32): + add 
x2, x2, #2 + ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] + add v0.4s, v0.4s, v16.4s + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v2.8h, v2.8h, v4.8h + cmp w4, #32 + uaddlv s2, v2.8h + add v0.2s, v0.2s, v2.2s + ushl v0.2s, v0.2s, v17.2s + b.eq 1f + // h = 8/16 + cmp w4, #8 + mov w16, #0x6667 + mov w17, #0xAAAB + csel w16, w16, w17, eq + dup v16.2s, w16 + mul v0.2s, v0.2s, v16.2s + ushr v0.2s, v0.2s, #17 +1: + dup v0.8h, v0.h[0] + b L(ipred_cfl_splat_w16) + +L(ipred_cfl_tbl): + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) + .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) +endfunc + +// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_420_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_420_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_420_w4): +1: // Copy and subsample input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v2.8h + addp v1.8h, v1.8h, v3.8h + add v0.8h, v0.8h, v1.8h + shl v0.8h, v0.8h, #1 + subs w8, w8, #2 + st1 {v0.8h}, [x0], #16 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + b.gt 1b + trn2 v1.2d, v0.2d, v0.2d + trn2 v0.2d, v0.2d, v0.2d 
+L(ipred_cfl_ac_420_w4_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 2b +3: +L(ipred_cfl_ac_420_w4_calc_subtract_dc): + // Aggregate the sums + add v24.4s, v24.4s, v25.4s + add v26.4s, v26.4s, v27.4s + add v0.4s, v24.4s, v26.4s + addv s0, v0.4s // sum + sub x0, x0, w6, uxtw #3 + urshl v4.2s, v0.2s, v31.2s // (sum + (1 << (log2sz - 1))) >>= log2sz + dup v4.8h, v4.h[0] +6: // Subtract dc from ac + ld1 {v0.8h, v1.8h}, [x0] + subs w6, w6, #4 + sub v0.8h, v0.8h, v4.8h + sub v1.8h, v1.8h, v4.8h + st1 {v0.8h, v1.8h}, [x0], #32 + b.gt 6b + ret + +L(ipred_cfl_ac_420_w8): + cbnz w3, L(ipred_cfl_ac_420_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v4.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + mov v0.16b, v1.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_420_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v2.8h + addp v1.8h, v1.8h, v3.8h + add v0.8h, v0.8h, v1.8h + shl v0.8h, v0.8h, #1 + dup v1.4h, v0.h[3] + dup v3.4h, v0.h[7] + trn2 v2.2d, v0.2d, v0.2d + subs w8, w8, #2 + st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw v25.4s, v25.4s, v1.4h + uaddw v26.4s, v26.4s, v2.4h + uaddw v27.4s, v27.4s, v3.4h + b.gt 1b + trn1 v0.2d, v2.2d, v3.2d + trn1 v1.2d, v2.2d, 
v3.2d + +L(ipred_cfl_ac_420_w8_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 2b +3: + + // Double the height and reuse the w4 summing/subtracting + lsl w6, w6, #1 + lsl w9, w9, #1 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_w16): + adr x7, L(ipred_cfl_ac_420_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_420_w16_wpad0): +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], x2 + add v0.8h, v0.8h, v4.8h + ld1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2 + add v2.8h, v2.8h, v6.8h + addp v16.8h, v16.8h, v17.8h + addp v18.8h, v18.8h, v19.8h + addp v20.8h, v20.8h, v21.8h + addp v22.8h, v22.8h, v23.8h + add v16.8h, v16.8h, v20.8h + add v18.8h, v18.8h, v22.8h + shl v0.8h, v0.8h, #1 + shl v1.8h, v2.8h, #1 + shl v2.8h, v16.8h, #1 + shl v3.8h, v18.8h, #1 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad1): +1: // Copy and subsample input, padding 4 + ldr q2, [x1, #32] + ld1 {v0.8h, v1.8h}, [x1], x2 + ldr q5, [x10, #32] + ld1 {v3.8h, v4.8h}, [x10], x2 + addp v2.8h, v2.8h, v2.8h 
+ addp v0.8h, v0.8h, v1.8h + addp v5.8h, v5.8h, v5.8h + addp v3.8h, v3.8h, v4.8h + ldr q18, [x1, #32] + add v2.4h, v2.4h, v5.4h + ld1 {v16.8h, v17.8h}, [x1], x2 + add v0.8h, v0.8h, v3.8h + ldr q21, [x10, #32] + ld1 {v19.8h, v20.8h}, [x10], x2 + addp v18.8h, v18.8h, v18.8h + addp v16.8h, v16.8h, v17.8h + addp v21.8h, v21.8h, v21.8h + addp v19.8h, v19.8h, v20.8h + add v18.4h, v18.4h, v21.4h + add v16.8h, v16.8h, v19.8h + shl v1.4h, v2.4h, #1 + shl v0.8h, v0.8h, #1 + shl v3.4h, v18.4h, #1 + shl v2.8h, v16.8h, #1 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad2): +1: // Copy and subsample input, padding 8 + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + add v0.8h, v0.8h, v2.8h + add v4.8h, v4.8h, v6.8h + shl v0.8h, v0.8h, #1 + shl v2.8h, v4.8h, #1 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_wpad3): +1: // Copy and subsample input, padding 12 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + ld1 {v4.8h}, [x1], x2 
+ ld1 {v6.8h}, [x10], x2 + addp v0.8h, v0.8h, v4.8h + addp v2.8h, v2.8h, v6.8h + add v0.8h, v0.8h, v2.8h + shl v0.8h, v0.8h, #1 + dup v1.8h, v0.h[3] + dup v3.8h, v0.h[7] + trn2 v2.2d, v0.2d, v3.2d + trn1 v0.2d, v0.2d, v1.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_420_w16_hpad): + cbz w4, 3f +2: // Vertical padding (h_pad > 0) + subs w4, w4, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 2b +3: + + // Quadruple the height and reuse the w4 summing/subtracting + lsl w6, w6, #2 + lsl w9, w9, #2 + b L(ipred_cfl_ac_420_w4_calc_subtract_dc) + +L(ipred_cfl_ac_420_tbl): + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) + .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) + .hword 0 + +L(ipred_cfl_ac_420_w16_tbl): + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) + .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) +endfunc + +// void 
cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, +// const ptrdiff_t stride, const int w_pad, +// const int h_pad, const int cw, const int ch); +function ipred_cfl_ac_422_16bpc_neon, export=1 + clz w8, w5 + lsl w4, w4, #2 + adr x7, L(ipred_cfl_ac_422_tbl) + sub w8, w8, #27 + ldrh w8, [x7, w8, uxtw #1] + movi v24.4s, #0 + movi v25.4s, #0 + movi v26.4s, #0 + movi v27.4s, #0 + sub x7, x7, w8, uxtw + sub w8, w6, w4 // height - h_pad + rbit w9, w5 // rbit(width) + rbit w10, w6 // rbit(height) + clz w9, w9 // ctz(width) + clz w10, w10 // ctz(height) + add w9, w9, w10 // log2sz + add x10, x1, x2 + dup v31.4s, w9 + lsl x2, x2, #1 + neg v31.4s, v31.4s // -log2sz + br x7 + +L(ipred_cfl_ac_422_w4): +1: // Copy and subsample input + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + subs w8, w8, #4 + st1 {v0.8h, v1.8h}, [x0], #32 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + b.gt 1b + trn2 v0.2d, v1.2d, v1.2d + trn2 v1.2d, v1.2d, v1.2d + b L(ipred_cfl_ac_420_w4_hpad) + +L(ipred_cfl_ac_422_w8): + cbnz w3, L(ipred_cfl_ac_422_w8_wpad) +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + ld1 {v4.8h, v5.8h}, [x1], x2 + addp v0.8h, v0.8h, v1.8h + ld1 {v6.8h, v7.8h}, [x10], x2 + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + shl v2.8h, v4.8h, #2 + shl v3.8h, v6.8h, #2 + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov 
v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w8_wpad): +1: // Copy and subsample input, padding 4 + ld1 {v0.8h}, [x1], x2 + ld1 {v1.8h}, [x10], x2 + ld1 {v2.8h}, [x1], x2 + ld1 {v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v4.4h, v0.h[3] + dup v5.8h, v0.h[7] + dup v6.4h, v2.h[3] + dup v7.8h, v2.h[7] + trn2 v1.2d, v0.2d, v5.2d + trn1 v0.2d, v0.2d, v4.2d + trn2 v3.2d, v2.2d, v7.2d + trn1 v2.2d, v2.2d, v6.2d + subs w8, w8, #4 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v3.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w8_hpad) + +L(ipred_cfl_ac_422_w16): + adr x7, L(ipred_cfl_ac_422_w16_tbl) + ldrh w3, [x7, w3, uxtw #1] + sub x7, x7, w3, uxtw + br x7 + +L(ipred_cfl_ac_422_w16_wpad0): +1: // Copy and subsample input, without padding + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x2 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + addp v4.8h, v4.8h, v5.8h + addp v6.8h, v6.8h, v7.8h + shl v0.8h, v0.8h, #2 + shl v1.8h, v2.8h, #2 + shl v2.8h, v4.8h, #2 + shl v3.8h, v6.8h, #2 + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad1): +1: // Copy and subsample input, padding 4 + ldr q2, [x1, #32] + ld1 {v0.8h, v1.8h}, [x1], x2 + ldr q6, [x10, #32] + ld1 {v4.8h, v5.8h}, [x10], x2 + addp v2.8h, v2.8h, 
v2.8h + addp v0.8h, v0.8h, v1.8h + addp v6.8h, v6.8h, v6.8h + addp v4.8h, v4.8h, v5.8h + shl v1.4h, v2.4h, #2 + shl v0.8h, v0.8h, #2 + shl v3.4h, v6.4h, #2 + shl v2.8h, v4.8h, #2 + dup v4.4h, v1.h[3] + dup v5.4h, v3.h[3] + trn1 v1.2d, v1.2d, v4.2d + trn1 v3.2d, v3.2d, v5.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad2): +1: // Copy and subsample input, padding 8 + ld1 {v0.8h, v1.8h}, [x1], x2 + ld1 {v2.8h, v3.8h}, [x10], x2 + addp v0.8h, v0.8h, v1.8h + addp v2.8h, v2.8h, v3.8h + shl v0.8h, v0.8h, #2 + shl v2.8h, v2.8h, #2 + dup v1.8h, v0.h[7] + dup v3.8h, v2.h[7] + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_w16_wpad3): +1: // Copy and subsample input, padding 12 + ld1 {v0.8h}, [x1], x2 + ld1 {v2.8h}, [x10], x2 + addp v0.8h, v0.8h, v0.8h + addp v2.8h, v2.8h, v2.8h + shl v0.4h, v0.4h, #2 + shl v2.4h, v2.4h, #2 + dup v1.8h, v0.h[3] + dup v3.8h, v2.h[3] + trn1 v0.2d, v0.2d, v1.2d + trn1 v2.2d, v2.2d, v3.2d + subs w8, w8, #2 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + uaddw v24.4s, v24.4s, v0.4h + uaddw2 v25.4s, v25.4s, v0.8h + uaddw v26.4s, v26.4s, v1.4h + uaddw2 v27.4s, v27.4s, v1.8h + uaddw v24.4s, v24.4s, v2.4h + uaddw2 v25.4s, v25.4s, v2.8h + uaddw v26.4s, v26.4s, v3.4h + uaddw2 v27.4s, v27.4s, v3.8h + b.gt 1b + mov v0.16b, 
v2.16b + mov v1.16b, v3.16b + b L(ipred_cfl_ac_420_w16_hpad) + +L(ipred_cfl_ac_422_tbl): + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) + .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) + .hword 0 + +L(ipred_cfl_ac_422_w16_tbl): + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) + .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) +endfunc diff --git a/ffmpeg/JNI/dav1d/src/arm/64/itx.S b/ffmpeg/JNI/dav1d/src/arm/64/itx.S index b6c0c14aa..245af0e78 100644 --- a/ffmpeg/JNI/dav1d/src/arm/64/itx.S +++ b/ffmpeg/JNI/dav1d/src/arm/64/itx.S @@ -58,7 +58,6 @@ // indicates only a quarter of input values are set, for idct16 and up, // a significant amount of calculation can be skipped, at the cost of more // code duplication and special casing. -// - Special case functions for e.g. more combinations with identity. 
const idct_coeffs, align=4 // idct4 @@ -106,7 +105,7 @@ const iadst8_coeffs, align=4 .short 4076, 401, 3612, 1931 .short 2598, 3166, 1189, 3920 // idct_coeffs - .short 2896, 2896*8, 1567, 3784, 0, 0, 0, 0 + .short 2896, 0, 1567, 3784, 0, 0, 0, 0 endconst const iadst16_coeffs, align=4 @@ -134,13 +133,6 @@ endconst .endif .endm -.macro smull_sz d0, d1, s0, c, sz - smull \d0\().4s, \s0\().4h, \c -.ifc \sz, .8h - smull2 \d1\().4s, \s0\().8h, \c -.endif -.endm - .macro rshrn_sz d0, s0, s1, shift, sz rshrn \d0\().4h, \s0\().4s, \shift .ifc \sz, .8h @@ -457,14 +449,14 @@ endfunc sqsub \r2\sz, v3\sz, v7\sz .endm -function inv_dct_4x4_neon +function inv_dct_4h_x4_neon, export=1 movrel x16, idct_coeffs ld1 {v0.4h}, [x16] idct_4 v16, v17, v18, v19, .4h ret endfunc -function inv_dct_8x4_neon +function inv_dct_8h_x4_neon, export=1 movrel x16, idct_coeffs ld1 {v0.4h}, [x16] idct_4 v16, v17, v18, v19, .8h @@ -497,12 +489,12 @@ endfunc rshrn \o3\().4h, \o3\().4s, #12 .endm -function inv_adst_4x4_neon +function inv_adst_4h_x4_neon, export=1 iadst_4x4 v16, v17, v18, v19 ret endfunc -function inv_flipadst_4x4_neon +function inv_flipadst_4h_x4_neon, export=1 iadst_4x4 v19, v18, v17, v16 ret endfunc @@ -563,17 +555,17 @@ endfunc rshrn2 \o3\().8h, v5.4s, #12 .endm -function inv_adst_8x4_neon +function inv_adst_8h_x4_neon, export=1 iadst_8x4 v16, v17, v18, v19 ret endfunc -function inv_flipadst_8x4_neon +function inv_flipadst_8h_x4_neon, export=1 iadst_8x4 v19, v18, v17, v16 ret endfunc -function inv_identity_4x4_neon +function inv_identity_4h_x4_neon, export=1 mov w16, #(5793-4096)*8 dup v0.4h, w16 sqrdmulh v4.4h, v16.4h, v0.h[0] @@ -587,7 +579,7 @@ function inv_identity_4x4_neon ret endfunc -function inv_identity_8x4_neon +function inv_identity_8h_x4_neon, export=1 mov w16, #(5793-4096)*8 dup v0.4h, w16 sqrdmulh v4.8h, v16.8h, v0.h[0] @@ -608,7 +600,7 @@ endfunc .endr .endm -function inv_txfm_add_wht_wht_4x4_neon, export=1 +function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1 mov x15, 
x30 movi v31.8h, #0 ld1 {v16.4h,v17.4h,v18.4h,v19.4h}, [x2] @@ -672,7 +664,7 @@ L(itx_4x4_end): endfunc .macro def_fn_4x4 txfm1, txfm2 -function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct @@ -692,8 +684,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_neon, export=1 b L(itx_4x4_end) 1: .endif - adr x4, inv_\txfm1\()_4x4_neon - adr x5, inv_\txfm2\()_4x4_neon + adr x4, inv_\txfm1\()_4h_x4_neon + adr x5, inv_\txfm2\()_4h_x4_neon b inv_txfm_add_4x4_neon endfunc .endm @@ -749,14 +741,14 @@ def_fn_4x4 identity, flipadst mov \r6\szb, v6\szb // out6 .endm -function inv_dct_8x8_neon +function inv_dct_8h_x8_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b ret endfunc -function inv_dct_4x8_neon +function inv_dct_4h_x8_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h}, [x16] idct_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b @@ -830,27 +822,27 @@ endfunc sqneg \o5\()\sz, v3\sz // out5 .endm -function inv_adst_8x8_neon +function inv_adst_8h_x8_neon, export=1 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .8h ret endfunc -function inv_flipadst_8x8_neon +function inv_flipadst_8h_x8_neon, export=1 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .8h ret endfunc -function inv_adst_4x8_neon +function inv_adst_4h_x8_neon, export=1 iadst_8 v16, v17, v18, v19, v20, v21, v22, v23, .4h ret endfunc -function inv_flipadst_4x8_neon +function inv_flipadst_4h_x8_neon, export=1 iadst_8 v23, v22, v21, v20, v19, v18, v17, v16, .4h ret endfunc -function inv_identity_8x8_neon +function inv_identity_8h_x8_neon, export=1 sqshl v16.8h, v16.8h, #1 sqshl v17.8h, v17.8h, #1 sqshl v18.8h, v18.8h, #1 @@ -862,7 +854,7 @@ function inv_identity_8x8_neon ret endfunc -function inv_identity_4x8_neon +function inv_identity_4h_x8_neon, export=1 sqshl v16.4h, v16.4h, #1 sqshl v17.4h, v17.4h, #1 sqshl v18.4h, v18.4h, #1 @@ 
-913,17 +905,17 @@ def_fn_8x8_base def_fn_8x8_base identity_ .macro def_fn_8x8 txfm1, txfm2 -function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 8, 8, 1 .endif - adr x5, inv_\txfm2\()_8x8_neon + adr x5, inv_\txfm2\()_8h_x8_neon .ifc \txfm1, identity b inv_txfm_identity_add_8x8_neon .else - adr x4, inv_\txfm1\()_8x8_neon + adr x4, inv_\txfm1\()_8h_x8_neon b inv_txfm_add_8x8_neon .endif endfunc @@ -1000,14 +992,14 @@ function inv_txfm_add_4x8_neon endfunc .macro def_fn_48 w, h, txfm1, txfm2 -function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 mov x15, x30 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 0 .endif - adr x4, inv_\txfm1\()_\h\()x\w\()_neon - adr x5, inv_\txfm2\()_\w\()x\h\()_neon + adr x4, inv_\txfm1\()_\h\()h_x\w\()_neon + adr x5, inv_\txfm2\()_\w\()h_x\h\()_neon b inv_txfm_add_\w\()x\h\()_neon endfunc .endm @@ -1118,14 +1110,14 @@ def_fns_48 8, 4 mov v22\szb, v3\szb .endm -function inv_dct_8x16_neon +function inv_dct_8h_x16_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h, v1.8h}, [x16] idct_16 .8h, .16b ret endfunc -function inv_dct_4x16_neon +function inv_dct_4h_x16_neon, export=1 movrel x16, idct_coeffs ld1 {v0.8h, v1.8h}, [x16] idct_16 .4h, .8b @@ -1302,27 +1294,27 @@ endfunc sqneg \o9\sz, v7\sz // out9 .endm -function inv_adst_8x16_neon +function inv_adst_8h_x16_neon, export=1 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b ret endfunc -function inv_flipadst_8x16_neon +function inv_flipadst_8h_x16_neon, export=1 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b ret endfunc -function inv_adst_4x16_neon +function inv_adst_4h_x16_neon, export=1 iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, 
.8b ret endfunc -function inv_flipadst_4x16_neon +function inv_flipadst_4h_x16_neon, export=1 iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b ret endfunc -function inv_identity_8x16_neon +function inv_identity_8h_x16_neon, export=1 mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1333,7 +1325,7 @@ function inv_identity_8x16_neon ret endfunc -function inv_identity_4x16_neon +function inv_identity_4h_x16_neon, export=1 mov w16, #2*(5793-4096)*8 dup v0.4h, w16 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -1376,71 +1368,49 @@ endfunc .endr .endm -function inv_txfm_horz_16x8_neon +.macro def_horz_16 scale=0, identity=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x8_neon mov x14, x30 movi v7.8h, #0 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x7] - st1 {v7.8h}, [x7], x8 -.endr - blr x4 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - srshr v\i\().8h, v\i\().8h, #2 -.endr - transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 - transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 - -.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 - st1 {v\i\().8h}, [x6], #16 -.endr - - br x14 -endfunc - -function inv_txfm_horz_identity_16x8_neon - mov x14, x30 - movi v7.8h, #0 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x7] - st1 {v7.8h}, [x7], x8 -.endr +.if \identity mov w16, #2*(5793-4096)*8 dup v0.4h, w16 - identity_8x16_shift2 v0.h[0] - transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 - transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 - -.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 - st1 {v\i\().8h}, [x6], #16 -.endr - - br x14 -endfunc - -function inv_txfm_horz_scale_16x8_neon - mov x14, x30 - movi v7.8h, #0 +.elseif 
\scale mov w16, #2896*8 dup v0.4h, w16 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - ld1 {v\i\().8h}, [x7] +.endif +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + ld1 {\i}, [x7] st1 {v7.8h}, [x7], x8 .endr +.if \scale scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif +.if \identity + identity_8x16_shift2 v0.h[0] +.else blr x4 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 - srshr v\i\().8h, v\i\().8h, #1 +.endif +.if \shift > 0 +.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h + srshr \i, \i, #\shift .endr +.endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 -.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31 - st1 {v\i\().8h}, [x6], #16 +.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h + st1 {\i}, [x6], #16 .endr br x14 endfunc +.endm + +def_horz_16 scale=0, identity=0, shift=2 +def_horz_16 scale=1, identity=0, shift=1, suffix=_scale +def_horz_16 scale=0, identity=1, shift=0, suffix=_identity function inv_txfm_add_vert_8x16_neon mov x14, x30 @@ -1487,7 +1457,7 @@ function inv_txfm_add_16x16_neon endfunc .macro def_fn_16x16 txfm1, txfm2, eob_half -function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc 16, 16, 2 .endif @@ -1495,9 +1465,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1 adr x9, inv_txfm_horz_identity_16x8_neon .else adr x9, inv_txfm_horz_16x8_neon - adr x4, inv_\txfm1\()_8x16_neon + adr x4, inv_\txfm1\()_8h_x16_neon .endif - 
adr x5, inv_\txfm2\()_8x16_neon + adr x5, inv_\txfm2\()_8h_x16_neon mov x13, #\eob_half b inv_txfm_add_16x16_neon endfunc @@ -1659,17 +1629,17 @@ def_fn_416_base def_fn_416_base identity_ .macro def_fn_416 w, h, txfm1, txfm2, eob_half -function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif .if \w == 4 - adr x4, inv_\txfm1\()_8x\w\()_neon - adr x5, inv_\txfm2\()_4x\h\()_neon + adr x4, inv_\txfm1\()_8h_x\w\()_neon + adr x5, inv_\txfm2\()_4h_x\h\()_neon mov w13, #\eob_half .else - adr x4, inv_\txfm1\()_4x\w\()_neon - adr x5, inv_\txfm2\()_8x\h\()_neon + adr x4, inv_\txfm1\()_4h_x\w\()_neon + adr x5, inv_\txfm2\()_8h_x\h\()_neon .endif .ifc \txfm1, identity b inv_txfm_identity_add_\w\()x\h\()_neon @@ -1842,12 +1812,12 @@ def_fn_816_base def_fn_816_base identity_ .macro def_fn_816 w, h, txfm1, txfm2, eob_half -function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1 .ifc \txfm1\()_\txfm2, dct_dct idct_dc \w, \h, 1 .endif - adr x4, inv_\txfm1\()_8x\w\()_neon - adr x5, inv_\txfm2\()_8x\h\()_neon + adr x4, inv_\txfm1\()_8h_x\w\()_neon + adr x5, inv_\txfm2\()_8h_x\h\()_neon .if \w == 8 mov x13, #\eob_half .endif @@ -1881,7 +1851,7 @@ def_fn_816 \w, \h, identity, flipadst, 64 def_fns_816 8, 16 def_fns_816 16, 8 -function inv_dct32_odd_8x16_neon +function inv_dct32_odd_8h_x16_neon, export=1 movrel x16, idct_coeffs, 2*16 ld1 {v0.8h, v1.8h}, [x16] sub x16, x16, #2*16 @@ -2059,7 +2029,7 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon scale_input .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31 .endif - bl inv_dct_8x16_neon + bl inv_dct_8h_x16_neon transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 transpose_8x8h v24, v25, v26, v27, v28, v29, v30, v31, v4, v5 @@ -2089,15 
+2059,13 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon scale_input .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23 scale_input .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31 .endif - bl inv_dct32_odd_8x16_neon + bl inv_dct32_odd_8h_x16_neon transpose_8x8h v31, v30, v29, v28, v27, v26, v25, v24, v4, v5 transpose_8x8h v23, v22, v21, v20, v19, v18, v17, v16, v4, v5 .macro store2 r0, r1, shift - ld1 {v4.8h}, [x6], #16 - ld1 {v5.8h}, [x6] + ld1 {v4.8h, v5.8h}, [x6] sqsub v7.8h, v4.8h, \r0 sqsub v6.8h, v5.8h, \r1 - sub x6, x6, #16 sqadd v4.8h, v4.8h, \r0 sqadd v5.8h, v5.8h, \r1 rev64 v6.8h, v6.8h @@ -2106,12 +2074,10 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon srshr v5.8h, v5.8h, #\shift srshr v6.8h, v6.8h, #\shift srshr v7.8h, v7.8h, #\shift - st1 {v4.8h}, [x6], #16 ext v6.16b, v6.16b, v6.16b, #8 - st1 {v5.8h}, [x6], #16 + st1 {v4.8h, v5.8h}, [x6], #32 ext v7.16b, v7.16b, v7.16b, #8 - st1 {v6.8h}, [x6], #16 - st1 {v7.8h}, [x6], #16 + st1 {v6.8h, v7.8h}, [x6], #32 .endm store2 v31.8h, v23.8h, \shift @@ -2139,7 +2105,7 @@ function inv_txfm_add_vert_dct_8x32_neon .endr sub x7, x7, x8, lsl #4 - bl inv_dct_8x16_neon + bl inv_dct_8h_x16_neon .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 st1 {v\i\().8h}, [x7], x8 @@ -2152,7 +2118,7 @@ function inv_txfm_add_vert_dct_8x32_neon .endr sub x7, x7, x8, lsl #4 sub x7, x7, x8, lsr #1 - bl inv_dct32_odd_8x16_neon + bl inv_dct32_odd_8h_x16_neon neg x9, x8 mov x10, x6 @@ -2216,7 +2182,7 @@ const eob_8x32 .short 43, 107, 171, 256 endconst -function inv_txfm_add_identity_identity_32x32_neon, export=1 +function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 movi v0.8h, #0 movrel x13, eob_32x32 @@ -2259,7 +2225,7 @@ endfunc .endm .macro def_identity_1632 w, h, wshort, hshort -function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1 +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 mov w16, #2896*8 mov w17, #2*(5793-4096)*8 dup v1.4h, w16 @@ -2285,7 
+2251,7 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1 .else // 32x16 shift_8_regs sqshl, 1 - identity_8x8 v1.h[1] + identity_8x8 v1.h[1] .endif transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 @@ -2319,12 +2285,13 @@ def_identity_1632 16, 32, _shortside, def_identity_1632 32, 16, , _shortside .macro def_identity_832 w, h -function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1 +function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1 movi v0.8h, #0 movrel x13, eob_8x32 mov w8, #2*\h 1: + ldrh w12, [x13], #2 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h ld1 {\i}, [x2] st1 {v0.8h}, [x2], x8 @@ -2337,14 +2304,13 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + cmp w3, w12 .if \w == 8 load_add_store_8x8 x0, x7, shiftbits=2 .else load_add_store_8x8 x0, x7, shiftbits=3 .endif - ldrh w12, [x13], #2 - cmp w3, w12 b.lt 9f .if \w == 8 sub x2, x2, x8, lsl #3 @@ -2363,7 +2329,7 @@ endfunc def_identity_832 8, 32 def_identity_832 32, 8 -function inv_txfm_add_dct_dct_32x32_neon, export=1 +function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 idct_dc 32, 32, 2 mov x15, x30 @@ -2411,14 +2377,14 @@ function inv_txfm_add_dct_dct_32x32_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_16x32_neon, export=1 +function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 idct_dc 16, 32, 1 mov x15, x30 sub sp, sp, #1024 movrel x13, eob_16x32 ldrh w12, [x13], #2 - adr x4, inv_dct_8x16_neon + adr x4, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24 add x6, sp, #(\i*16*2) @@ -2460,13 +2426,13 @@ function inv_txfm_add_dct_dct_16x32_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_32x16_neon, export=1 +function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 idct_dc 32, 16, 1 mov x15, x30 sub sp, sp, #1024 - adr x5, inv_dct_8x16_neon + adr x5, inv_dct_8h_x16_neon .irp i, 0, 8 add x6, sp, #(\i*32*2) @@ -2505,7 
+2471,7 @@ function inv_txfm_add_dct_dct_32x16_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_8x32_neon, export=1 +function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 idct_dc 8, 32, 2 mov x15, x30 @@ -2517,18 +2483,17 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1 mov x8, #2*32 mov w9, #32 mov x6, sp - mov x7, x2 1: .irp i, 16, 17, 18, 19, 20, 21, 22, 23 - ld1 {v\i\().8h}, [x7] - st1 {v28.8h}, [x7], x8 + ld1 {v\i\().8h}, [x2] + st1 {v28.8h}, [x2], x8 .endr ldrh w12, [x13], #2 + sub x2, x2, x8, lsl #3 sub w9, w9, #8 - sub x7, x7, x8, lsl #3 - add x7, x7, #2*8 + add x2, x2, #2*8 - bl inv_dct_8x8_neon + bl inv_dct_8h_x8_neon .irp i, 16, 17, 18, 19, 20, 21, 22, 23 srshr v\i\().8h, v\i\().8h, #2 @@ -2536,10 +2501,9 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1 transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v24, v25 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 cmp w3, w12 -.irp i, 16, 17, 18, 19, 20, 21, 22, 23 - st1 {v\i\().8h}, [x6], #16 -.endr + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64 b.ge 1b cbz w9, 3f @@ -2564,7 +2528,7 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_32x8_neon, export=1 +function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 idct_dc 32, 8, 2 mov x15, x30 @@ -2586,7 +2550,7 @@ function inv_txfm_add_dct_dct_32x8_neon, export=1 .endr add w9, w9, #8 - bl inv_dct_8x8_neon + bl inv_dct_8h_x8_neon cmp w9, #32 @@ -2791,7 +2755,7 @@ endfunc .endm .macro def_dct64_func suffix, clear=0, scale=0 -function inv_txfm_dct\suffix\()_8x64_neon +function inv_txfm_dct\suffix\()_8h_x64_neon, export=1 mov x14, x30 mov x6, sp lsl x8, x8, #2 @@ -2804,7 +2768,7 @@ function inv_txfm_dct\suffix\()_8x64_neon add x7, x7, x8, lsr #1 scale_if \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23 - bl inv_dct_8x16_neon + bl inv_dct_8h_x16_neon store16 x6 @@ -2817,7 +2781,7 @@ function inv_txfm_dct\suffix\()_8x64_neon sub x7, x7, x8, lsr #1 scale_if \scale, v0.h[0], v16, v17, 
v18, v19, v20, v21, v22, v23 - bl inv_dct32_odd_8x16_neon + bl inv_dct32_odd_8h_x16_neon add x10, x6, #16*15 sub x6, x6, #16*16 @@ -3040,7 +3004,11 @@ endfunc .macro sub_sp space #ifdef _WIN32 -.if \space > 4096 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. + .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 sub x16, sp, #4096 ldr xzr, [x16] sub sp, x16, #(\space - 4096) @@ -3050,16 +3018,14 @@ endfunc #else .if \space >= 4096 sub sp, sp, #(\space)/4096*4096 +.endif .if (\space % 4096) != 0 sub sp, sp, #(\space)%4096 .endif -.else - sub sp, sp, #\space -.endif #endif .endm -function inv_txfm_add_dct_dct_64x64_neon, export=1 +function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 idct_dc 64, 64, 2 mov x15, x30 @@ -3079,7 +3045,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1 add x7, x2, #(\i*2) mov x8, #32*2 mov x12, #-2 // shift - bl inv_txfm_dct_clear_8x64_neon + bl inv_txfm_dct_clear_8h_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 24 @@ -3104,7 +3070,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1 .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x7, x5, #(\i*2) mov x8, #64*2 - bl inv_txfm_dct_8x64_neon + bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr @@ -3113,7 +3079,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_64x32_neon, export=1 +function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 idct_dc 64, 32, 1 mov x15, x30 @@ -3133,7 +3099,7 @@ function inv_txfm_add_dct_dct_64x32_neon, export=1 add x7, x2, #(\i*2) mov x8, #32*2 mov x12, #-1 // shift - bl inv_txfm_dct_clear_scale_8x64_neon + bl inv_txfm_dct_clear_scale_8h_x64_neon add x6, x5, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon .if \i < 24 @@ -3166,7 +3132,7 @@ function inv_txfm_add_dct_dct_64x32_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_32x64_neon, export=1 
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 idct_dc 32, 64, 1 mov x15, x30 @@ -3207,7 +3173,7 @@ function inv_txfm_add_dct_dct_32x64_neon, export=1 .irp i, 0, 8, 16, 24 add x7, x5, #(\i*2) mov x8, #32*2 - bl inv_txfm_dct_8x64_neon + bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr @@ -3216,7 +3182,7 @@ function inv_txfm_add_dct_dct_32x64_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_64x16_neon, export=1 +function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 idct_dc 64, 16, 2 mov x15, x30 @@ -3232,14 +3198,16 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1 mov w8, #(16 - \i) cmp w3, w12 b.lt 1f - ldrh w12, [x13], #2 .endif add x7, x2, #(\i*2) mov x8, #16*2 mov x12, #-2 // shift - bl inv_txfm_dct_clear_8x64_neon + bl inv_txfm_dct_clear_8h_x64_neon add x6, x4, #(\i*64*2) bl inv_txfm_horz_dct_64x8_neon +.if \i < 8 + ldrh w12, [x13], #2 +.endif .endr b 3f @@ -3256,7 +3224,7 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1 b.gt 2b 3: - adr x5, inv_dct_8x16_neon + adr x5, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24, 32, 40, 48, 56 add x6, x0, #(\i) add x7, x4, #(\i*2) @@ -3268,7 +3236,7 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1 br x15 endfunc -function inv_txfm_add_dct_dct_16x64_neon, export=1 +function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 idct_dc 16, 64, 2 mov x15, x30 @@ -3279,7 +3247,7 @@ function inv_txfm_add_dct_dct_16x64_neon, export=1 movrel x13, eob_16x32 ldrh w12, [x13], #2 - adr x4, inv_dct_8x16_neon + adr x4, inv_dct_8h_x16_neon .irp i, 0, 8, 16, 24 add x6, x5, #(\i*16*2) .if \i > 0 @@ -3310,7 +3278,7 @@ function inv_txfm_add_dct_dct_16x64_neon, export=1 .irp i, 0, 8 add x7, x5, #(\i*2) mov x8, #16*2 - bl inv_txfm_dct_8x64_neon + bl inv_txfm_dct_8h_x64_neon add x6, x0, #(\i) bl inv_txfm_add_vert_dct_8x64_neon .endr diff --git a/ffmpeg/JNI/dav1d/src/arm/64/itx16.S b/ffmpeg/JNI/dav1d/src/arm/64/itx16.S new file mode 100644 index 000000000..266f57e36 --- /dev/null +++ 
b/ffmpeg/JNI/dav1d/src/arm/64/itx16.S @@ -0,0 +1,3526 @@ +/****************************************************************************** + * Copyright © 2018, VideoLAN and dav1d authors + * Copyright © 2020, Martin Storsjo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include "src/arm/asm.S" +#include "util.S" + +// The exported functions in this file have got the following signature: +// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob, +// int bitdepth_max); + +// Most of the functions use the following register layout: +// x0-x3 external parameters +// x4 function pointer to first transform +// x5 function pointer to second transform +// x6 output parameter for helper function +// x7 input parameter for helper function +// x8 input stride for helper function +// x9-x12 scratch variables for helper functions +// x13 pointer to list of eob thresholds +// x14 return pointer for helper function +// x15 return pointer for main function + +// The SIMD registers most often use the following layout: +// v0-v1 multiplication coefficients +// v2-v7 scratch registers +// v8-v15 unused +// v16-v31 inputs/outputs of transforms + +const idct_coeffs, align=4 + // idct4 + .int 2896, 2896*8*(1<<16), 1567, 3784 + // idct8 + .int 799, 4017, 3406, 2276 + // idct16 + .int 401, 4076, 3166, 2598 + .int 1931, 3612, 3920, 1189 + // idct32 + .int 201, 4091, 3035, 2751 + .int 1751, 3703, 3857, 1380 + .int 995, 3973, 3513, 2106 + .int 2440, 3290, 4052, 601 +endconst + +const idct64_coeffs, align=4 + .int 101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) + .int 1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16) + .int 4076, 401, 4017, 799 + + .int 4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16) + .int 3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16) + .int -3166, -2598, -799, -4017 + + .int 501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16) + .int 2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16) + .int 3612, 1931, 2276, 3406 + + .int 4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16) + .int 3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16) + 
.int -3920, -1189, -3406, -2276 +endconst + +const iadst4_coeffs, align=4 + .int 1321, 3803, 2482, 3344 +endconst + +const iadst8_coeffs, align=4 + .int 4076, 401, 3612, 1931 + .int 2598, 3166, 1189, 3920 + // idct_coeffs + .int 2896, 0, 1567, 3784 +endconst + +const iadst16_coeffs, align=4 + .int 4091, 201, 3973, 995 + .int 3703, 1751, 3290, 2440 + .int 2751, 3035, 2106, 3513 + .int 1380, 3857, 601, 4052 +endconst + +.macro mul_mla d, s0, s1, c0, c1 + mul \d\().4s, \s0\().4s, \c0 + mla \d\().4s, \s1\().4s, \c1 +.endm + +.macro mul_mls d, s0, s1, c0, c1 + mul \d\().4s, \s0\().4s, \c0 + mls \d\().4s, \s1\().4s, \c1 +.endm + +.macro scale_input sz, c, r0, r1, r2 r3, r4, r5, r6, r7 + sqrdmulh \r0\sz, \r0\sz, \c + sqrdmulh \r1\sz, \r1\sz, \c + sqrdmulh \r2\sz, \r2\sz, \c + sqrdmulh \r3\sz, \r3\sz, \c +.ifnb \r4 + sqrdmulh \r4\sz, \r4\sz, \c + sqrdmulh \r5\sz, \r5\sz, \c + sqrdmulh \r6\sz, \r6\sz, \c + sqrdmulh \r7\sz, \r7\sz, \c +.endif +.endm + +.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 +.ifnb \load + ld1 {\load}, [\src], x1 +.endif +.ifnb \shift + srshr \shift, \shift, #\shiftbits +.endif +.ifnb \addsrc + sqadd \adddst, \adddst, \addsrc +.endif +.ifnb \max + smax \max, \max, v6.8h +.endif +.ifnb \min + smin \min, \min, v7.8h +.endif +.ifnb \store + st1 {\store}, [\dst], x1 +.endif +.endm +.macro load_add_store_8x16 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src + load_add_store v3.8h, v17.8h, , , , , , \dst, \src + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src + load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src + load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src + load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src + load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, 
v19.8h, v18.8h, \dst, \src + load_add_store v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src + load_add_store v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src + load_add_store v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src + load_add_store v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src + load_add_store v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src + load_add_store v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src + load_add_store v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src + load_add_store v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src + load_add_store , , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src + load_add_store , , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src + load_add_store , , , , v31.8h, v30.8h, v29.8h, \dst, \src + load_add_store , , , , , v31.8h, v30.8h, \dst, \src + load_add_store , , , , , , v31.8h, \dst, \src +.endm +.macro load_add_store_8x8 dst, src, shiftbits=4 + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits + load_add_store v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits + load_add_store v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits + load_add_store , , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits + load_add_store , , , , v23.8h, v22.8h, v21.8h, \dst, \src, 
\shiftbits + load_add_store , , , , , v23.8h, v22.8h, \dst, \src, \shiftbits + load_add_store , , , , , , v23.8h, \dst, \src, \shiftbits +.endm +.macro load_add_store_8x4 dst, src, shiftbits=4 + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store v2.8h, v16.8h, , , , , , \dst, \src, \shiftbits + load_add_store v3.8h, v17.8h, , , , , , \dst, \src, \shiftbits + load_add_store v4.8h, v18.8h, v2.8h, v16.8h, , , , \dst, \src, \shiftbits + load_add_store v5.8h, v19.8h, v3.8h, v17.8h, v16.8h, , , \dst, \src, \shiftbits + load_add_store , , v4.8h, v18.8h, v17.8h, v16.8h, , \dst, \src, \shiftbits + load_add_store , , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits + load_add_store , , , , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits + load_add_store , , , , , v19.8h, v18.8h, \dst, \src, \shiftbits + load_add_store , , , , , , v19.8h, \dst, \src, \shiftbits +.endm +.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src +.ifnb \load + ld1 {\load}[0], [\src], x1 +.endif +.ifnb \inssrc + ins \insdst\().d[1], \inssrc\().d[0] +.endif +.ifnb \shift + srshr \shift, \shift, #4 +.endif +.ifnb \load + ld1 {\load}[1], [\src], x1 +.endif +.ifnb \addsrc + sqadd \adddst, \adddst, \addsrc +.endif +.ifnb \store + st1 {\store}[0], [\dst], x1 +.endif +.ifnb \max + smax \max, \max, v6.8h +.endif +.ifnb \min + smin \min, \min, v7.8h +.endif +.ifnb \store + st1 {\store}[1], [\dst], x1 +.endif +.endm +.macro load_add_store_4x16 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src + load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src + load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , 
\dst, \src + load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src + load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src + load_add_store4 , , , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src + load_add_store4 , , , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src + load_add_store4 , , , , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src + load_add_store4 , , , , , , v30.8h, v28.8h, v26.d, \dst, \src + load_add_store4 , , , , , , , v30.8h, v28.d, \dst, \src + load_add_store4 , , , , , , , , v30.d, \dst, \src +.endm +.macro load_add_store_4x8 dst, src + mov \src, \dst + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff + load_add_store4 v0.d, v17, v16, , , , , , , \dst, \src + load_add_store4 v1.d, v19, v18, , , , , , , \dst, \src + load_add_store4 v2.d, v21, v20, v16.8h, , , , , , \dst, \src + load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h, , , , \dst, \src + load_add_store4 , , , v20.8h, v1.8h, v18.8h, v16.8h, , , \dst, \src + load_add_store4 , , , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h, , \dst, \src + load_add_store4 , , , , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src + load_add_store4 , , , , , , v22.8h, v20.8h, v18.d, \dst, \src + load_add_store4 , , , , , , , v22.8h, v20.d, \dst, \src + load_add_store4 , , , , , , , , v22.d, \dst, \src +.endm + +.macro idct_dc w, h, shift + cbnz w3, 1f + movz w16, #2896*8, lsl #16 + ld1r {v16.4s}, [x2] + dup v0.2s, w16 + sqrdmulh v20.4s, v16.4s, v0.s[0] + str wzr, [x2] +.if (\w == 2*\h) || (2*\w == \h) + sqrdmulh v20.4s, v20.4s, v0.s[0] +.endif +.if \shift > 0 + sqrshrn v16.4h, v20.4s, #\shift + sqrshrn2 v16.8h, v20.4s, #\shift +.else + sqxtn v16.4h, v20.4s + sqxtn2 v16.8h, v20.4s +.endif + sqrdmulh v16.8h, v16.8h, v0.h[1] + srshr v16.8h, v16.8h, #4 + mov w4, #\h + b idct_dc_w\w\()_neon +1: +.endm + +function idct_dc_w4_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.d}[0], [x0], x1 + ld1 {v0.d}[1], 
[x0], x1 + ld1 {v1.d}[0], [x0], x1 + subs w4, w4, #4 + ld1 {v1.d}[1], [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + sub x0, x0, x1, lsl #2 + sqadd v1.8h, v1.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + st1 {v0.d}[0], [x0], x1 + smin v1.8h, v1.8h, v31.8h + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w8_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h}, [x0], x1 + subs w4, w4, #4 + ld1 {v1.8h}, [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v2.8h}, [x0], x1 + sqadd v1.8h, v1.8h, v16.8h + ld1 {v3.8h}, [x0], x1 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + sub x0, x0, x1, lsl #2 + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + st1 {v0.8h}, [x0], x1 + smin v2.8h, v2.8h, v31.8h + st1 {v1.8h}, [x0], x1 + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h}, [x0], x1 + st1 {v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w16_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h, v1.8h}, [x0], x1 + subs w4, w4, #2 + ld1 {v2.8h, v3.8h}, [x0], x1 + sqadd v0.8h, v0.8h, v16.8h + sqadd v1.8h, v1.8h, v16.8h + sub x0, x0, x1, lsl #1 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + st1 {v0.8h, v1.8h}, [x0], x1 + smin v3.8h, v3.8h, v31.8h + st1 {v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w32_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] + subs w4, w4, #1 + sqadd v0.8h, v0.8h, v16.8h + sqadd v1.8h, v1.8h, v16.8h + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + smax v0.8h, v0.8h, 
v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +function idct_dc_w64_neon + movi v30.8h, #0 + mvni v31.8h, #0xfc, lsl #8 // 0x3ff + sub x1, x1, #64 +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + subs w4, w4, #1 + sqadd v0.8h, v0.8h, v16.8h + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] + sqadd v1.8h, v1.8h, v16.8h + sub x0, x0, #64 + sqadd v2.8h, v2.8h, v16.8h + sqadd v3.8h, v3.8h, v16.8h + sqadd v4.8h, v4.8h, v16.8h + sqadd v5.8h, v5.8h, v16.8h + sqadd v6.8h, v6.8h, v16.8h + sqadd v7.8h, v7.8h, v16.8h + smax v0.8h, v0.8h, v30.8h + smax v1.8h, v1.8h, v30.8h + smax v2.8h, v2.8h, v30.8h + smax v3.8h, v3.8h, v30.8h + smax v4.8h, v4.8h, v30.8h + smax v5.8h, v5.8h, v30.8h + smax v6.8h, v6.8h, v30.8h + smax v7.8h, v7.8h, v30.8h + smin v0.8h, v0.8h, v31.8h + smin v1.8h, v1.8h, v31.8h + smin v2.8h, v2.8h, v31.8h + smin v3.8h, v3.8h, v31.8h + smin v4.8h, v4.8h, v31.8h + smin v5.8h, v5.8h, v31.8h + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 + smin v6.8h, v6.8h, v31.8h + smin v7.8h, v7.8h, v31.8h + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 + b.gt 1b + ret +endfunc + +.macro iwht4 + add v16.4s, v16.4s, v17.4s + sub v21.4s, v18.4s, v19.4s + sub v20.4s, v16.4s, v21.4s + sshr v20.4s, v20.4s, #1 + sub v18.4s, v20.4s, v17.4s + sub v17.4s, v20.4s, v19.4s + add v19.4s, v21.4s, v18.4s + sub v16.4s, v16.4s, v17.4s +.endm + +.macro idct_4 r0, r1, r2, r3 + mul_mla v6, \r1, \r3, v0.s[3], v0.s[2] + mul_mls v4, \r1, \r3, v0.s[2], v0.s[3] + mul_mla v2, \r0, \r2, v0.s[0], v0.s[0] + mul_mls v3, \r0, \r2, v0.s[0], v0.s[0] + srshr v6.4s, v6.4s, #12 + srshr v7.4s, v4.4s, #12 + srshr v2.4s, v2.4s, #12 + srshr v3.4s, v3.4s, #12 + sqadd \r0\().4s, v2.4s, v6.4s + sqsub \r3\().4s, v2.4s, v6.4s + sqadd \r1\().4s, v3.4s, v7.4s + sqsub \r2\().4s, v3.4s, v7.4s +.endm + +function 
inv_dct_4s_x4_neon + movrel x16, idct_coeffs + ld1 {v0.4s}, [x16] + idct_4 v16, v17, v18, v19 + ret +endfunc + +.macro iadst_4x4 o0, o1, o2, o3 + movrel x16, iadst4_coeffs + ld1 {v0.4s}, [x16] + + sub v3.4s, v16.4s, v18.4s + mul v4.4s, v16.4s, v0.s[0] + mla v4.4s, v18.4s, v0.s[1] + mla v4.4s, v19.4s, v0.s[2] + mul v7.4s, v17.4s, v0.s[3] + add v3.4s, v3.4s, v19.4s + mul v5.4s, v16.4s, v0.s[2] + mls v5.4s, v18.4s, v0.s[0] + mls v5.4s, v19.4s, v0.s[1] + + add \o3\().4s, v4.4s, v5.4s + mul \o2\().4s, v3.4s, v0.s[3] + add \o0\().4s, v4.4s, v7.4s + add \o1\().4s, v5.4s, v7.4s + sub \o3\().4s, \o3\().4s, v7.4s + + srshr \o0\().4s, \o0\().4s, #12 + srshr \o2\().4s, \o2\().4s, #12 + srshr \o1\().4s, \o1\().4s, #12 + srshr \o3\().4s, \o3\().4s, #12 +.endm + +function inv_adst_4s_x4_neon + iadst_4x4 v16, v17, v18, v19 + ret +endfunc + +function inv_flipadst_4s_x4_neon + iadst_4x4 v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x4_neon + movz w16, #(5793-4096)*8, lsl #16 + dup v0.2s, w16 + sqrdmulh v4.4s, v16.4s, v0.s[0] + sqrdmulh v5.4s, v17.4s, v0.s[0] + sqrdmulh v6.4s, v18.4s, v0.s[0] + sqrdmulh v7.4s, v19.4s, v0.s[0] + sqadd v16.4s, v16.4s, v4.4s + sqadd v17.4s, v17.4s, v5.4s + sqadd v18.4s, v18.4s, v6.4s + sqadd v19.4s, v19.4s, v7.4s + ret +endfunc + +function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 + mov x15, x30 + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v30.4s, v31.4s}, [x2], #32 + + sshr v16.4s, v16.4s, #2 + sshr v17.4s, v17.4s, #2 + sshr v18.4s, v18.4s, #2 + sshr v19.4s, v19.4s, #2 + + iwht4 + + st1 {v30.4s, v31.4s}, [x2], #32 + transpose_4x4s v16, v17, v18, v19, v20, v21, v22, v23 + + iwht4 + + ld1 {v0.d}[0], [x0], x1 + sqxtn v16.4h, v16.4s + ld1 {v0.d}[1], [x0], x1 + sqxtn2 v16.8h, v17.4s + ld1 {v1.d}[0], [x0], x1 + sqxtn v18.4h, v18.4s + ld1 {v1.d}[1], [x0], x1 + sqxtn2 v18.8h, v19.4s + + b L(itx_4x4_end) +endfunc + +function inv_txfm_add_4x4_neon + movi v30.4s, #0 + movi v31.4s, #0 + ld1 
{v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v30.4s, v31.4s}, [x2], #32 + + blr x4 + + st1 {v30.4s, v31.4s}, [x2], #32 + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + transpose_4x4h v16, v17, v18, v19, v20, v21, v22, v23 + + blr x5 + + ld1 {v0.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + ld1 {v1.d}[0], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + srshr v16.8h, v16.8h, #4 + srshr v18.8h, v18.8h, #4 + +L(itx_4x4_end): + mvni v31.8h, #0xfc, lsl #8 // 0x3ff + sub x0, x0, x1, lsl #2 + sqadd v16.8h, v16.8h, v0.8h + sqadd v18.8h, v18.8h, v1.8h + smax v16.8h, v16.8h, v30.8h + smax v18.8h, v18.8h, v30.8h + smin v16.8h, v16.8h, v31.8h + st1 {v16.d}[0], [x0], x1 + smin v18.8h, v18.8h, v31.8h + st1 {v16.d}[1], [x0], x1 + st1 {v18.d}[0], [x0], x1 + st1 {v18.d}[1], [x0], x1 + + br x15 +endfunc + +.macro def_fn_4x4 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + cbnz w3, 1f + movz w16, #2896*8, lsl #16 + ld1r {v16.4s}, [x2] + dup v4.2s, w16 + str wzr, [x2] + sqrdmulh v16.4s, v16.4s, v4.s[0] + ld1 {v0.d}[0], [x0], x1 + sqxtn v20.4h, v16.4s + sqxtn2 v20.8h, v16.4s + ld1 {v0.d}[1], [x0], x1 + sqrdmulh v20.8h, v20.8h, v4.h[1] + ld1 {v1.d}[0], [x0], x1 + srshr v16.8h, v20.8h, #4 + ld1 {v1.d}[1], [x0], x1 + srshr v18.8h, v20.8h, #4 + movi v30.8h, #0 + b L(itx_4x4_end) +1: +.endif + adr x4, inv_\txfm1\()_4s_x4_neon + movrel x5, X(inv_\txfm2\()_4h_x4_neon) + b inv_txfm_add_4x4_neon +endfunc +.endm + +def_fn_4x4 dct, dct +def_fn_4x4 identity, identity +def_fn_4x4 dct, adst +def_fn_4x4 dct, flipadst +def_fn_4x4 dct, identity +def_fn_4x4 adst, dct +def_fn_4x4 adst, adst +def_fn_4x4 adst, flipadst +def_fn_4x4 flipadst, dct +def_fn_4x4 flipadst, adst +def_fn_4x4 flipadst, flipadst +def_fn_4x4 identity, dct + +def_fn_4x4 adst, identity +def_fn_4x4 flipadst, identity +def_fn_4x4 identity, adst +def_fn_4x4 identity, flipadst + 
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 /* 8-point inverse DCT over eight .4s vectors, results written back to r0-r7. Coefficients are read from v0 and v1, which the caller loads (inv_dct_4s_x8_neon takes them from idct_coeffs); v2-v7 are used as scratch. */ + idct_4 \r0, \r2, \r4, \r6 /* even-indexed inputs go through the 4-point macro */ + /* odd-indexed inputs: each product is scaled back with a rounding shift right by 12 */ + mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a + mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a + mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a + mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a + srshr \r1\().4s, v2.4s, #12 // t4a + srshr \r7\().4s, v4.4s, #12 // t7a + srshr \r3\().4s, v6.4s, #12 // t5a + srshr \r5\().4s, v7.4s, #12 // t6a + // saturating sum/difference pairs on the odd half + sqadd v2.4s, \r1\().4s, \r3\().4s // t4 + sqsub \r1\().4s, \r1\().4s, \r3\().4s // t5a + sqadd v3.4s, \r7\().4s, \r5\().4s // t7 + sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a + + mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5 + mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6 + srshr v4.4s, v4.4s, #12 // t5 + srshr v5.4s, v6.4s, #12 // t6 + // final combine: out[i] is even[i] plus t[7-i], out[7-i] is even[i] minus t[7-i]; out6 is staged in v6 because r6 is still read as an input for out3/out4 + sqsub \r7\().4s, \r0\().4s, v3.4s // out7 + sqadd \r0\().4s, \r0\().4s, v3.4s // out0 + sqadd \r1\().4s, \r2\().4s, v5.4s // out1 + sqsub v6.4s, \r2\().4s, v5.4s // out6 + sqadd \r2\().4s, \r4\().4s, v4.4s // out2 + sqsub \r5\().4s, \r4\().4s, v4.4s // out5 + sqadd \r3\().4s, \r6\().4s, v2.4s // out3 + sqsub \r4\().4s, \r6\().4s, v2.4s // out4 + mov \r6\().16b, v6.16b // out6 +.endm + // 8-point inverse DCT of v16-v23 (.4s lanes), in place; loads v0/v1 from idct_coeffs and uses x16 as the table pointer +function inv_dct_4s_x8_neon + movrel x16, idct_coeffs + ld1 {v0.4s, v1.4s}, [x16] + idct_8 v16, v17, v18, v19, v20, v21, v22, v23 + ret +endfunc + // 8-point inverse ADST: inputs are always v16-v23, outputs go to the registers named by o0-o7; coefficients come from iadst8_coeffs via x16, and v2-v7 plus v0/v1 are overwritten +.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 + movrel x16, iadst8_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mla v2, v23, v16, v0.s[0], v0.s[1] + mul_mls v4, v23, v16, v0.s[1], v0.s[0] + mul_mla v6, v21, v18, v0.s[2], v0.s[3] + srshr v16.4s, v2.4s, #12 // t0a + srshr v23.4s, v4.4s, #12 // t1a + mul_mls v2, v21, v18, v0.s[3], v0.s[2] + mul_mla v4, v19, v20, v1.s[0], v1.s[1] + srshr v18.4s, v6.4s, #12 // t2a + srshr v21.4s, v2.4s, #12 // t3a + mul_mls v6, v19, v20, v1.s[1], v1.s[0] + mul_mla v2, v17, v22, v1.s[2], v1.s[3] + srshr v20.4s, v4.4s, #12 // t4a + srshr v19.4s, v6.4s, #12 // t5a + mul_mls v4, v17, v22, v1.s[3], v1.s[2] + srshr v22.4s, v2.4s, #12 // t6a + srshr
v17.4s, v4.4s, #12 // t7a + // x16 was post-incremented by 32 above, so this loads the four words that follow the first eight coefficients + ld1 {v0.4s}, [x16] + + sqadd v2.4s, v16.4s, v20.4s // t0 + sqsub v3.4s, v16.4s, v20.4s // t4 + sqadd v4.4s, v23.4s, v19.4s // t1 + sqsub v5.4s, v23.4s, v19.4s // t5 + sqadd v6.4s, v18.4s, v22.4s // t2 + sqsub v7.4s, v18.4s, v22.4s // t6 + sqadd v18.4s, v21.4s, v17.4s // t3 + sqsub v19.4s, v21.4s, v17.4s // t7 + + mul_mla v16, v3, v5, v0.s[3], v0.s[2] + mul_mls v20, v3, v5, v0.s[2], v0.s[3] + mul_mls v22, v19, v7, v0.s[3], v0.s[2] + + srshr v3.4s, v16.4s, #12 // t4a + srshr v5.4s, v20.4s, #12 // t5a + + mul_mla v16, v19, v7, v0.s[2], v0.s[3] + + srshr v7.4s, v22.4s, #12 // t6a + srshr v19.4s, v16.4s, #12 // t7a + // outputs 7 and 1 are formed as sums and then negated with saturation + sqadd \o0\().4s, v2.4s, v6.4s // out0 + sqsub v2.4s, v2.4s, v6.4s // t2 + sqadd \o7\().4s, v4.4s, v18.4s // out7 + sqsub v4.4s, v4.4s, v18.4s // t3 + sqneg \o7\().4s, \o7\().4s // out7 + + sqadd \o1\().4s, v3.4s, v7.4s // out1 + sqsub v3.4s, v3.4s, v7.4s // t6 + sqadd \o6\().4s, v5.4s, v19.4s // out6 + sqsub v5.4s, v5.4s, v19.4s // t7 + sqneg \o1\().4s, \o1\().4s // out1 + // the "(vA or vB)" notes below name the physical register for the adst and flipadst argument orders respectively + mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20) + mul_mls v6, v2, v4, v0.s[0], v0.s[0] // -> out4 (v20 or v19) + mul_mls v20, v3, v5, v0.s[0], v0.s[0] // -> out5 (v21 or v18) + srshr v2.4s, v18.4s, #12 // out3 + mul_mla v18, v3, v5, v0.s[0], v0.s[0] // -> out2 (v18 or v21) + srshr v3.4s, v20.4s, #12 // out5 + srshr \o2\().4s, v18.4s, #12 // out2 (v18 or v21) + srshr \o4\().4s, v6.4s, #12 // out4 (v20 or v19) + + sqneg \o3\().4s, v2.4s // out3 + sqneg \o5\().4s, v3.4s // out5 +.endm + // inverse ADST of v16-v23 with outputs in the same register order +function inv_adst_4s_x8_neon + iadst_8 v16, v17, v18, v19, v20, v21, v22, v23 + ret +endfunc + // same computation as inv_adst_4s_x8_neon, but the outputs are written in reverse register order (v23 down to v16) +function inv_flipadst_4s_x8_neon + iadst_8 v23, v22, v21, v20, v19, v18, v17, v16 + ret +endfunc + // identity transform for 8 rows: doubles v16-v23 with saturation (sqshl by 1) +function inv_identity_4s_x8_neon + sqshl v16.4s, v16.4s, #1 + sqshl v17.4s, v17.4s, #1 + sqshl v18.4s, v18.4s, #1 + sqshl v19.4s, v19.4s, #1 + sqshl v20.4s, v20.4s, #1 + sqshl v21.4s, v21.4s, #1 + sqshl v22.4s, v22.4s, #1 + sqshl v23.4s, v23.4s, #1 + ret +endfunc +
+function inv_txfm_add_8x8_neon + movi v31.4s, #0 + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v31.4s}, [x6], x11 +.endr + + blr x4 + + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + sqrshrn2 v24.8h, v20.4s, #1 + sqrshrn2 v25.8h, v21.4s, #1 + sqrshrn2 v26.8h, v22.4s, #1 + sqrshrn2 v27.8h, v23.4s, #1 + + transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + movi \i, #0 +.endr + +2: + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x2] + st1 {v31.4s}, [x2], x11 +.endr + + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + + transpose_4x8h v16, v17, v18, v19, v20, v21, v22, v23 + + mov v20.16b, v24.16b + mov v21.16b, v25.16b + mov v22.16b, v26.16b + mov v23.16b, v27.16b + + blr x5 + + load_add_store_8x8 x0, x7 + br x15 +endfunc + +.macro def_fn_8x8 txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 8, 8, 1 +.endif + movrel x5, X(inv_\txfm2\()_8h_x8_neon) + mov w13, #\eob_half + adr x4, inv_\txfm1\()_4s_x8_neon + b inv_txfm_add_8x8_neon +endfunc +.endm + +def_fn_8x8 dct, dct, 10 +def_fn_8x8 identity, identity, 10 +def_fn_8x8 dct, adst, 10 +def_fn_8x8 dct, flipadst, 10 +def_fn_8x8 dct, identity, 4 +def_fn_8x8 adst, dct, 10 +def_fn_8x8 adst, adst, 10 +def_fn_8x8 adst, flipadst, 10 +def_fn_8x8 flipadst, dct, 10 +def_fn_8x8 flipadst, adst, 10 +def_fn_8x8 flipadst, flipadst, 10 +def_fn_8x8 identity, dct, 4 +def_fn_8x8 adst, identity, 4 +def_fn_8x8 flipadst, identity, 4 +def_fn_8x8 identity, adst, 4 +def_fn_8x8 
identity, flipadst, 4 + +function inv_txfm_add_8x4_neon + movi v28.4s, #0 + movi v29.4s, #0 + movi v30.4s, #0 + movi v31.4s, #0 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + ld1 {v20.4s,v21.4s,v22.4s,v23.4s}, [x2] + st1 {v28.4s,v29.4s,v30.4s,v31.4s}, [x2] + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + blr x4 + + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + sqxtn v20.4h, v20.4s + sqxtn v21.4h, v21.4s + sqxtn v22.4h, v22.4s + sqxtn v23.4h, v23.4s + + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + ins v16.d[1], v20.d[0] + ins v17.d[1], v21.d[0] + ins v18.d[1], v22.d[0] + ins v19.d[1], v23.d[0] + + blr x5 + + load_add_store_8x4 x0, x7 + br x15 +endfunc + +function inv_txfm_add_4x8_neon + movz w16, #2896*8, lsl #16 + movi v31.4s, #0 + dup v30.2s, w16 + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v31.4s}, [x6], x11 +.endr + scale_input .4s, v30.s[0], v16, v17, v18, v19 + blr x4 + sqxtn v20.4h, v16.4s + sqxtn v21.4h, v17.4s + sqxtn v22.4h, v18.4s + sqxtn v23.4h, v19.4s + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + + b 2f + +1: +.irp i, v20, v21, v22, v23 + movi \i\().4h, #0 +.endr + +2: + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x2] + st1 {v31.4s}, [x2], x11 +.endr + scale_input .4s, v30.s[0], v16, v17, v18, v19 + blr x4 + sqxtn v16.4h, v16.4s + sqxtn v17.4h, v17.4s + sqxtn v18.4h, v18.4s + sqxtn v19.4h, v19.4s + transpose_4x4h v16, v17, v18, v19, v4, v5, v6, v7 + + blr x5 + + load_add_store_4x8 x0, x7 + br x15 +endfunc + +.macro def_fn_48 w, h, txfm1, txfm2, eob_half +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 + mov x15, x30 + +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 0 +.endif + adr x4, inv_\txfm1\()_4s_x\w\()_neon 
+.if \w == 4 + mov w13, #\eob_half +.endif + movrel x5, X(inv_\txfm2\()_\w\()h_x\h\()_neon) + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_48 w, h +def_fn_48 \w, \h, dct, dct, 13 +def_fn_48 \w, \h, identity, identity, 13 +def_fn_48 \w, \h, dct, adst, 13 +def_fn_48 \w, \h, dct, flipadst, 13 +def_fn_48 \w, \h, dct, identity, 4 +def_fn_48 \w, \h, adst, dct, 13 +def_fn_48 \w, \h, adst, adst, 13 +def_fn_48 \w, \h, adst, flipadst, 13 +def_fn_48 \w, \h, flipadst, dct, 13 +def_fn_48 \w, \h, flipadst, adst, 13 +def_fn_48 \w, \h, flipadst, flipadst, 13 +def_fn_48 \w, \h, identity, dct, 16 +def_fn_48 \w, \h, adst, identity, 4 +def_fn_48 \w, \h, flipadst, identity, 4 +def_fn_48 \w, \h, identity, adst, 16 +def_fn_48 \w, \h, identity, flipadst, 16 +.endm + +def_fns_48 4, 8 +def_fns_48 8, 4 + + +function inv_dct_4s_x16_neon + movrel x16, idct_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + idct_8 v16, v18, v20, v22, v24, v26, v28, v30 + + ld1 {v0.4s, v1.4s}, [x16] + sub x16, x16, #32 + + mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a + mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a + mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a + srshr v17.4s, v2.4s, #12 // t8a + srshr v31.4s, v4.4s, #12 // t15a + mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a + mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a + srshr v23.4s, v6.4s, #12 // t9a + srshr v25.4s, v2.4s, #12 // t14a + mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a + mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a + srshr v21.4s, v4.4s, #12 // t10a + srshr v27.4s, v6.4s, #12 // t13a + mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a + srshr v19.4s, v2.4s, #12 // t11a + srshr v29.4s, v4.4s, #12 // t12a + + ld1 {v0.4s}, [x16] + + sqsub v2.4s, v17.4s, v23.4s // t9 + sqadd v17.4s, v17.4s, v23.4s // t8 + sqsub v3.4s, v31.4s, v25.4s // t14 + sqadd v31.4s, v31.4s, v25.4s // t15 + sqsub v23.4s, v19.4s, v21.4s // t10 + sqadd v19.4s, v19.4s, v21.4s // t11 + sqadd v25.4s, v29.4s, v27.4s // t12 + sqsub v29.4s, 
v29.4s, v27.4s // t13 + + mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a + mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a + srshr v21.4s, v4.4s, #12 // t9a + srshr v27.4s, v6.4s, #12 // t14a + + mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a + mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a + srshr v29.4s, v4.4s, #12 // t13a + neg v6.4s, v6.4s + srshr v23.4s, v6.4s, #12 // t10a + + sqsub v2.4s, v17.4s, v19.4s // t11a + sqadd v17.4s, v17.4s, v19.4s // t8a + sqsub v3.4s, v31.4s, v25.4s // t12a + sqadd v31.4s, v31.4s, v25.4s // t15a + sqadd v19.4s, v21.4s, v23.4s // t9 + sqsub v21.4s, v21.4s, v23.4s // t10 + sqsub v25.4s, v27.4s, v29.4s // t13 + sqadd v27.4s, v27.4s, v29.4s // t14 + + mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11 + mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12 + mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a + + srshr v4.4s, v4.4s, #12 // t11 + srshr v5.4s, v6.4s, #12 // t12 + mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a + srshr v2.4s, v2.4s, #12 // t10a + srshr v3.4s, v6.4s, #12 // t13a + + sqadd v6.4s, v16.4s, v31.4s // out0 + sqsub v31.4s, v16.4s, v31.4s // out15 + mov v16.16b, v6.16b + sqadd v23.4s, v30.4s, v17.4s // out7 + sqsub v7.4s, v30.4s, v17.4s // out8 + sqadd v17.4s, v18.4s, v27.4s // out1 + sqsub v30.4s, v18.4s, v27.4s // out14 + sqadd v18.4s, v20.4s, v3.4s // out2 + sqsub v29.4s, v20.4s, v3.4s // out13 + sqadd v3.4s, v28.4s, v19.4s // out6 + sqsub v25.4s, v28.4s, v19.4s // out9 + sqadd v19.4s, v22.4s, v5.4s // out3 + sqsub v28.4s, v22.4s, v5.4s // out12 + sqadd v20.4s, v24.4s, v4.4s // out4 + sqsub v27.4s, v24.4s, v4.4s // out11 + sqadd v21.4s, v26.4s, v2.4s // out5 + sqsub v26.4s, v26.4s, v2.4s // out10 + mov v24.16b, v7.16b + mov v22.16b, v3.16b + + ret +endfunc + +.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 + movrel x16, iadst16_coeffs + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mla v2, v31, v16, v0.s[0], v0.s[1] // -> t0 + mul_mls v4, v31, v16, v0.s[1], v0.s[0] // -> t1 + mul_mla
v6, v29, v18, v0.s[2], v0.s[3] // -> t2 + srshr v16.4s, v2.4s, #12 // t0 + srshr v31.4s, v4.4s, #12 // t1 + mul_mls v2, v29, v18, v0.s[3], v0.s[2] // -> t3 + mul_mla v4, v27, v20, v1.s[0], v1.s[1] // -> t4 + srshr v18.4s, v6.4s, #12 // t2 + srshr v29.4s, v2.4s, #12 // t3 + mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t5 + mul_mla v2, v25, v22, v1.s[2], v1.s[3] // -> t6 + srshr v20.4s, v4.4s, #12 // t4 + srshr v27.4s, v6.4s, #12 // t5 + mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t7 + ld1 {v0.4s, v1.4s}, [x16] + movrel x16, idct_coeffs + mul_mla v6, v23, v24, v0.s[0], v0.s[1] // -> t8 + srshr v22.4s, v2.4s, #12 // t6 + srshr v25.4s, v4.4s, #12 // t7 + mul_mls v2, v23, v24, v0.s[1], v0.s[0] // -> t9 + mul_mla v4, v21, v26, v0.s[2], v0.s[3] // -> t10 + srshr v23.4s, v6.4s, #12 // t8 + srshr v24.4s, v2.4s, #12 // t9 + mul_mls v6, v21, v26, v0.s[3], v0.s[2] // -> t11 + mul_mla v2, v19, v28, v1.s[0], v1.s[1] // -> t12 + srshr v21.4s, v4.4s, #12 // t10 + srshr v26.4s, v6.4s, #12 // t11 + mul_mls v4, v19, v28, v1.s[1], v1.s[0] // -> t13 + mul_mla v6, v17, v30, v1.s[2], v1.s[3] // -> t14 + srshr v19.4s, v2.4s, #12 // t12 + srshr v28.4s, v4.4s, #12 // t13 + mul_mls v2, v17, v30, v1.s[3], v1.s[2] // -> t15 + srshr v17.4s, v6.4s, #12 // t14 + srshr v30.4s, v2.4s, #12 // t15 + + ld1 {v0.4s, v1.4s}, [x16] + + sqsub v2.4s, v16.4s, v23.4s // t8a + sqadd v16.4s, v16.4s, v23.4s // t0a + sqsub v3.4s, v31.4s, v24.4s // t9a + sqadd v31.4s, v31.4s, v24.4s // t1a + sqadd v23.4s, v18.4s, v21.4s // t2a + sqsub v18.4s, v18.4s, v21.4s // t10a + sqadd v24.4s, v29.4s, v26.4s // t3a + sqsub v29.4s, v29.4s, v26.4s // t11a + sqadd v21.4s, v20.4s, v19.4s // t4a + sqsub v20.4s, v20.4s, v19.4s // t12a + sqadd v26.4s, v27.4s, v28.4s // t5a + sqsub v27.4s, v27.4s, v28.4s // t13a + sqadd v19.4s, v22.4s, v17.4s // t6a + sqsub v22.4s, v22.4s, v17.4s // t14a + sqadd v28.4s, v25.4s, v30.4s // t7a + sqsub v25.4s, v25.4s, v30.4s // t15a + + mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8 + mul_mls v6, v2, v3, 
v1.s[0], v1.s[1] // -> t9 + mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10 + srshr v17.4s, v4.4s, #12 // t8 + srshr v30.4s, v6.4s, #12 // t9 + mul_mls v4, v18, v29, v1.s[2], v1.s[3] // -> t11 + mul_mls v6, v27, v20, v1.s[1], v1.s[0] // -> t12 + srshr v18.4s, v2.4s, #12 // t10 + srshr v29.4s, v4.4s, #12 // t11 + mul_mla v2, v27, v20, v1.s[0], v1.s[1] // -> t13 + mul_mls v4, v25, v22, v1.s[3], v1.s[2] // -> t14 + srshr v27.4s, v6.4s, #12 // t12 + srshr v20.4s, v2.4s, #12 // t13 + mul_mla v6, v25, v22, v1.s[2], v1.s[3] // -> t15 + srshr v25.4s, v4.4s, #12 // t14 + srshr v22.4s, v6.4s, #12 // t15 + + sqsub v2.4s, v16.4s, v21.4s // t4 + sqadd v16.4s, v16.4s, v21.4s // t0 + sqsub v3.4s, v31.4s, v26.4s // t5 + sqadd v31.4s, v31.4s, v26.4s // t1 + sqadd v21.4s, v23.4s, v19.4s // t2 + sqsub v23.4s, v23.4s, v19.4s // t6 + sqadd v26.4s, v24.4s, v28.4s // t3 + sqsub v24.4s, v24.4s, v28.4s // t7 + sqadd v19.4s, v17.4s, v27.4s // t8a + sqsub v17.4s, v17.4s, v27.4s // t12a + sqadd v28.4s, v30.4s, v20.4s // t9a + sqsub v30.4s, v30.4s, v20.4s // t13a + sqadd v27.4s, v18.4s, v25.4s // t10a + sqsub v18.4s, v18.4s, v25.4s // t14a + sqadd v20.4s, v29.4s, v22.4s // t11a + sqsub v29.4s, v29.4s, v22.4s // t15a + + mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a + mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a + mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a + srshr v22.4s, v4.4s, #12 // t4a + srshr v25.4s, v6.4s, #12 // t5a + mul_mla v4, v24, v23, v0.s[2], v0.s[3] // -> t7a + mul_mla v6, v17, v30, v0.s[3], v0.s[2] // -> t12 + srshr v24.4s, v2.4s, #12 // t6a + srshr v23.4s, v4.4s, #12 // t7a + mul_mls v2, v17, v30, v0.s[2], v0.s[3] // -> t13 + mul_mls v4, v29, v18, v0.s[3], v0.s[2] // -> t14 + srshr v17.4s, v6.4s, #12 // t12 + mul_mla v6, v29, v18, v0.s[2], v0.s[3] // -> t15 + srshr v29.4s, v2.4s, #12 // t13 + srshr v30.4s, v4.4s, #12 // t14 + srshr v18.4s, v6.4s, #12 // t15 + + sqsub v2.4s, v16.4s, v21.4s // t2a +.ifc \o0, v16 + sqadd \o0\().4s, v16.4s, v21.4s // out0 + sqsub v21.4s, 
v31.4s, v26.4s // t3a + sqadd \o15\().4s, v31.4s, v26.4s // out15 +.else + sqadd v4.4s, v16.4s, v21.4s // out0 + sqsub v21.4s, v31.4s, v26.4s // t3a + sqadd \o15\().4s, v31.4s, v26.4s // out15 + mov \o0\().16b, v4.16b +.endif + sqneg \o15\().4s, \o15\().4s // out15 + + sqsub v3.4s, v29.4s, v18.4s // t15a + sqadd \o13\().4s, v29.4s, v18.4s // out13 + sqadd \o2\().4s, v17.4s, v30.4s // out2 + sqsub v26.4s, v17.4s, v30.4s // t14a + sqneg \o13\().4s, \o13\().4s // out13 + + sqadd \o1\().4s, v19.4s, v27.4s // out1 + sqsub v27.4s, v19.4s, v27.4s // t10 + sqadd \o14\().4s, v28.4s, v20.4s // out14 + sqsub v20.4s, v28.4s, v20.4s // t11 + sqneg \o1\().4s, \o1\().4s // out1 + + sqadd \o3\().4s, v22.4s, v24.4s // out3 + sqsub v22.4s, v22.4s, v24.4s // t6 + sqadd \o12\().4s, v25.4s, v23.4s // out12 + sqsub v23.4s, v25.4s, v23.4s // t7 + sqneg \o3\().4s, \o3\().4s // out3 + + mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23) + mul_mla v4, v2, v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24) + mul_mla v6, v26, v3, v0.s[0], v0.s[0] // -> out5 (v21 or v26) + + srshr v24.4s, v24.4s, #12 // out8 + srshr v4.4s, v4.4s, #12 // out7 + srshr v5.4s, v6.4s, #12 // out5 + mul_mls v6, v26, v3, v0.s[0], v0.s[0] // -> out10 (v26 or v21) + mul_mla v2, v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27) + srshr v26.4s, v6.4s, #12 // out10 + + mul_mls v6, v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20) + mul_mla v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25) + mul_mls v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22) + + srshr \o4\().4s, v2.4s, #12 // out4 + srshr v6.4s, v6.4s, #12 // out11 + srshr v7.4s, v21.4s, #12 // out9 + srshr \o6\().4s, v22.4s, #12 // out6 + +.ifc \o8, v23 + mov \o8\().16b, v24.16b + mov \o10\().16b, v26.16b +.endif + + sqneg \o7\().4s, v4.4s // out7 + sqneg \o5\().4s, v5.4s // out5 + sqneg \o11\().4s, v6.4s // out11 + sqneg \o9\().4s, v7.4s // out9 +.endm + +function inv_adst_4s_x16_neon + iadst_16 v16, v17, v18, v19, v20, v21, v22, v23, v24, 
v25, v26, v27, v28, v29, v30, v31 + ret +endfunc + +function inv_flipadst_4s_x16_neon + iadst_16 v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16 + ret +endfunc + +function inv_identity_4s_x16_neon + movz w16, #2*(5793-4096)*8, lsl #16 + dup v0.2s, w16 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + sqrdmulh v2.4s, v\i\().4s, v0.s[0] + sqadd v\i\().4s, v\i\().4s, v\i\().4s + sqadd v\i\().4s, v\i\().4s, v2.4s +.endr + ret +endfunc + +.macro identity_4x16_shift1 c +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + sqrdmulh v3.4s, \i, \c + srshr v3.4s, v3.4s, #1 + sqadd \i, \i, v3.4s +.endr +.endm + +.macro identity_4x16 c +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + sqrdmulh v3.4s, \i, \c + sqadd \i, \i, \i + sqadd \i, \i, v3.4s +.endr +.endm + +.macro def_horz_16 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_16x4_neon + mov x14, x30 + movi v7.4s, #0 +.if \scale + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.endif +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr +.if \scale + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + blr x4 + sqrshrn v16.4h, v16.4s, #\shift + sqrshrn v17.4h, v17.4s, #\shift + sqrshrn v18.4h, v18.4s, #\shift + sqrshrn v19.4h, v19.4s, #\shift + sqrshrn2 v16.8h, v20.4s, #\shift + sqrshrn2 v17.8h, v21.4s, #\shift + sqrshrn2 v18.8h, v22.4s, #\shift + sqrshrn2 v19.8h, v23.4s, #\shift + sqrshrn v20.4h, v24.4s, #\shift + sqrshrn v21.4h, v25.4s, #\shift + sqrshrn v22.4h, v26.4s, #\shift + sqrshrn v23.4h, v27.4s, #\shift + sqrshrn2 v20.8h, v28.4s, #\shift + sqrshrn2 v21.8h, v29.4s, 
#\shift + sqrshrn2 v22.8h, v30.4s, #\shift + sqrshrn2 v23.8h, v31.4s, #\shift + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + transpose_4x8h v20, v21, v22, v23, v4, v5, v6, v7 + +.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h + st1 {\i}, [x6], #16 +.endr + + br x14 +endfunc +.endm + +def_horz_16 scale=0, shift=2 +def_horz_16 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_8x16_neon + mov x14, x30 +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + blr x5 + load_add_store_8x16 x6, x7 + br x14 +endfunc + +function inv_txfm_add_16x16_neon + mov x15, x30 + sub sp, sp, #512 + ldrh w12, [x13], #2 +.irp i, 0, 4, 8, 12 + add x6, sp, #(\i*16*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #16*4 + bl inv_txfm_horz_16x4_neon +.endr + b 3f +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b +3: +.irp i, 0, 8 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #512 + br x15 +endfunc + +const eob_16x16 + .short 10, 36, 78, 256 +endconst + +const eob_16x16_identity + .short 4, 8, 12, 256 +endconst + +.macro def_fn_16x16 txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc 16, 16, 2 +.endif + adr x4, inv_\txfm1\()_4s_x16_neon + movrel x5, X(inv_\txfm2\()_8h_x16_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_16x16 +.else + movrel x13, eob_16x16_identity +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_16x16_identity +.else + movrel x13, eob_16x16 +.endif +.endif + b inv_txfm_add_16x16_neon +endfunc +.endm + +def_fn_16x16 dct, dct +def_fn_16x16 identity, identity +def_fn_16x16 dct, adst +def_fn_16x16 dct, flipadst 
+def_fn_16x16 dct, identity +def_fn_16x16 adst, dct +def_fn_16x16 adst, adst +def_fn_16x16 adst, flipadst +def_fn_16x16 flipadst, dct +def_fn_16x16 flipadst, adst +def_fn_16x16 flipadst, flipadst +def_fn_16x16 identity, dct + +function inv_txfm_add_16x4_neon + mov x15, x30 + movi v4.4s, #0 + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], #16 +.endr + + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + mov x6, x0 + load_add_store_8x4 x6, x7 + + sqrshrn v16.4h, v24.4s, #1 + sqrshrn v17.4h, v25.4s, #1 + sqrshrn v18.4h, v26.4s, #1 + sqrshrn v19.4h, v27.4s, #1 + sqrshrn2 v16.8h, v28.4s, #1 + sqrshrn2 v17.8h, v29.4s, #1 + sqrshrn2 v18.8h, v30.4s, #1 + sqrshrn2 v19.8h, v31.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + blr x5 + add x6, x0, #16 + load_add_store_8x4 x6, x7 + + br x15 +endfunc + +function inv_txfm_add_4x16_neon + ldrh w12, [x13, #4] + mov x15, x30 + + mov x11, #64 + + cmp w3, w12 + ldrh w12, [x13, #2] + b.lt 1f + + add x6, x2, #48 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v28.4h, v16.4s, #1 + rshrn v29.4h, v17.4s, #1 + rshrn v30.4h, v18.4s, #1 + rshrn v31.4h, v19.4s, #1 + transpose_4x4h v28, v29, v30, v31, v4, v5, v6, v7 + + b 2f +1: +.irp i, v28.4h, v29.4h, v30.4h, v31.4h + movi \i, #0 +.endr +2: + cmp w3, w12 + ldrh w12, [x13, #0] + b.lt 1f + + add x6, x2, #32 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v24.4h, v16.4s, #1 + rshrn v25.4h, v17.4s, #1 + rshrn v26.4h, v18.4s, #1 + rshrn v27.4h, v19.4s, #1 + 
transpose_4x4h v24, v25, v26, v27, v4, v5, v6, v7 + + b 2f +1: +.irp i, v24.4h, v25.4h, v26.4h, v27.4h + movi \i, #0 +.endr +2: + cmp w3, w12 + b.lt 1f + + add x6, x2, #16 + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x6] + st1 {v2.4s}, [x6], x11 +.endr + blr x4 + rshrn v20.4h, v16.4s, #1 + rshrn v21.4h, v17.4s, #1 + rshrn v22.4h, v18.4s, #1 + rshrn v23.4h, v19.4s, #1 + transpose_4x4h v20, v21, v22, v23, v4, v5, v6, v7 + + b 2f +1: +.irp i, v20.4h, v21.4h, v22.4h, v23.4h + movi \i, #0 +.endr +2: + + movi v2.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s + ld1 {\i}, [x2] + st1 {v2.4s}, [x2], x11 +.endr + blr x4 + rshrn v16.4h, v16.4s, #1 + rshrn v17.4h, v17.4s, #1 + rshrn v18.4h, v18.4s, #1 + rshrn v19.4h, v19.4s, #1 + transpose_4x8h v16, v17, v18, v19, v4, v5, v6, v7 + + blr x5 + + load_add_store_4x16 x0, x6 + + br x15 +endfunc + +const eob_4x16 + .short 13, 29, 45, 64 +endconst + +const eob_4x16_identity1 + .short 16, 32, 48, 64 +endconst + +const eob_4x16_identity2 + .short 4, 8, 12, 64 +endconst + +.macro def_fn_416 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif +.if \w == 4 + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_4h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_4x16 +.else + movrel x13, eob_4x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_4x16_identity2 +.else + movrel x13, eob_4x16 +.endif +.endif +.else + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_416 w, h +def_fn_416 \w, \h, dct, dct +def_fn_416 \w, \h, identity, identity +def_fn_416 \w, \h, dct, adst +def_fn_416 \w, \h, dct, flipadst +def_fn_416 \w, \h, dct, identity +def_fn_416 \w, \h, adst, dct +def_fn_416 \w, \h, adst, adst +def_fn_416 \w, \h, adst, flipadst +def_fn_416 \w, \h, 
flipadst, dct +def_fn_416 \w, \h, flipadst, adst +def_fn_416 \w, \h, flipadst, flipadst +def_fn_416 \w, \h, identity, dct +def_fn_416 \w, \h, adst, identity +def_fn_416 \w, \h, flipadst, identity +def_fn_416 \w, \h, identity, adst +def_fn_416 \w, \h, identity, flipadst +.endm + +def_fns_416 4, 16 +def_fns_416 16, 4 + + +function inv_txfm_add_16x8_neon + mov x15, x30 + stp d8, d9, [sp, #-0x40]! + stp d10, d11, [sp, #0x10] + stp d12, d13, [sp, #0x20] + stp d14, d15, [sp, #0x30] + + cmp w3, w13 + mov x11, #32 + b.lt 1f + + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + + add x6, x2, #16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + blr x4 + + sqrshrn v8.4h, v16.4s, #1 + sqrshrn v9.4h, v17.4s, #1 + sqrshrn v10.4h, v18.4s, #1 + sqrshrn v11.4h, v19.4s, #1 + sqrshrn2 v8.8h, v20.4s, #1 + sqrshrn2 v9.8h, v21.4s, #1 + sqrshrn2 v10.8h, v22.4s, #1 + sqrshrn2 v11.8h, v23.4s, #1 + sqrshrn v12.4h, v24.4s, #1 + sqrshrn v13.4h, v25.4s, #1 + sqrshrn v14.4h, v26.4s, #1 + sqrshrn v15.4h, v27.4s, #1 + sqrshrn2 v12.8h, v28.4s, #1 + sqrshrn2 v13.8h, v29.4s, #1 + sqrshrn2 v14.8h, v30.4s, #1 + sqrshrn2 v15.8h, v31.4s, #1 + + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + transpose_4x8h v12, v13, v14, v15, v2, v3, v4, v5 + + b 2f +1: +.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h + movi \i, #0 +.endr +2: + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 + + movi v4.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], x11 +.endr + + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, 
v30, v31 + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + + mov v20.16b, v8.16b + mov v21.16b, v9.16b + mov v22.16b, v10.16b + mov v23.16b, v11.16b + + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + sqrshrn v8.4h, v24.4s, #1 + sqrshrn v9.4h, v25.4s, #1 + sqrshrn v10.4h, v26.4s, #1 + sqrshrn v11.4h, v27.4s, #1 + sqrshrn2 v8.8h, v28.4s, #1 + sqrshrn2 v9.8h, v29.4s, #1 + sqrshrn2 v10.8h, v30.4s, #1 + sqrshrn2 v11.8h, v31.4s, #1 + + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + + blr x5 + + mov x6, x0 + load_add_store_8x8 x6, x7 + + mov v16.16b, v8.16b + mov v17.16b, v9.16b + mov v18.16b, v10.16b + mov v19.16b, v11.16b + mov v20.16b, v12.16b + mov v21.16b, v13.16b + mov v22.16b, v14.16b + mov v23.16b, v15.16b + + blr x5 + + add x0, x0, #16 + load_add_store_8x8 x0, x7 + + ldp d14, d15, [sp, #0x30] + ldp d12, d13, [sp, #0x20] + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x40 + br x15 +endfunc + +function inv_txfm_add_8x16_neon + mov x15, x30 + stp d8, d9, [sp, #-0x20]! 
+ stp d10, d11, [sp, #0x10] + ldrh w12, [x13, #4] + + mov x11, #64 + + cmp w3, w12 + ldrh w12, [x13, #2] + b.lt 1f + + add x6, x2, #48 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v28.4h, v16.4s, #1 + sqrshrn v29.4h, v17.4s, #1 + sqrshrn v30.4h, v18.4s, #1 + sqrshrn v31.4h, v19.4s, #1 + sqrshrn2 v28.8h, v20.4s, #1 + sqrshrn2 v29.8h, v21.4s, #1 + sqrshrn2 v30.8h, v22.4s, #1 + sqrshrn2 v31.8h, v23.4s, #1 + transpose_4x8h v28, v29, v30, v31, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v28.8h, v29.8h, v30.8h, v31.8h + movi \i, #0 +.endr + +2: + cmp w3, w12 + ldrh w12, [x13, #0] + b.lt 1f + + add x6, x2, #32 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v24.4h, v16.4s, #1 + sqrshrn v25.4h, v17.4s, #1 + sqrshrn v26.4h, v18.4s, #1 + sqrshrn v27.4h, v19.4s, #1 + sqrshrn2 v24.8h, v20.4s, #1 + sqrshrn2 v25.8h, v21.4s, #1 + sqrshrn2 v26.8h, v22.4s, #1 + sqrshrn2 v27.8h, v23.4s, #1 + transpose_4x8h v24, v25, v26, v27, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v24.8h, v25.8h, v26.8h, v27.8h + movi \i, #0 +.endr + +2: + cmp w3, w12 + b.lt 1f + + add x6, x2, #16 + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x6] + st1 {v4.4s}, [x6], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v8.4h, v16.4s, #1 + sqrshrn v9.4h, v17.4s, #1 + sqrshrn v10.4h, v18.4s, #1 + sqrshrn v11.4h, v19.4s, #1 + sqrshrn2 v8.8h, v20.4s, #1 + sqrshrn2 v9.8h, v21.4s, #1 + sqrshrn2 v10.8h, v22.4s, #1 + sqrshrn2 v11.8h, v23.4s, 
#1 + transpose_4x8h v8, v9, v10, v11, v2, v3, v4, v5 + + b 2f + +1: +.irp i, v8.8h, v9.8h, v10.8h, v11.8h + movi \i, #0 +.endr + +2: + movi v4.4s, #0 + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s + ld1 {\i}, [x2] + st1 {v4.4s}, [x2], x11 +.endr + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + blr x4 + + sqrshrn v16.4h, v16.4s, #1 + sqrshrn v17.4h, v17.4s, #1 + sqrshrn v18.4h, v18.4s, #1 + sqrshrn v19.4h, v19.4s, #1 + sqrshrn2 v16.8h, v20.4s, #1 + sqrshrn2 v17.8h, v21.4s, #1 + sqrshrn2 v18.8h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + mov v20.16b, v8.16b + mov v21.16b, v9.16b + mov v22.16b, v10.16b + mov v23.16b, v11.16b + + blr x5 + + load_add_store_8x16 x0, x6 + + ldp d10, d11, [sp, #0x10] + ldp d8, d9, [sp], 0x20 + + br x15 +endfunc + +const eob_8x16 + .short 10, 43, 75, 128 +endconst + +const eob_8x16_identity1 + .short 4, 64, 96, 128 +endconst + +const eob_8x16_identity2 + .short 4, 8, 12, 128 +endconst + +.macro def_fn_816 w, h, txfm1, txfm2 +function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1 +.ifc \txfm1\()_\txfm2, dct_dct + idct_dc \w, \h, 1 +.endif + adr x4, inv_\txfm1\()_4s_x\w\()_neon + movrel x5, X(inv_\txfm2\()_8h_x\h\()_neon) +.ifc \txfm1, identity +.ifc \txfm2, identity + movrel x13, eob_8x16 +.else + movrel x13, eob_8x16_identity1 +.endif +.else +.ifc \txfm2, identity + movrel x13, eob_8x16_identity2 +.else + movrel x13, eob_8x16 +.endif +.endif +.if \h == 8 + ldrh w13, [x13] +.endif + b inv_txfm_add_\w\()x\h\()_neon +endfunc +.endm + +.macro def_fns_816 w, h +def_fn_816 \w, \h, dct, dct +def_fn_816 \w, \h, identity, identity +def_fn_816 \w, \h, dct, adst +def_fn_816 \w, \h, dct, flipadst +def_fn_816 \w, \h, dct, identity +def_fn_816 \w, \h, adst, dct +def_fn_816 \w, \h, adst, adst +def_fn_816 \w, \h, adst, flipadst +def_fn_816 \w, \h, flipadst, dct +def_fn_816 \w, \h, flipadst, 
adst +def_fn_816 \w, \h, flipadst, flipadst +def_fn_816 \w, \h, identity, dct +def_fn_816 \w, \h, adst, identity +def_fn_816 \w, \h, flipadst, identity +def_fn_816 \w, \h, identity, adst +def_fn_816 \w, \h, identity, flipadst +.endm + +def_fns_816 8, 16 +def_fns_816 16, 8 + +function inv_dct32_odd_4s_x16_neon + movrel x16, idct_coeffs, 4*16 + ld1 {v0.4s, v1.4s}, [x16], #32 + + mul_mls v2, v16, v31, v0.s[0], v0.s[1] // -> t16a + mul_mla v4, v16, v31, v0.s[1], v0.s[0] // -> t31a + mul_mls v6, v24, v23, v0.s[2], v0.s[3] // -> t17a + srshr v16.4s, v2.4s, #12 // t16a + srshr v31.4s, v4.4s, #12 // t31a + mul_mla v2, v24, v23, v0.s[3], v0.s[2] // -> t30a + mul_mls v4, v20, v27, v1.s[0], v1.s[1] // -> t18a + srshr v24.4s, v6.4s, #12 // t17a + srshr v23.4s, v2.4s, #12 // t30a + mul_mla v6, v20, v27, v1.s[1], v1.s[0] // -> t29a + mul_mls v2, v28, v19, v1.s[2], v1.s[3] // -> t19a + srshr v20.4s, v4.4s, #12 // t18a + srshr v27.4s, v6.4s, #12 // t29a + mul_mla v4, v28, v19, v1.s[3], v1.s[2] // -> t28a + ld1 {v0.4s, v1.4s}, [x16] + sub x16, x16, #4*24 + mul_mls v6, v18, v29, v0.s[0], v0.s[1] // -> t20a + srshr v28.4s, v2.4s, #12 // t19a + srshr v19.4s, v4.4s, #12 // t28a + mul_mla v2, v18, v29, v0.s[1], v0.s[0] // -> t27a + mul_mls v4, v26, v21, v0.s[2], v0.s[3] // -> t21a + srshr v18.4s, v6.4s, #12 // t20a + srshr v29.4s, v2.4s, #12 // t27a + mul_mla v6, v26, v21, v0.s[3], v0.s[2] // -> t26a + mul_mls v2, v22, v25, v1.s[0], v1.s[1] // -> t22a + srshr v26.4s, v4.4s, #12 // t21a + srshr v21.4s, v6.4s, #12 // t26a + mul_mla v4, v22, v25, v1.s[1], v1.s[0] // -> t25a + mul_mls v6, v30, v17, v1.s[2], v1.s[3] // -> t23a + srshr v22.4s, v2.4s, #12 // t22a + srshr v25.4s, v4.4s, #12 // t25a + mul_mla v2, v30, v17, v1.s[3], v1.s[2] // -> t24a + srshr v30.4s, v6.4s, #12 // t23a + srshr v17.4s, v2.4s, #12 // t24a + + ld1 {v0.4s, v1.4s}, [x16] + + sqsub v2.4s, v16.4s, v24.4s // t17 + sqadd v16.4s, v16.4s, v24.4s // t16 + sqsub v3.4s, v31.4s, v23.4s // t30 + sqadd v31.4s, v31.4s, v23.4s // 
t31 + sqsub v24.4s, v28.4s, v20.4s // t18 + sqadd v28.4s, v28.4s, v20.4s // t19 + sqadd v23.4s, v18.4s, v26.4s // t20 + sqsub v18.4s, v18.4s, v26.4s // t21 + sqsub v20.4s, v30.4s, v22.4s // t22 + sqadd v30.4s, v30.4s, v22.4s // t23 + sqadd v26.4s, v17.4s, v25.4s // t24 + sqsub v17.4s, v17.4s, v25.4s // t25 + sqsub v22.4s, v29.4s, v21.4s // t26 + sqadd v29.4s, v29.4s, v21.4s // t27 + sqadd v25.4s, v19.4s, v27.4s // t28 + sqsub v19.4s, v19.4s, v27.4s // t29 + + mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a + mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a + mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a + srshr v21.4s, v4.4s, #12 // t17a + srshr v27.4s, v6.4s, #12 // t30a + neg v2.4s, v2.4s // -> t18a + mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a + mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a + srshr v19.4s, v2.4s, #12 // t18a + srshr v24.4s, v4.4s, #12 // t29a + mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a + mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a + srshr v22.4s, v6.4s, #12 // t21a + srshr v18.4s, v2.4s, #12 // t26a + neg v4.4s, v4.4s // -> t22a + mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a + srshr v17.4s, v4.4s, #12 // t22a + srshr v20.4s, v6.4s, #12 // t25a + + sqsub v2.4s, v27.4s, v24.4s // t29 + sqadd v27.4s, v27.4s, v24.4s // t30 + sqsub v3.4s, v21.4s, v19.4s // t18 + sqadd v21.4s, v21.4s, v19.4s // t17 + sqsub v24.4s, v16.4s, v28.4s // t19a + sqadd v16.4s, v16.4s, v28.4s // t16a + sqsub v19.4s, v30.4s, v23.4s // t20a + sqadd v30.4s, v30.4s, v23.4s // t23a + sqsub v28.4s, v17.4s, v22.4s // t21 + sqadd v17.4s, v17.4s, v22.4s // t22 + sqadd v23.4s, v26.4s, v29.4s // t24a + sqsub v26.4s, v26.4s, v29.4s // t27a + sqadd v22.4s, v20.4s, v18.4s // t25 + sqsub v20.4s, v20.4s, v18.4s // t26 + sqsub v29.4s, v31.4s, v25.4s // t28a + sqadd v31.4s, v31.4s, v25.4s // t31a + + mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a + mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a + mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19 + srshr v18.4s, 
v4.4s, #12 // t18a + srshr v25.4s, v6.4s, #12 // t29a + mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28 + mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20 + srshr v29.4s, v2.4s, #12 // t19 + srshr v24.4s, v4.4s, #12 // t28 + neg v6.4s, v6.4s // -> t20 + mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27 + mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a + srshr v26.4s, v6.4s, #12 // t20 + srshr v19.4s, v2.4s, #12 // t27 + neg v4.4s, v4.4s // -> t21a + mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a + srshr v20.4s, v4.4s, #12 // t21a + srshr v28.4s, v6.4s, #12 // t26a + + sqsub v2.4s, v16.4s, v30.4s // t23 + sqadd v16.4s, v16.4s, v30.4s // t16 = out16 + sqsub v3.4s, v31.4s, v23.4s // t24 + sqadd v31.4s, v31.4s, v23.4s // t31 = out31 + sqsub v23.4s, v21.4s, v17.4s // t22a + sqadd v17.4s, v21.4s, v17.4s // t17a = out17 + sqadd v30.4s, v27.4s, v22.4s // t30a = out30 + sqsub v21.4s, v27.4s, v22.4s // t25a + sqsub v27.4s, v18.4s, v20.4s // t21 + sqadd v18.4s, v18.4s, v20.4s // t18 = out18 + sqadd v4.4s, v29.4s, v26.4s // t19a = out19 + sqsub v26.4s, v29.4s, v26.4s // t20a + sqadd v29.4s, v25.4s, v28.4s // t29 = out29 + sqsub v25.4s, v25.4s, v28.4s // t26 + sqadd v28.4s, v24.4s, v19.4s // t28a = out28 + sqsub v24.4s, v24.4s, v19.4s // t27a + mov v19.16b, v4.16b // out19 + + mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20 + mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27 + srshr v20.4s, v4.4s, #12 // t20 + srshr v22.4s, v6.4s, #12 // t27 + + mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a + mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a + mov v27.16b, v22.16b // t27 + srshr v26.4s, v4.4s, #12 // t26a + + mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22 + mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25 + srshr v21.4s, v6.4s, #12 // t21a + srshr v22.4s, v24.4s, #12 // t22 + srshr v25.4s, v4.4s, #12 // t25 + + mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a + mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a + srshr v23.4s, v4.4s, #12 // t23a + srshr v24.4s, v6.4s, 
#12 // t24a + + ret +endfunc + +.macro def_horz_32 scale=0, shift=2, suffix +function inv_txfm_horz\suffix\()_dct_32x4_neon + mov x14, x30 + movi v7.4s, #0 + lsl x8, x8, #1 +.if \scale + movz w16, #2896*8, lsl #16 + dup v0.2s, w16 +.endif + +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 +.if \scale + scale_input .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct_4s_x16_neon + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 + transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 + transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5 + transpose_4x4s v28, v29, v30, v31, v2, v3, v4, v5 + +.macro store1 r0, r1, r2, r3 + st1 {\r0}, [x6], #16 + st1 {\r1}, [x6], #16 + st1 {\r2}, [x6], #16 + st1 {\r3}, [x6], #16 +.endm + store1 v16.4s, v20.4s, v24.4s, v28.4s + store1 v17.4s, v21.4s, v25.4s, v29.4s + store1 v18.4s, v22.4s, v26.4s, v30.4s + store1 v19.4s, v23.4s, v27.4s, v31.4s +.purgem store1 + sub x6, x6, #64*4 + + movi v7.4s, #0 +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + ld1 {\i}, [x7] + st1 {v7.4s}, [x7], x8 +.endr +.if \scale + // This relies on the fact that the idct also leaves the right coeff in v0.s[1] + scale_input .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31 +.endif + bl inv_dct32_odd_4s_x16_neon + transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 + transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 + transpose_4x4s v23, v22, v21, v20, v2, v3, v4, v5 + transpose_4x4s v19, v18, v17, v16, v2, v3, v4, v5 +.macro store2 r0, r1, r2, r3, shift + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6] + sqsub v4.4s, v0.4s, \r0 + sqadd v0.4s, v0.4s, \r0 + 
sqsub v5.4s, v1.4s, \r1 + sqadd v1.4s, v1.4s, \r1 + sqsub v6.4s, v2.4s, \r2 + sqadd v2.4s, v2.4s, \r2 + sqsub v7.4s, v3.4s, \r3 + sqadd v3.4s, v3.4s, \r3 + sqrshrn v0.4h, v0.4s, #\shift + sqrshrn2 v0.8h, v1.4s, #\shift + sqrshrn v1.4h, v2.4s, #\shift + sqrshrn2 v1.8h, v3.4s, #\shift + sqrshrn v2.4h, v7.4s, #\shift + sqrshrn2 v2.8h, v6.4s, #\shift + sqrshrn v3.4h, v5.4s, #\shift + sqrshrn2 v3.8h, v4.4s, #\shift + st1 {v0.8h, v1.8h}, [x6], #32 + rev64 v2.8h, v2.8h + rev64 v3.8h, v3.8h + st1 {v2.8h, v3.8h}, [x6], #32 +.endm + + store2 v31.4s, v27.4s, v23.4s, v19.4s, \shift + store2 v30.4s, v26.4s, v22.4s, v18.4s, \shift + store2 v29.4s, v25.4s, v21.4s, v17.4s, \shift + store2 v28.4s, v24.4s, v20.4s, v16.4s, \shift +.purgem store2 + br x14 +endfunc +.endm + +def_horz_32 scale=0, shift=2 +def_horz_32 scale=1, shift=1, suffix=_scale + +function inv_txfm_add_vert_dct_8x32_neon + mov x14, x30 + lsl x8, x8, #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + + bl X(inv_dct_8h_x16_neon) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + st1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + add x7, x7, x8, lsr #1 + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 + ld1 {v\i\().8h}, [x7], x8 +.endr + sub x7, x7, x8, lsl #4 + sub x7, x7, x8, lsr #1 + bl X(inv_dct32_odd_8h_x16_neon) + + neg x9, x8 + mov x10, x6 + movi v0.8h, #0 + mvni v1.8h, #0xfc, lsl #8 // 0x3ff +.macro combine r0, r1, r2, r3, op, stride + ld1 {v5.8h}, [x7], \stride + ld1 {v2.8h}, [x10], x1 + ld1 {v6.8h}, [x7], \stride + ld1 {v3.8h}, [x10], x1 + \op v5.8h, v5.8h, \r0 + ld1 {v7.8h}, [x7], \stride + ld1 {v4.8h}, [x10], x1 + srshr v5.8h, v5.8h, #4 + \op v6.8h, v6.8h, \r1 + sqadd v5.8h, v5.8h, v2.8h + srshr v6.8h, v6.8h, #4 + \op v7.8h, v7.8h, \r2 + smax v2.8h, v5.8h, v0.8h + ld1 {v5.8h}, [x7], \stride + sqadd v6.8h, v6.8h, v3.8h + smin v2.8h, v2.8h, v1.8h + srshr v7.8h, 
v7.8h, #4 + \op v5.8h, v5.8h, \r3 + st1 {v2.8h}, [x6], x1 + ld1 {v2.8h}, [x10], x1 + smax v3.8h, v6.8h, v0.8h + sqadd v7.8h, v7.8h, v4.8h + smin v3.8h, v3.8h, v1.8h + srshr v5.8h, v5.8h, #4 + st1 {v3.8h}, [x6], x1 + smax v4.8h, v7.8h, v0.8h + sqadd v5.8h, v5.8h, v2.8h + smin v4.8h, v4.8h, v1.8h + st1 {v4.8h}, [x6], x1 + smax v2.8h, v5.8h, v0.8h + smin v2.8h, v2.8h, v1.8h + st1 {v2.8h}, [x6], x1 +.endm + combine v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8 + combine v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8 + combine v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8 + combine v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8 + sub x7, x7, x8 + combine v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9 + combine v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9 + combine v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9 + combine v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9 +.purgem combine + + br x14 +endfunc + +const eob_32x32 + .short 10, 36, 78, 136, 210, 300, 406, 1024 +endconst + +const eob_16x32 + .short 10, 36, 78, 151, 215, 279, 343, 512 +endconst + +const eob_16x32_shortside + .short 10, 36, 78, 512 +endconst + +const eob_8x32 + .short 10, 43, 75, 107, 139, 171, 203, 256 +endconst + +function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1 + movi v0.8h, #0 + movi v1.8h, #0 + movrel x13, eob_32x32, 2 + + mov x8, #4*32 +1: + mov w9, #0 + movrel x12, eob_32x32, 2 +2: + add w9, w9, #8 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + 
sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + load_add_store_8x8 x0, x7, shiftbits=2 + ldrh w11, [x12], #4 + sub x0, x0, x1, lsl #3 + add x0, x0, #2*8 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #4 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw #1 + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #4*8 + b 1b +9: + ret +endfunc + +.macro shift_16_regs op, shift +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + \op \i, \i, #\shift +.endr +.endm + +.macro def_identity_1632 w, h, wshort, hshort +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + movz w16, #2896*8, lsl #16 + movz w17, #2*(5793-4096)*8, lsl #16 + movi v0.4s, #0 + movi v1.4s, #0 + movrel x13, eob_16x32\hshort, 2 + + mov x8, #4*\h +1: + mov w9, #0 + movrel x12, eob_16x32\wshort, 2 +2: + add w9, w9, #8 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + dup v2.2s, w16 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + mov v2.s[1], w17 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + scale_input .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + scale_input .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31 + +.if \w == 16 + // 16x32 + identity_4x16_shift1 v2.s[1] +.else + // 32x16 + shift_16_regs sqshl, 1 + identity_4x16 v2.s[1] +.endif + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, 
v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + +.if \w == 16 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=4 +.endif + ldrh w11, [x12], #4 + sub x0, x0, x1, lsl #3 + add x0, x0, #16 + cmp w3, w11 + b.ge 2b + + ldrh w11, [x13], #4 + cmp w3, w11 + b.lt 9f + + sub x0, x0, w9, uxtw #1 + add x0, x0, x1, lsl #3 + msub x2, x8, x9, x2 + add x2, x2, #4*8 + b 1b +9: + ret +endfunc +.endm + +def_identity_1632 16, 32, _shortside, +def_identity_1632 32, 16, , _shortside + +.macro def_identity_832 w, h +function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1 + movi v0.4s, #0 + movi v1.4s, #0 + // Working on 8x8 blocks, read every other entry from eob_8x32 + movrel x13, eob_8x32, 2 + + mov w8, #4*\h +1: + // Working on 8x8 blocks, read every other entry from eob_8x32 + ldrh w12, [x13], #4 + ld1 {v16.4s, v17.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v18.4s, v19.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v20.4s, v21.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v22.4s, v23.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v24.4s, v25.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v26.4s, v27.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v28.4s, v29.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + ld1 {v30.4s, v31.4s}, [x2] + st1 {v0.4s, v1.4s}, [x2], x8 + +.if \w == 8 + sqrshrn v16.4h, v16.4s, #1 + sqrshrn2 v16.8h, v17.4s, #1 + sqrshrn v17.4h, v18.4s, #1 + sqrshrn2 v17.8h, v19.4s, #1 + sqrshrn v18.4h, v20.4s, #1 + sqrshrn2 v18.8h, v21.4s, #1 + sqrshrn v19.4h, v22.4s, #1 + sqrshrn2 v19.8h, v23.4s, #1 + sqrshrn v20.4h, v24.4s, #1 + sqrshrn2 v20.8h, v25.4s, #1 + sqrshrn v21.4h, v26.4s, #1 + 
sqrshrn2 v21.8h, v27.4s, #1 + sqrshrn v22.4h, v28.4s, #1 + sqrshrn2 v22.8h, v29.4s, #1 + sqrshrn v23.4h, v30.4s, #1 + sqrshrn2 v23.8h, v31.4s, #1 +.else + sqxtn v16.4h, v16.4s + sqxtn2 v16.8h, v17.4s + sqxtn v17.4h, v18.4s + sqxtn2 v17.8h, v19.4s + sqxtn v18.4h, v20.4s + sqxtn2 v18.8h, v21.4s + sqxtn v19.4h, v22.4s + sqxtn2 v19.8h, v23.4s + sqxtn v20.4h, v24.4s + sqxtn2 v20.8h, v25.4s + sqxtn v21.4h, v26.4s + sqxtn2 v21.8h, v27.4s + sqxtn v22.4h, v28.4s + sqxtn2 v22.8h, v29.4s + sqxtn v23.4h, v30.4s + sqxtn2 v23.8h, v31.4s +.endif + + transpose_8x8h v16, v17, v18, v19, v20, v21, v22, v23, v4, v5 + + + cmp w3, w12 +.if \w == 8 + load_add_store_8x8 x0, x7, shiftbits=2 +.else + load_add_store_8x8 x0, x7, shiftbits=3 +.endif + + b.lt 9f +.if \w == 8 + sub x2, x2, x8, lsl #3 + add x2, x2, #4*8 +.else + sub x0, x0, x1, lsl #3 + add x0, x0, #2*8 +.endif + b 1b + +9: + ret +endfunc +.endm + +def_identity_832 8, 32 +def_identity_832 32, 8 + +function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1 + idct_dc 32, 32, 2 + + mov x15, x30 + sub sp, sp, #2048 + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, sp, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #2048 + br x15 +endfunc + +function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1 + idct_dc 16, 32, 1 + + mov x15, x30 + sub sp, sp, #1024 + movrel x13, eob_16x32 + ldrh w12, [x13], #2 + adr x4, inv_dct_4s_x16_neon + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, sp, #(\i*16*2) + add x7, x2, #(\i*4) +.if \i > 0 + 
mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endif + mov x8, #4*32 + bl inv_txfm_horz_scale_16x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #16*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, sp, #1024 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1 + idct_dc 32, 16, 1 + + mov x15, x30 + sub sp, sp, #1024 + + movrel x13, eob_16x32 + movrel x5, X(inv_dct_8h_x16_neon) + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*4) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + mov x8, #4*16 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x6, x0, #(\i*2) + add x7, sp, #(\i*2) + mov x8, #32*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, sp, #1024 + br x15 +endfunc + +function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1 + idct_dc 8, 32, 2 + + mov x15, x30 + sub sp, sp, #512 + + movrel x13, eob_8x32 + + movi v28.4s, #0 + mov x8, #4*32 + mov w9, #32 + mov x6, sp + mov x7, x2 +1: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().4s}, [x7] + st1 {v28.4s}, [x7], x8 +.endr + ldrh w12, [x13], #2 + sub w9, w9, #4 + sub x7, x7, x8, lsl #3 + add x7, x7, #4*4 + + bl inv_dct_4s_x8_neon + + sqrshrn v16.4h, v16.4s, #2 + sqrshrn v17.4h, v17.4s, #2 + sqrshrn v18.4h, v18.4s, #2 + sqrshrn v19.4h, v19.4s, #2 + sqrshrn2 v16.8h, v20.4s, #2 + sqrshrn2 v17.8h, v21.4s, #2 + sqrshrn2 v18.8h, v22.4s, #2 + sqrshrn2 v19.8h, v23.4s, #2 + + transpose_4x8h v16, v17, v18, v19, v2, v3, v4, v5 + + cmp w3, w12 + st1 
{v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 + + b.ge 1b + cbz w9, 3f + + movi v29.8h, #0 + movi v30.8h, #0 + movi v31.8h, #0 +2: + subs w9, w9, #4 + st1 {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64 + b.gt 2b + +3: + mov x6, x0 + mov x7, sp + mov x8, #8*2 + bl inv_txfm_add_vert_dct_8x32_neon + + add sp, sp, #512 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1 + idct_dc 32, 8, 2 + + mov x15, x30 + sub sp, sp, #512 + +.irp i, 0, 4 + add x6, sp, #(\i*32*2) + add x7, x2, #(\i*4) +.if \i > 0 + cmp w3, #10 + b.lt 1f +.endif + mov x8, #8*4 + bl inv_txfm_horz_dct_32x4_neon +.endr + b 2f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + +2: + mov x8, #2*32 + mov w9, #0 +1: + add x6, x0, x9, lsl #1 + add x7, sp, x9, lsl #1 // #(\i*2) + +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + ld1 {v\i\().8h}, [x7], x8 +.endr + add w9, w9, #8 + + bl X(inv_dct_8h_x8_neon) + + cmp w9, #32 + + load_add_store_8x8 x6, x7 + + b.lt 1b + + add sp, sp, #512 + br x15 +endfunc + +function inv_dct64_step1_neon + // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a + // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a + // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a + // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a + + ld1 {v0.4s, v1.4s}, [x17], #32 + + sqrdmulh v23.4s, v16.4s, v0.s[1] // t63a + sqrdmulh v16.4s, v16.4s, v0.s[0] // t32a + sqrdmulh v22.4s, v17.4s, v0.s[2] // t62a + sqrdmulh v17.4s, v17.4s, v0.s[3] // t33a + sqrdmulh v21.4s, v18.4s, v1.s[1] // t61a + sqrdmulh v18.4s, v18.4s, v1.s[0] // t34a + sqrdmulh v20.4s, v19.4s, v1.s[2] // t60a + sqrdmulh v19.4s, v19.4s, v1.s[3] // t35a + + ld1 {v0.4s}, [x17], #16 + + sqadd v24.4s, v16.4s, v17.4s // t32 + sqsub v25.4s, v16.4s, v17.4s // t33 + sqsub v26.4s, v19.4s, v18.4s // t34 + sqadd v27.4s, v19.4s, v18.4s // t35 + sqadd v28.4s, v20.4s, v21.4s // t60 + sqsub v29.4s, v20.4s, v21.4s // t61 + sqsub v30.4s, v23.4s, v22.4s // t62 + sqadd v31.4s, v23.4s, 
v22.4s // t63 + + mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a + mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a + neg v2.4s, v2.4s // t34a + mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a + srshr v26.4s, v2.4s, #12 // t34a + mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a + srshr v29.4s, v4.4s, #12 // t61a + srshr v25.4s, v6.4s, #12 // t33a + srshr v30.4s, v2.4s, #12 // t62a + + sqadd v16.4s, v24.4s, v27.4s // t32a + sqsub v19.4s, v24.4s, v27.4s // t35a + sqadd v17.4s, v25.4s, v26.4s // t33 + sqsub v18.4s, v25.4s, v26.4s // t34 + sqsub v20.4s, v31.4s, v28.4s // t60a + sqadd v23.4s, v31.4s, v28.4s // t63a + sqsub v21.4s, v30.4s, v29.4s // t61 + sqadd v22.4s, v30.4s, v29.4s // t62 + + mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a + mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a + mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60 + srshr v21.4s, v2.4s, #12 // t61a + srshr v18.4s, v4.4s, #12 // t34a + mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35 + srshr v20.4s, v6.4s, #12 // t60 + srshr v19.4s, v2.4s, #12 // t35 + + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64 + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64 + + ret +endfunc + +function inv_dct64_step2_neon + movrel x16, idct_coeffs + ld1 {v0.4s}, [x16] +1: + // t32a/33/34a/35/60/61a/62/63a + // t56a/57/58a/59/36/37a/38/39a + // t40a/41/42a/43/52/53a/54/55a + // t48a/49/50a/51/44/45a/46/47a + ldr q16, [x6, #4*4*0] // t32a + ldr q17, [x9, #4*4*8] // t39a + ldr q18, [x9, #4*4*0] // t63a + ldr q19, [x6, #4*4*8] // t56a + ldr q20, [x6, #4*4*16] // t40a + ldr q21, [x9, #4*4*24] // t47a + ldr q22, [x9, #4*4*16] // t55a + ldr q23, [x6, #4*4*24] // t48a + + sqadd v24.4s, v16.4s, v17.4s // t32 + sqsub v25.4s, v16.4s, v17.4s // t39 + sqadd v26.4s, v18.4s, v19.4s // t63 + sqsub v27.4s, v18.4s, v19.4s // t56 + sqsub v28.4s, v21.4s, v20.4s // t40 + sqadd v29.4s, v21.4s, v20.4s // t47 + sqadd v30.4s, v23.4s, v22.4s // t48 + sqsub v31.4s, v23.4s, v22.4s // t55 + + mul_mla v2, v27, v25, v0.s[3], v0.s[2] // 
-> t56a + mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a + mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a + srshr v25.4s, v2.4s, #12 // t56a + srshr v27.4s, v4.4s, #12 // t39a + neg v6.4s, v6.4s // t40a + mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a + srshr v31.4s, v6.4s, #12 // t40a + srshr v28.4s, v2.4s, #12 // t55a + + sqadd v16.4s, v24.4s, v29.4s // t32a + sqsub v19.4s, v24.4s, v29.4s // t47a + sqadd v17.4s, v27.4s, v31.4s // t39 + sqsub v18.4s, v27.4s, v31.4s // t40 + sqsub v20.4s, v26.4s, v30.4s // t48a + sqadd v23.4s, v26.4s, v30.4s // t63a + sqsub v21.4s, v25.4s, v28.4s // t55 + sqadd v22.4s, v25.4s, v28.4s // t56 + + mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a + mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a + mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47 + srshr v18.4s, v2.4s, #12 // t40a + srshr v21.4s, v4.4s, #12 // t55a + mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48 + srshr v19.4s, v6.4s, #12 // t47 + srshr v20.4s, v2.4s, #12 // t48 + + str q16, [x6, #4*4*0] // t32a + str q17, [x9, #4*4*0] // t39 + str q18, [x6, #4*4*8] // t40a + str q19, [x9, #4*4*8] // t47 + str q20, [x6, #4*4*16] // t48 + str q21, [x9, #4*4*16] // t55a + str q22, [x6, #4*4*24] // t56 + str q23, [x9, #4*4*24] // t63a + + add x6, x6, #4*4 + sub x9, x9, #4*4 + cmp x6, x9 + b.lt 1b + ret +endfunc + +.macro load8 src, strd, zero, clear +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s +.if \clear + ld1 {\i}, [\src] + st1 {\zero}, [\src], \strd +.else + ld1 {\i}, [\src], \strd +.endif +.endr +.endm + +.macro store16 dst +.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + st1 {\i}, [\dst], #16 +.endr +.endm + +.macro clear_upper8 +.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s + movi \i, #0 +.endr +.endm + +.macro movi_if reg, val, cond +.if \cond + movi \reg, \val +.endif +.endm + +.macro movz16dup_if reg, gpr, val, cond +.if \cond 
+ movz \gpr, \val, lsl #16 + dup \reg, \gpr +.endif +.endm + +.macro st1_if regs, dst, cond +.if \cond + st1 \regs, \dst +.endif +.endm + +.macro str_if reg, dst, cond +.if \cond + str \reg, \dst +.endif +.endm + +.macro stroff_if reg, dst, dstoff, cond +.if \cond + str \reg, \dst, \dstoff +.endif +.endm + +.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 +.if \cond + scale_input .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 +.endif +.endm + +.macro def_dct64_func suffix, clear=0, scale=0 +function inv_txfm_dct\suffix\()_4s_x64_neon + mov x14, x30 + mov x6, sp + lsl x8, x8, #2 + + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + load8 x7, x8, v7.4s, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + add x7, x7, x8, lsr #1 + scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct_4s_x16_neon + + store16 x6 + + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.8h, #0, \clear + load8 x7, x8, v7.4s, \clear + clear_upper8 + sub x7, x7, x8, lsl #3 + lsr x8, x8, #1 + sub x7, x7, x8, lsr #1 + scale_if \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 + + bl inv_dct32_odd_4s_x16_neon + + add x10, x6, #16*15 + sub x6, x6, #16*16 + + mov x9, #-16 + +.macro store_addsub r0, r1, r2, r3 + ld1 {v2.4s}, [x6], #16 + ld1 {v3.4s}, [x6], #16 + sqadd v6.4s, v2.4s, \r0 + sqsub \r0, v2.4s, \r0 + ld1 {v4.4s}, [x6], #16 + sqadd v7.4s, v3.4s, \r1 + sqsub \r1, v3.4s, \r1 + ld1 {v5.4s}, [x6], #16 + sqadd v2.4s, v4.4s, \r2 + sub x6, x6, #16*4 + sqsub \r2, v4.4s, \r2 + st1 {v6.4s}, [x6], #16 + st1 {\r0}, [x10], x9 + sqadd v3.4s, v5.4s, \r3 + sqsub \r3, v5.4s, \r3 + st1 {v7.4s}, [x6], #16 + st1 {\r1}, [x10], x9 + st1 {v2.4s}, [x6], #16 + st1 {\r2}, [x10], x9 + st1 {v3.4s}, [x6], #16 + st1 {\r3}, [x10], x9 +.endm + store_addsub v31.4s, v30.4s, v29.4s, v28.4s + store_addsub v27.4s, v26.4s, v25.4s, v24.4s + store_addsub v23.4s, v22.4s, v21.4s, v20.4s + store_addsub v19.4s, v18.4s, v17.4s, v16.4s +.purgem store_addsub + + add x6, x6, 
#4*4*16 + + movrel x17, idct64_coeffs + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + add x9, x7, x8, lsl #4 // offset 16 + add x10, x7, x8, lsl #3 // offset 8 + sub x9, x9, x8 // offset 15 + sub x11, x10, x8 // offset 7 + ld1 {v16.4s}, [x7] // in1 (offset 0) + ld1 {v17.4s}, [x9] // in31 (offset 15) + ld1 {v18.4s}, [x10] // in17 (offset 8) + ld1 {v19.4s}, [x11] // in15 (offset 7) + st1_if {v7.4s}, [x7], \clear + st1_if {v7.4s}, [x9], \clear + st1_if {v7.4s}, [x10], \clear + st1_if {v7.4s}, [x11], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + add x7, x7, x8, lsl #2 // offset 4 + sub x9, x9, x8, lsl #2 // offset 11 + sub x10, x7, x8 // offset 3 + add x11, x9, x8 // offset 12 + ld1 {v16.4s}, [x10] // in7 (offset 3) + ld1 {v17.4s}, [x11] // in25 (offset 12) + ld1 {v18.4s}, [x9] // in23 (offset 11) + ld1 {v19.4s}, [x7] // in9 (offset 4) + st1_if {v7.4s}, [x7], \clear + st1_if {v7.4s}, [x9], \clear + st1_if {v7.4s}, [x10], \clear + st1_if {v7.4s}, [x11], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + sub x10, x10, x8, lsl #1 // offset 1 + sub x9, x9, x8, lsl #1 // offset 9 + add x7, x7, x8 // offset 5 + add x11, x11, x8 // offset 13 + ldr q16, [x10, x8] // in5 (offset 2) + ldr q17, [x11] // in27 (offset 13) + ldr q18, [x9, x8] // in21 (offset 10) + ldr q19, [x7] // in11 (offset 5) + stroff_if q7, [x10, x8], \clear + str_if q7, [x11], \clear + stroff_if q7, [x9, x8], \clear + str_if q7, [x7], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + movz16dup_if v0.2s, w16, #2896*8, \scale + movi_if v7.4s, #0, \clear + ldr q16, [x10] // in3 (offset 1) + ldr q17, [x11, x8] // in29 (offset 14) + ldr q18, [x9] // in19 (offset 9) + ldr q19, [x7, x8] // in13 (offset 6) + str_if q7, [x10], \clear + stroff_if q7, [x11, 
x8], \clear + str_if q7, [x9], \clear + stroff_if q7, [x7, x8], \clear + scale_if \scale, v0.s[0], v16, v17, v18, v19 + bl inv_dct64_step1_neon + + sub x6, x6, #4*4*32 + add x9, x6, #4*4*7 + + bl inv_dct64_step2_neon + + br x14 +endfunc +.endm + +def_dct64_func _clear, clear=1 +def_dct64_func _clear_scale, clear=1, scale=1 + + +function inv_txfm_horz_dct_64x4_neon + mov x14, x30 + + mov x7, sp + add x8, sp, #4*4*(64 - 4) + add x9, x6, #2*56 + mov x10, #2*64 + mov x11, #-4*4*4 + + dup v7.4s, w12 +1: + ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64 + ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11 + ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64 + ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11 + transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5 + transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5 + transpose_4x4s v31, v30, v29, v28, v2, v3, v4, v5 + transpose_4x4s v27, v26, v25, v24, v2, v3, v4, v5 + +.macro store_addsub src0, src1, src2, src3 + sqsub v1.4s, \src0, \src1 + sqadd v0.4s, \src0, \src1 + sqsub v3.4s, \src2, \src3 + srshl v1.4s, v1.4s, v7.4s + sqadd v2.4s, \src2, \src3 + srshl v3.4s, v3.4s, v7.4s + srshl v0.4s, v0.4s, v7.4s + srshl v2.4s, v2.4s, v7.4s + sqxtn v3.4h, v3.4s + sqxtn2 v3.8h, v1.4s + sqxtn v0.4h, v0.4s + sqxtn2 v0.8h, v2.4s + rev64 v3.8h, v3.8h + st1 {v0.8h}, [x6], x10 + st1 {v3.8h}, [x9], x10 +.endm + store_addsub v16.4s, v31.4s, v20.4s, v27.4s + store_addsub v17.4s, v30.4s, v21.4s, v26.4s + store_addsub v18.4s, v29.4s, v22.4s, v25.4s + store_addsub v19.4s, v28.4s, v23.4s, v24.4s +.purgem store_addsub + sub x6, x6, x10, lsl #2 + sub x9, x9, x10, lsl #2 + add x6, x6, #16 + sub x9, x9, #16 + + cmp x7, x8 + b.lt 1b + br x14 +endfunc + +function inv_txfm_add_vert_dct_8x64_neon + mov x14, x30 + lsl x8, x8, #1 + + mov x7, sp + add x8, sp, #2*8*(64 - 4) + add x9, x6, x1, lsl #6 + sub x9, x9, x1 + neg x10, x1 + mov x11, #-2*8*4 + +1: + ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64 + ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11 + ld1 {v20.8h, 
v21.8h, v22.8h, v23.8h}, [x7], #64 + ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11 + + movi v6.8h, #0 + mvni v7.8h, #0xfc, lsl #8 // 0x3ff +.macro add_dest_addsub src0, src1, src2, src3 + ld1 {v0.8h}, [x6], x1 + ld1 {v1.8h}, [x9], x10 + sqadd v4.8h, \src0, \src1 + ld1 {v2.8h}, [x6] + sqsub \src0, \src0, \src1 + ld1 {v3.8h}, [x9] + sqadd v5.8h, \src2, \src3 + sqsub \src2, \src2, \src3 + sub x6, x6, x1 + sub x9, x9, x10 + srshr v4.8h, v4.8h, #4 + srshr v5.8h, v5.8h, #4 + srshr \src0, \src0, #4 + sqadd v0.8h, v0.8h, v4.8h + srshr \src2, \src2, #4 + sqadd v1.8h, v1.8h, \src0 + sqadd v2.8h, v2.8h, v5.8h + smax v0.8h, v0.8h, v6.8h + sqadd v3.8h, v3.8h, \src2 + smax v1.8h, v1.8h, v6.8h + smin v0.8h, v0.8h, v7.8h + smax v2.8h, v2.8h, v6.8h + smin v1.8h, v1.8h, v7.8h + st1 {v0.8h}, [x6], x1 + smax v3.8h, v3.8h, v6.8h + smin v2.8h, v2.8h, v7.8h + st1 {v1.8h}, [x9], x10 + smin v3.8h, v3.8h, v7.8h + st1 {v2.8h}, [x6], x1 + st1 {v3.8h}, [x9], x10 +.endm + add_dest_addsub v16.8h, v31.8h, v17.8h, v30.8h + add_dest_addsub v18.8h, v29.8h, v19.8h, v28.8h + add_dest_addsub v20.8h, v27.8h, v21.8h, v26.8h + add_dest_addsub v22.8h, v25.8h, v23.8h, v24.8h +.purgem add_dest_addsub + cmp x7, x8 + b.lt 1b + + br x14 +endfunc + +.macro sub_sp space +#ifdef _WIN32 +.if \space > 8192 + // Here, we'd need to touch two (or more) pages while decrementing + // the stack pointer. 
+ .error "sub_sp_align doesn't support values over 8K at the moment" +.elseif \space > 4096 + sub x16, sp, #4096 + ldr xzr, [x16] + sub sp, x16, #(\space - 4096) +.else + sub sp, sp, #\space +.endif +#else +.if \space >= 4096 + sub sp, sp, #(\space)/4096*4096 +.endif +.if (\space % 4096) != 0 + sub sp, sp, #(\space)%4096 +.endif +#endif +.endm + +function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1 + idct_dc 64, 64, 2 + + mov x15, x30 + + sub_sp 64*32*2+64*4*4 + add x5, sp, #64*4*4 + + movrel x13, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_4s_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x7, x5, #(\i*2) + mov x8, #64*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #64*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1 + idct_dc 64, 32, 1 + + mov x15, x30 + + sub_sp 64*32*2+64*4*4 + add x5, sp, #64*4*4 + + movrel x13, eob_32x32 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*64*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + mov x12, #-1 // shift + bl inv_txfm_dct_clear_scale_4s_x64_neon + add x6, x5, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 28 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, 
#(\i*2) + add x7, x5, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_dct_8x32_neon +.endr + + add sp, x5, #64*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1 + idct_dc 32, 64, 1 + + mov x15, x30 + + sub_sp 32*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, eob_32x32 + ldrh w12, [x13], #2 + +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*32*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_scale_dct_32x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8, 16, 24 + add x7, x5, #(\i*2) + mov x8, #32*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #32*32*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1 + idct_dc 64, 16, 2 + + mov x15, x30 + + sub_sp 64*16*2+64*4*4 + add x4, sp, #64*4*4 + + movrel x13, eob_16x32 + +.irp i, 0, 4, 8, 12 + add x6, x4, #(\i*64*2) +.if \i > 0 + mov w8, #(16 - \i) + cmp w3, w12 + b.lt 1f +.endif + add x7, x2, #(\i*4) + mov x8, #16*4 + mov x12, #-2 // shift + bl inv_txfm_dct_clear_4s_x64_neon + add x6, x4, #(\i*64*2) + bl inv_txfm_horz_dct_64x4_neon +.if \i < 12 + ldrh w12, [x13], #2 +.endif +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #2 +.rept 4 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: + movrel x5, X(inv_dct_8h_x16_neon) +.irp i, 0, 8, 16, 24, 32, 40, 48, 56 + add x6, x0, #(\i*2) + add x7, x4, #(\i*2) + mov x8, #64*2 + bl inv_txfm_add_vert_8x16_neon +.endr + + add sp, x4, #64*16*2 + br x15 +endfunc + +function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1 + idct_dc 16, 64, 2 + + mov x15, x30 + + sub_sp 16*32*2+64*8*2 + add x5, sp, #64*8*2 + + movrel x13, 
eob_16x32 + ldrh w12, [x13], #2 + + adr x4, inv_dct_4s_x16_neon +.irp i, 0, 4, 8, 12, 16, 20, 24, 28 + add x6, x5, #(\i*16*2) +.if \i > 0 + mov w8, #(32 - \i) + cmp w3, w12 + b.lt 1f + ldrh w12, [x13], #2 +.endif + add x7, x2, #(\i*4) + mov x8, #32*4 + bl inv_txfm_horz_16x4_neon +.endr + b 3f + +1: + movi v4.8h, #0 + movi v5.8h, #0 + movi v6.8h, #0 + movi v7.8h, #0 +2: + subs w8, w8, #4 +.rept 2 + st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64 +.endr + b.gt 2b + +3: +.irp i, 0, 8 + add x7, x5, #(\i*2) + mov x8, #16*2 + bl X(inv_txfm_dct_8h_x64_neon) + add x6, x0, #(\i*2) + bl inv_txfm_add_vert_dct_8x64_neon +.endr + + add sp, x5, #16*32*2 + br x15 +endfunc diff --git a/ffmpeg/JNI/dav1d/src/arm/64/mc.S b/ffmpeg/JNI/dav1d/src/arm/64/mc.S index 92aa8aa81..f6970de3c 100644 --- a/ffmpeg/JNI/dav1d/src/arm/64/mc.S +++ b/ffmpeg/JNI/dav1d/src/arm/64/mc.S @@ -3089,3 +3089,161 @@ endfunc warp , 11 warp t, 7 + +// void dav1d_emu_edge_8bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *ref, const ptrdiff_t ref_stride) +function emu_edge_8bpc_neon, export=1 + ldp x8, x9, [sp] + + // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + // ref += iclip(x, 0, iw - 1) + sub x12, x3, #1 // ih - 1 + cmp x5, x3 + sub x13, x2, #1 // iw - 1 + csel x12, x12, x5, ge // min(y, ih - 1) + cmp x4, x2 + bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) + csel x13, x13, x4, ge // min(x, iw - 1) + bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) + madd x8, x12, x9, x8 // ref += iclip() * stride + add x8, x8, x13 // ref += iclip() + + // bottom_ext = iclip(y + bh - ih, 0, bh - 1) + // top_ext = iclip(-y, 0, bh - 1) + add x10, x5, x1 // y + bh + neg x5, x5 // -y + sub x10, x10, x3 // y + bh - ih + sub x12, x1, #1 // bh - 1 + cmp x10, x1 + bic x5, x5, x5, asr #63 // max(-y, 0) + csel x10, x10, x12, lt // min(y + bh - ih, bh-1) + cmp x5, x1 + bic x10, x10, 
x10, asr #63 // max(min(y + bh - ih, bh-1), 0) + csel x5, x5, x12, lt // min(max(-y, 0), bh-1) + + // right_ext = iclip(x + bw - iw, 0, bw - 1) + // left_ext = iclip(-x, 0, bw - 1) + add x11, x4, x0 // x + bw + neg x4, x4 // -x + sub x11, x11, x2 // x + bw - iw + sub x13, x0, #1 // bw - 1 + cmp x11, x0 + bic x4, x4, x4, asr #63 // max(-x, 0) + csel x11, x11, x13, lt // min(x + bw - iw, bw-1) + cmp x4, x0 + bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) + csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) + + // center_h = bh - top_ext - bottom_ext + // dst += top_ext * PXSTRIDE(dst_stride) + // center_w = bw - left_ext - right_ext + sub x1, x1, x5 // bh - top_ext + madd x6, x5, x7, x6 + sub x2, x0, x4 // bw - left_ext + sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext + sub x2, x2, x11 // center_w = bw - left_ext - right_ext + + mov x14, x6 // backup of dst + +.macro v_loop need_left, need_right +0: +.if \need_left + ld1r {v0.16b}, [x8] + mov x12, x6 // out = dst + mov x3, x4 +1: + subs x3, x3, #16 + st1 {v0.16b}, [x12], #16 + b.gt 1b +.endif + mov x13, x8 + add x12, x6, x4 // out = dst + left_ext + mov x3, x2 +1: + ld1 {v0.16b, v1.16b}, [x13], #32 + subs x3, x3, #32 + st1 {v0.16b, v1.16b}, [x12], #32 + b.gt 1b +.if \need_right + add x3, x8, x2 // in + center_w + sub x3, x3, #1 // in + center_w - 1 + add x12, x6, x4 // dst + left_ext + ld1r {v0.16b}, [x3] + add x12, x12, x2 // out = dst + left_ext + center_w + mov x3, x11 +1: + subs x3, x3, #16 + st1 {v0.16b}, [x12], #16 + b.gt 1b +.endif + + subs x1, x1, #1 // center_h-- + add x6, x6, x7 + add x8, x8, x9 + b.gt 0b +.endm + + cbz x4, 2f + // need_left + cbz x11, 3f + // need_left + need_right + v_loop 1, 1 + b 5f + +2: + // !need_left + cbz x11, 4f + // !need_left + need_right + v_loop 0, 1 + b 5f + +3: + // need_left + !need_right + v_loop 1, 0 + b 5f + +4: + // !need_left + !need_right + v_loop 0, 0 + +5: + + cbz x10, 3f + // need_bottom + sub x8, x6, x7 // ref = dst - stride + mov x4, x0 +1: + 
ld1 {v0.16b, v1.16b}, [x8], #32 + mov x3, x10 +2: + subs x3, x3, #1 + st1 {v0.16b, v1.16b}, [x6], x7 + b.gt 2b + msub x6, x7, x10, x6 // dst -= bottom_ext * stride + subs x4, x4, #32 // bw -= 32 + add x6, x6, #32 // dst += 32 + b.gt 1b + +3: + cbz x5, 3f + // need_top + msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride +1: + ld1 {v0.16b, v1.16b}, [x14], #32 + mov x3, x5 +2: + subs x3, x3, #1 + st1 {v0.16b, v1.16b}, [x6], x7 + b.gt 2b + msub x6, x7, x5, x6 // dst -= top_ext * stride + subs x0, x0, #32 // bw -= 32 + add x6, x6, #32 // dst += 32 + b.gt 1b + +3: + ret +endfunc diff --git a/ffmpeg/JNI/dav1d/src/arm/64/mc16.S b/ffmpeg/JNI/dav1d/src/arm/64/mc16.S index 5fbc3989c..7ac186302 100644 --- a/ffmpeg/JNI/dav1d/src/arm/64/mc16.S +++ b/ffmpeg/JNI/dav1d/src/arm/64/mc16.S @@ -3407,3 +3407,163 @@ endfunc warp warp t + +// void dav1d_emu_edge_16bpc_neon( +// const intptr_t bw, const intptr_t bh, +// const intptr_t iw, const intptr_t ih, +// const intptr_t x, const intptr_t y, +// pixel *dst, const ptrdiff_t dst_stride, +// const pixel *ref, const ptrdiff_t ref_stride) +function emu_edge_16bpc_neon, export=1 + ldp x8, x9, [sp] + + // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride) + // ref += iclip(x, 0, iw - 1) + sub x12, x3, #1 // ih - 1 + cmp x5, x3 + sub x13, x2, #1 // iw - 1 + csel x12, x12, x5, ge // min(y, ih - 1) + cmp x4, x2 + bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0) + csel x13, x13, x4, ge // min(x, iw - 1) + bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0) + madd x8, x12, x9, x8 // ref += iclip() * stride + add x8, x8, x13, lsl #1 // ref += iclip() + + // bottom_ext = iclip(y + bh - ih, 0, bh - 1) + // top_ext = iclip(-y, 0, bh - 1) + add x10, x5, x1 // y + bh + neg x5, x5 // -y + sub x10, x10, x3 // y + bh - ih + sub x12, x1, #1 // bh - 1 + cmp x10, x1 + bic x5, x5, x5, asr #63 // max(-y, 0) + csel x10, x10, x12, lt // min(y + bh - ih, bh-1) + cmp x5, x1 + bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0) + csel x5, x5, 
x12, lt // min(max(-y, 0), bh-1) + + // right_ext = iclip(x + bw - iw, 0, bw - 1) + // left_ext = iclip(-x, 0, bw - 1) + add x11, x4, x0 // x + bw + neg x4, x4 // -x + sub x11, x11, x2 // x + bw - iw + sub x13, x0, #1 // bw - 1 + cmp x11, x0 + bic x4, x4, x4, asr #63 // max(-x, 0) + csel x11, x11, x13, lt // min(x + bw - iw, bw-1) + cmp x4, x0 + bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0) + csel x4, x4, x13, lt // min(max(-x, 0), bw - 1) + + // center_h = bh - top_ext - bottom_ext + // dst += top_ext * PXSTRIDE(dst_stride) + // center_w = bw - left_ext - right_ext + sub x1, x1, x5 // bh - top_ext + madd x6, x5, x7, x6 + sub x2, x0, x4 // bw - left_ext + sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext + sub x2, x2, x11 // center_w = bw - left_ext - right_ext + + mov x14, x6 // backup of dst + +.macro v_loop need_left, need_right +0: +.if \need_left + ld1r {v0.8h}, [x8] + mov x12, x6 // out = dst + mov x3, x4 + mov v1.16b, v0.16b +1: + subs x3, x3, #16 + st1 {v0.8h, v1.8h}, [x12], #32 + b.gt 1b +.endif + mov x13, x8 + add x12, x6, x4, lsl #1 // out = dst + left_ext + mov x3, x2 +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64 + subs x3, x3, #32 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64 + b.gt 1b +.if \need_right + add x3, x8, x2, lsl #1 // in + center_w + sub x3, x3, #2 // in + center_w - 1 + add x12, x6, x4, lsl #1 // dst + left_ext + ld1r {v0.8h}, [x3] + add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w + mov x3, x11 + mov v1.16b, v0.16b +1: + subs x3, x3, #16 + st1 {v0.8h, v1.8h}, [x12], #32 + b.gt 1b +.endif + + subs x1, x1, #1 // center_h-- + add x6, x6, x7 + add x8, x8, x9 + b.gt 0b +.endm + + cbz x4, 2f + // need_left + cbz x11, 3f + // need_left + need_right + v_loop 1, 1 + b 5f + +2: + // !need_left + cbz x11, 4f + // !need_left + need_right + v_loop 0, 1 + b 5f + +3: + // need_left + !need_right + v_loop 1, 0 + b 5f + +4: + // !need_left + !need_right + v_loop 0, 0 + +5: + + cbz x10, 3f + // need_bottom + sub x8, 
x6, x7 // ref = dst - stride + mov x4, x0 +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64 + mov x3, x10 +2: + subs x3, x3, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 + b.gt 2b + msub x6, x7, x10, x6 // dst -= bottom_ext * stride + subs x4, x4, #32 // bw -= 32 + add x6, x6, #64 // dst += 32 + b.gt 1b + +3: + cbz x5, 3f + // need_top + msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride +1: + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64 + mov x3, x5 +2: + subs x3, x3, #1 + st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7 + b.gt 2b + msub x6, x7, x5, x6 // dst -= top_ext * stride + subs x0, x0, #32 // bw -= 32 + add x6, x6, #64 // dst += 32 + b.gt 1b + +3: + ret +endfunc diff --git a/ffmpeg/JNI/dav1d/src/arm/64/msac.S b/ffmpeg/JNI/dav1d/src/arm/64/msac.S index 31cc46f89..3a6cf900a 100644 --- a/ffmpeg/JNI/dav1d/src/arm/64/msac.S +++ b/ffmpeg/JNI/dav1d/src/arm/64/msac.S @@ -118,9 +118,9 @@ endconst .endm .macro str_n idx0, idx1, dstreg, dstoff, n - str q\idx0, [\dstreg, \dstoff] + str \idx0, [\dstreg, \dstoff] .if \n == 16 - str q\idx1, [\dstreg, \dstoff + 16] + str \idx1, [\dstreg, \dstoff + 16] .endif .endm @@ -150,7 +150,7 @@ function msac_decode_symbol_adapt4_neon, export=1 ld1r {v6.8h}, [x8] // dif >> (EC_WIN_SIZE - 16) movrel x8, bits - str_n 4, 5, sp, #16, \n // store v values to allow indexed access + str_n q4, q5, sp, #16, \n // store v values to allow indexed access ld1_n v16, v17, x8, .8h, \n @@ -185,7 +185,7 @@ function msac_decode_symbol_adapt4_neon, export=1 sbc w4, w4, w14 // -((count >> 4) + (n_symbols > 2) + 4) .endif sub_n v4, v5, v4, v5, v0, v1, \sz, \n // (32768 - cdf[i]) or (-1 - cdf[i]) - dup v6.8h, w4 // -rate + dup v6\sz, w4 // -rate sub w3, w3, w3, lsr #5 // count - (count == 32) sub_n v0, v1, v0, v1, v2, v3, \sz, \n // cdf + (i >= val ? 
1 : 0) @@ -216,7 +216,7 @@ L(renorm2): lsl x7, x7, x5 // (~dif + (v << 48)) << d str w4, [x0, #RNG] mvn x7, x7 // ~dif - b.ge 9f + b.hs 9f // refill ldp x3, x4, [x0] // BUF_POS, BUF_END @@ -274,6 +274,128 @@ function msac_decode_symbol_adapt16_neon, export=1 b L(renorm) endfunc +function msac_decode_hi_tok_neon, export=1 + ld1 {v0.4h}, [x1] // cdf + add x16, x0, #RNG + movi v31.4h, #0x7f, lsl #8 // 0x7f00 + movrel x17, coeffs, 30-2*3 + mvni v30.4h, #0x3f // 0xffc0 + ldrh w9, [x1, #6] // count = cdf[n_symbols] + ld1r {v3.4h}, [x16] // rng + movrel x16, bits + ld1 {v29.4h}, [x17] // EC_MIN_PROB * (n_symbols - ret) + add x17, x0, #DIF + 6 + ld1 {v16.8h}, [x16] + mov w13, #-24 + and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 + ldr w10, [x0, #ALLOW_UPDATE_CDF] + ld1r {v1.8h}, [x17] // dif >> (EC_WIN_SIZE - 16) + sub sp, sp, #48 + ldr w6, [x0, #CNT] + ldr x7, [x0, #DIF] +1: + and v7.8b, v3.8b, v31.8b // rng & 0x7f00 + sqdmulh v6.4h, v17.4h, v7.4h // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1 + add v4.4h, v17.4h, v29.4h // v = cdf + EC_MIN_PROB * (n_symbols - ret) + add v4.4h, v6.4h, v4.4h // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret) + str h3, [sp, #14] // store original u = s->rng + cmhs v2.8h, v1.8h, v4.8h // c >= v + str q4, [sp, #16] // store v values to allow indexed access + and v6.16b, v2.16b, v16.16b // One bit per halfword set in the mask + addv h6, v6.8h // Aggregate mask bits + umov w3, v6.h[0] + add w13, w13, #5 + rbit w3, w3 + add x8, sp, #16 + clz w15, w3 // ret + + cbz w10, 2f + // update_cdf + movi v5.8b, #0xff + mov w4, #-5 + urhadd v4.4h, v5.4h, v2.4h // i >= val ? -1 : 32768 + sub w4, w4, w9, lsr #4 // -((count >> 4) + 5) + sub v4.4h, v4.4h, v0.4h // (32768 - cdf[i]) or (-1 - cdf[i]) + dup v6.4h, w4 // -rate + + sub w9, w9, w9, lsr #5 // count - (count == 32) + sub v0.4h, v0.4h, v2.4h // cdf + (i >= val ? 
1 : 0) + sshl v4.4h, v4.4h, v6.4h // ({32768,-1} - cdf[i]) >> rate + add w9, w9, #1 // count + (count < 32) + add v0.4h, v0.4h, v4.4h // cdf + (32768 - cdf[i]) >> rate + st1 {v0.4h}, [x1] + and v17.8b, v0.8b, v30.8b // cdf & 0xffc0 + strh w9, [x1, #6] + +2: + add x8, x8, w15, uxtw #1 + ldrh w3, [x8] // v + ldurh w4, [x8, #-2] // u + sub w4, w4, w3 // rng = u - v + clz w5, w4 // clz(rng) + eor w5, w5, #16 // d = clz(rng) ^ 16 + mvn x7, x7 // ~dif + add x7, x7, x3, lsl #48 // ~dif + (v << 48) + lsl w4, w4, w5 // rng << d + subs w6, w6, w5 // cnt -= d + lsl x7, x7, x5 // (~dif + (v << 48)) << d + str w4, [x0, #RNG] + dup v3.4h, w4 + mvn x7, x7 // ~dif + b.hs 9f + + // refill + ldp x3, x4, [x0] // BUF_POS, BUF_END + add x5, x3, #8 + cmp x5, x4 + b.gt 2f + + ldr x3, [x3] // next_bits + add w8, w6, #23 // shift_bits = cnt + 23 + add w6, w6, #16 // cnt += 16 + rev x3, x3 // next_bits = bswap(next_bits) + sub x5, x5, x8, lsr #3 // buf_pos -= shift_bits >> 3 + and w8, w8, #24 // shift_bits &= 24 + lsr x3, x3, x8 // next_bits >>= shift_bits + sub w8, w8, w6 // shift_bits -= 16 + cnt + str x5, [x0, #BUF_POS] + lsl x3, x3, x8 // next_bits <<= shift_bits + mov w4, #48 + sub w6, w4, w8 // cnt = cnt + 64 - shift_bits + eor x7, x7, x3 // dif ^= next_bits + b 9f + +2: // refill_eob + mov w14, #40 + sub w5, w14, w6 // c = 40 - cnt +3: + cmp x3, x4 + b.ge 4f + ldrb w8, [x3], #1 + lsl x8, x8, x5 + eor x7, x7, x8 + subs w5, w5, #8 + b.ge 3b + +4: // refill_eob_end + str x3, [x0, #BUF_POS] + sub w6, w14, w5 // cnt = 40 - c + +9: + lsl w15, w15, #1 + sub w15, w15, #5 + lsr x12, x7, #48 + adds w13, w13, w15 // carry = tok_br < 3 || tok == 15 + dup v1.8h, w12 + b.cc 1b // loop if !carry + add w13, w13, #30 + str w6, [x0, #CNT] + add sp, sp, #48 + str x7, [x0, #DIF] + lsr w0, w13, #1 + ret +endfunc + function msac_decode_bool_equi_neon, export=1 ldp w5, w6, [x0, #RNG] // + CNT sub sp, sp, #48 diff --git a/ffmpeg/JNI/dav1d/src/arm/64/util.S b/ffmpeg/JNI/dav1d/src/arm/64/util.S index 
3332c8522..fc0e0d04f 100644 --- a/ffmpeg/JNI/dav1d/src/arm/64/util.S +++ b/ffmpeg/JNI/dav1d/src/arm/64/util.S @@ -170,6 +170,18 @@ trn2 \r3\().2s, \t5\().2s, \t7\().2s .endm +.macro transpose_4x4s r0, r1, r2, r3, t4, t5, t6, t7 + trn1 \t4\().4s, \r0\().4s, \r1\().4s + trn2 \t5\().4s, \r0\().4s, \r1\().4s + trn1 \t6\().4s, \r2\().4s, \r3\().4s + trn2 \t7\().4s, \r2\().4s, \r3\().4s + + trn1 \r0\().2d, \t4\().2d, \t6\().2d + trn2 \r2\().2d, \t4\().2d, \t6\().2d + trn1 \r1\().2d, \t5\().2d, \t7\().2d + trn2 \r3\().2d, \t5\().2d, \t7\().2d +.endm + .macro transpose_4x8h r0, r1, r2, r3, t4, t5, t6, t7 trn1 \t4\().8h, \r0\().8h, \r1\().8h trn2 \t5\().8h, \r0\().8h, \r1\().8h diff --git a/ffmpeg/JNI/dav1d/src/arm/asm.S b/ffmpeg/JNI/dav1d/src/arm/asm.S index 6b1d46fcd..1cd0955d4 100644 --- a/ffmpeg/JNI/dav1d/src/arm/asm.S +++ b/ffmpeg/JNI/dav1d/src/arm/asm.S @@ -93,6 +93,7 @@ .global EXTERN\name #ifdef __ELF__ .type EXTERN\name, %function + .hidden EXTERN\name #endif #if HAVE_AS_FUNC .func EXTERN\name @@ -109,7 +110,7 @@ EXTERN\name: \name: .endm -.macro const name, align=2 +.macro const name, export=0, align=2 .macro endconst #ifdef __ELF__ .size \name, . 
- \name @@ -124,6 +125,13 @@ EXTERN\name: .const_data #endif .align \align + .if \export + .global EXTERN\name +#ifdef __ELF__ + .hidden EXTERN\name +#endif +EXTERN\name: + .endif \name: .endm @@ -135,4 +143,9 @@ EXTERN\name: #define X(x) CONCAT(EXTERN, x) +#if ARCH_AARCH64 +#define x18 do_not_use_x18 +#define w18 do_not_use_w18 +#endif + #endif /* DAV1D_SRC_ARM_ASM_S */ diff --git a/ffmpeg/JNI/dav1d/src/arm/ipred_init_tmpl.c b/ffmpeg/JNI/dav1d/src/arm/ipred_init_tmpl.c index 5b3eb07b4..e42ceaf1f 100644 --- a/ffmpeg/JNI/dav1d/src/arm/ipred_init_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/arm/ipred_init_tmpl.c @@ -27,56 +27,56 @@ #include "src/cpu.h" #include "src/ipred.h" -decl_angular_ipred_fn(dav1d_ipred_dc_neon); -decl_angular_ipred_fn(dav1d_ipred_dc_128_neon); -decl_angular_ipred_fn(dav1d_ipred_dc_top_neon); -decl_angular_ipred_fn(dav1d_ipred_dc_left_neon); -decl_angular_ipred_fn(dav1d_ipred_h_neon); -decl_angular_ipred_fn(dav1d_ipred_v_neon); -decl_angular_ipred_fn(dav1d_ipred_paeth_neon); -decl_angular_ipred_fn(dav1d_ipred_smooth_neon); -decl_angular_ipred_fn(dav1d_ipred_smooth_v_neon); -decl_angular_ipred_fn(dav1d_ipred_smooth_h_neon); -decl_angular_ipred_fn(dav1d_ipred_filter_neon); +decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_h, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_v, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon)); +decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon)); -decl_cfl_pred_fn(dav1d_ipred_cfl_neon); -decl_cfl_pred_fn(dav1d_ipred_cfl_128_neon); -decl_cfl_pred_fn(dav1d_ipred_cfl_top_neon); -decl_cfl_pred_fn(dav1d_ipred_cfl_left_neon); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl, 
neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon)); +decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon)); -decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_neon); -decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_neon); +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon)); +decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon)); -decl_pal_pred_fn(dav1d_pal_pred_neon); +decl_pal_pred_fn(BF(dav1d_pal_pred, neon)); COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) { const unsigned flags = dav1d_get_cpu_flags(); if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 - c->intra_pred[DC_PRED] = dav1d_ipred_dc_neon; - c->intra_pred[DC_128_PRED] = dav1d_ipred_dc_128_neon; - c->intra_pred[TOP_DC_PRED] = dav1d_ipred_dc_top_neon; - c->intra_pred[LEFT_DC_PRED] = dav1d_ipred_dc_left_neon; - c->intra_pred[HOR_PRED] = dav1d_ipred_h_neon; - c->intra_pred[VERT_PRED] = dav1d_ipred_v_neon; +#if BITDEPTH == 8 || ARCH_AARCH64 + c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon); + c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon); + c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon); + c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon); + c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon); + c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon); #if ARCH_AARCH64 - c->intra_pred[PAETH_PRED] = dav1d_ipred_paeth_neon; - c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_neon; - c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon; - c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_neon; - c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_neon; + c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon); + c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon); + c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon); + c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon); + c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon); - c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_neon; - 
c->cfl_pred[DC_128_PRED] = dav1d_ipred_cfl_128_neon; - c->cfl_pred[TOP_DC_PRED] = dav1d_ipred_cfl_top_neon; - c->cfl_pred[LEFT_DC_PRED] = dav1d_ipred_cfl_left_neon; + c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon); + c->cfl_pred[DC_128_PRED] = BF(dav1d_ipred_cfl_128, neon); + c->cfl_pred[TOP_DC_PRED] = BF(dav1d_ipred_cfl_top, neon); + c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon); - c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_neon; - c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_neon; + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon); + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon); - c->pal_pred = dav1d_pal_pred_neon; + c->pal_pred = BF(dav1d_pal_pred, neon); #endif #endif } diff --git a/ffmpeg/JNI/dav1d/src/arm/itx_init_tmpl.c b/ffmpeg/JNI/dav1d/src/arm/itx_init_tmpl.c index f9c68e9eb..ad418f2db 100644 --- a/ffmpeg/JNI/dav1d/src/arm/itx_init_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/arm/itx_init_tmpl.c @@ -29,32 +29,32 @@ #include "src/itx.h" #define decl_itx2_fns(w, h, opt) \ -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt)) #define decl_itx12_fns(w, h, opt) \ decl_itx2_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \ 
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt)) #define decl_itx16_fns(w, h, opt) \ decl_itx12_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \ -decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \ +decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt)) #define decl_itx17_fns(w, h, opt) \ decl_itx16_fns(w, h, opt); \ -decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt) +decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt)) decl_itx17_fns( 4, 4, neon); decl_itx16_fns( 4, 8, neon); @@ -71,16 +71,16 @@ decl_itx2_fns (32, 8, neon); decl_itx2_fns (32, 16, neon); decl_itx2_fns (32, 32, neon); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_neon); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_neon); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_neon); -decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_neon); 
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_neon); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon)); +decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon)); -COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c) { +COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc) { #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \ c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \ - dav1d_inv_txfm_add_##type##_##w##x##h##_##ext + BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext) #define assign_itx1_fn(pfx, w, h, ext) \ assign_itx_fn(pfx, w, h, dct_dct, DCT_DCT, ext) @@ -117,7 +117,9 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c) { if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; -#if BITDEPTH == 8 && ARCH_AARCH64 + if (bpc > 10) return; + +#if ARCH_AARCH64 || BITDEPTH == 8 assign_itx17_fn( , 4, 4, neon); assign_itx16_fn(R, 4, 8, neon); assign_itx16_fn(R, 4, 16, neon); diff --git a/ffmpeg/JNI/dav1d/src/arm/mc_init_tmpl.c b/ffmpeg/JNI/dav1d/src/arm/mc_init_tmpl.c index b17b78125..399ad41a4 100644 --- a/ffmpeg/JNI/dav1d/src/arm/mc_init_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/arm/mc_init_tmpl.c @@ -66,6 +66,8 @@ decl_w_mask_fn(BF(dav1d_w_mask_420, neon)); decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon)); decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon)); +decl_emu_edge_fn(BF(dav1d_emu_edge, neon)); + void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) { #define init_mc_fn(type, name, suffix) \ c->mc[type] = BF(dav1d_put_##name, suffix) @@ -109,5 +111,6 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) { c->w_mask[2] = BF(dav1d_w_mask_420, neon); c->warp8x8 = BF(dav1d_warp_affine_8x8, neon); c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); + c->emu_edge = BF(dav1d_emu_edge, neon); #endif } diff --git 
a/ffmpeg/JNI/dav1d/src/arm/msac.h b/ffmpeg/JNI/dav1d/src/arm/msac.h index a243a0629..9db0bf86a 100644 --- a/ffmpeg/JNI/dav1d/src/arm/msac.h +++ b/ffmpeg/JNI/dav1d/src/arm/msac.h @@ -34,14 +34,16 @@ unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf, size_t n_symbols); unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf, size_t n_symbols); +unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf); unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf); unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s); unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f); -#if ARCH_AARCH64 +#if ARCH_AARCH64 || defined(__ARM_NEON) #define dav1d_msac_decode_symbol_adapt4 dav1d_msac_decode_symbol_adapt4_neon #define dav1d_msac_decode_symbol_adapt8 dav1d_msac_decode_symbol_adapt8_neon #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon +#define dav1d_msac_decode_hi_tok dav1d_msac_decode_hi_tok_neon #define dav1d_msac_decode_bool_adapt dav1d_msac_decode_bool_adapt_neon #define dav1d_msac_decode_bool_equi dav1d_msac_decode_bool_equi_neon #define dav1d_msac_decode_bool dav1d_msac_decode_bool_neon diff --git a/ffmpeg/JNI/dav1d/src/cdef_apply_tmpl.c b/ffmpeg/JNI/dav1d/src/cdef_apply_tmpl.c index 8ab9738ec..c45c7109d 100644 --- a/ffmpeg/JNI/dav1d/src/cdef_apply_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/cdef_apply_tmpl.c @@ -111,6 +111,9 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout; const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; + static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 }, + { 7, 0, 2, 4, 5, 6, 6, 6 } }; + const uint8_t *uv_dir = uv_dirs[layout == DAV1D_PIXEL_LAYOUT_I422]; for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) { const int tf = f->lf.top_pre_cdef_toggle; @@ -199,8 +202,7 @@ void 
bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f, damping, edges HIGHBD_CALL_SUFFIX); if (uv_lvl) { assert(layout != DAV1D_PIXEL_LAYOUT_I400); - const int uvdir = uv_pri_lvl ? layout == DAV1D_PIXEL_LAYOUT_I422 ? - ((const uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir] : dir : 0; + const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0; for (int pl = 1; pl <= 2; pl++) { dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl], &f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor], diff --git a/ffmpeg/JNI/dav1d/src/cpu.c b/ffmpeg/JNI/dav1d/src/cpu.c index 35816822c..f8a909f28 100644 --- a/ffmpeg/JNI/dav1d/src/cpu.c +++ b/ffmpeg/JNI/dav1d/src/cpu.c @@ -31,7 +31,11 @@ #include "src/cpu.h" static unsigned flags = 0; -#if ARCH_X86 + +#if __has_feature(memory_sanitizer) +// memory sanitizer is inherently incompatible with asm +static unsigned flags_mask = 0; +#elif ARCH_X86 /* Disable AVX-512 by default for the time being */ static unsigned flags_mask = ~DAV1D_X86_CPU_FLAG_AVX512ICL; #else diff --git a/ffmpeg/JNI/dav1d/src/decode.c b/ffmpeg/JNI/dav1d/src/decode.c index 9fb157166..f6782153c 100644 --- a/ffmpeg/JNI/dav1d/src/decode.c +++ b/ffmpeg/JNI/dav1d/src/decode.c @@ -1998,7 +1998,6 @@ static int decode_b(Dav1dTileContext *const t, return 0; } -#if defined(__has_feature) #if __has_feature(memory_sanitizer) #include @@ -2051,7 +2050,6 @@ static int checked_decode_b(Dav1dTileContext *const t, #define decode_b checked_decode_b #endif /* defined(__has_feature) */ -#endif /* __has_feature(memory_sanitizer) */ static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl, const EdgeNode *const node) @@ -3304,7 +3302,7 @@ int dav1d_submit_frame(Dav1dContext *const c) { #define assign_bitdepth_case(bd) \ dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \ dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \ - dav1d_itx_dsp_init_##bd##bpc(&dsp->itx); \ + dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \ dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \ 
dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \ dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \ diff --git a/ffmpeg/JNI/dav1d/src/ext/x86/x86inc.asm b/ffmpeg/JNI/dav1d/src/ext/x86/x86inc.asm index a6a8fb7c6..c252e5451 100644 --- a/ffmpeg/JNI/dav1d/src/ext/x86/x86inc.asm +++ b/ffmpeg/JNI/dav1d/src/ext/x86/x86inc.asm @@ -358,7 +358,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) %define high_mm_regs (16*cpuflag(avx512)) -%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) +%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 %if %1 != 0 %assign %%pad 0 @@ -403,7 +403,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endif %endmacro -%macro SETUP_STACK_POINTER 1 +%macro SETUP_STACK_POINTER 0-1 0 %ifnum %1 %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 diff --git a/ffmpeg/JNI/dav1d/src/ipred_tmpl.c b/ffmpeg/JNI/dav1d/src/ipred_tmpl.c index ef076f657..50c7a3c7b 100644 --- a/ffmpeg/JNI/dav1d/src/ipred_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/ipred_tmpl.c @@ -133,7 +133,7 @@ static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride, const int16_t *ac, const int alpha HIGHBD_DECL_SUFFIX) { - unsigned dc = dc_gen_left(topleft, height); + const unsigned dc = dc_gen_left(topleft, height); cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX); } @@ -625,16 +625,12 @@ static void ipred_filter_c(pixel *dst, const ptrdiff_t stride, assert(filt_idx < 5); const int8_t *const filter = dav1d_filter_intra_taps[filt_idx]; - int x, y; - ptrdiff_t left_stride; - const pixel *left, *topleft, *top; - - top = &topleft_in[1]; - for (y = 0; y < height; y += 2) { - topleft = &topleft_in[-y]; - left = &topleft[-1]; - left_stride = -1; - for (x = 0; x < width; x += 4) { + const pixel *top = &topleft_in[1]; + for (int y = 0; y < height; y += 2) { + const pixel *topleft = 
&topleft_in[-y]; + const pixel *left = &topleft[-1]; + ptrdiff_t left_stride = -1; + for (int x = 0; x < width; x += 4) { const int p0 = *topleft; const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3]; const int p5 = left[0 * left_stride], p6 = left[1 * left_stride]; @@ -643,7 +639,7 @@ static void ipred_filter_c(pixel *dst, const ptrdiff_t stride, for (int yy = 0; yy < 2; yy++) { for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) { - int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6); + const int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6); ptr[xx] = iclip_pixel((acc + 8) >> 4); } ptr += PXSTRIDE(stride); diff --git a/ffmpeg/JNI/dav1d/src/itx.h b/ffmpeg/JNI/dav1d/src/itx.h index 3befc4209..a299629c5 100644 --- a/ffmpeg/JNI/dav1d/src/itx.h +++ b/ffmpeg/JNI/dav1d/src/itx.h @@ -43,8 +43,8 @@ typedef struct Dav1dInvTxfmDSPContext { itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL]; } Dav1dInvTxfmDSPContext; -bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c); -bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c); +bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc); +bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc); bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c); #endif /* DAV1D_SRC_ITX_H */ diff --git a/ffmpeg/JNI/dav1d/src/itx_1d.c b/ffmpeg/JNI/dav1d/src/itx_1d.c index 87687007d..ca14fc8c4 100644 --- a/ffmpeg/JNI/dav1d/src/itx_1d.c +++ b/ffmpeg/JNI/dav1d/src/itx_1d.c @@ -119,13 +119,13 @@ inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride, t7a = ((in1 * (4017 - 4096) + in7 * 799 + 2048) >> 12) + in1; } - int t4 = CLIP(t4a + t5a); - t5a = CLIP(t4a - t5a); - int t7 = CLIP(t7a + t6a); - t6a = CLIP(t7a - t6a); + const int t4 = CLIP(t4a + t5a); + t5a = CLIP(t4a - t5a); + const int t7 = CLIP(t7a + t6a); + t6a = CLIP(t7a - t6a); - int t5 = ((t6a - t5a) * 181 + 128) >> 8; - int t6 = ((t6a + t5a) * 181 + 128) >> 8; + const int t5 = ((t6a - 
t5a) * 181 + 128) >> 8; + const int t6 = ((t6a + t5a) * 181 + 128) >> 8; const int t0 = c[0 * stride]; const int t1 = c[2 * stride]; @@ -812,23 +812,23 @@ inv_adst8_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s, const int in4 = in[4 * in_s], in5 = in[5 * in_s]; const int in6 = in[6 * in_s], in7 = in[7 * in_s]; - int t0a = (((4076 - 4096) * in7 + 401 * in0 + 2048) >> 12) + in7; - int t1a = (( 401 * in7 - (4076 - 4096) * in0 + 2048) >> 12) - in0; - int t2a = (((3612 - 4096) * in5 + 1931 * in2 + 2048) >> 12) + in5; - int t3a = (( 1931 * in5 - (3612 - 4096) * in2 + 2048) >> 12) - in2; - int t4a = ( 1299 * in3 + 1583 * in4 + 1024) >> 11; - int t5a = ( 1583 * in3 - 1299 * in4 + 1024) >> 11; - int t6a = (( 1189 * in1 + (3920 - 4096) * in6 + 2048) >> 12) + in6; - int t7a = (((3920 - 4096) * in1 - 1189 * in6 + 2048) >> 12) + in1; - - int t0 = CLIP(t0a + t4a); - int t1 = CLIP(t1a + t5a); - int t2 = CLIP(t2a + t6a); - int t3 = CLIP(t3a + t7a); - int t4 = CLIP(t0a - t4a); - int t5 = CLIP(t1a - t5a); - int t6 = CLIP(t2a - t6a); - int t7 = CLIP(t3a - t7a); + const int t0a = (((4076 - 4096) * in7 + 401 * in0 + 2048) >> 12) + in7; + const int t1a = (( 401 * in7 - (4076 - 4096) * in0 + 2048) >> 12) - in0; + const int t2a = (((3612 - 4096) * in5 + 1931 * in2 + 2048) >> 12) + in5; + const int t3a = (( 1931 * in5 - (3612 - 4096) * in2 + 2048) >> 12) - in2; + int t4a = ( 1299 * in3 + 1583 * in4 + 1024) >> 11; + int t5a = ( 1583 * in3 - 1299 * in4 + 1024) >> 11; + int t6a = (( 1189 * in1 + (3920 - 4096) * in6 + 2048) >> 12) + in6; + int t7a = (((3920 - 4096) * in1 - 1189 * in6 + 2048) >> 12) + in1; + + const int t0 = CLIP(t0a + t4a); + const int t1 = CLIP(t1a + t5a); + int t2 = CLIP(t2a + t6a); + int t3 = CLIP(t3a + t7a); + const int t4 = CLIP(t0a - t4a); + const int t5 = CLIP(t1a - t5a); + int t6 = CLIP(t2a - t6a); + int t7 = CLIP(t3a - t7a); t4a = (((3784 - 4096) * t4 + 1567 * t5 + 2048) >> 12) + t4; t5a = (( 1567 * t4 - (3784 - 4096) * t5 + 2048) >> 12) - t5; diff --git 
a/ffmpeg/JNI/dav1d/src/itx_tmpl.c b/ffmpeg/JNI/dav1d/src/itx_tmpl.c index 02f34e85c..a0e807f95 100644 --- a/ffmpeg/JNI/dav1d/src/itx_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/itx_tmpl.c @@ -180,7 +180,7 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride, dst[x] = iclip_pixel(dst[x] + *c++); } -COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) { +COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) { #define assign_itx_all_fn64(w, h, pfx) \ c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT ] = \ inv_txfm_add_dct_dct_##w##x##h##_c @@ -224,8 +224,6 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) { c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \ inv_txfm_add_identity_adst_##w##x##h##_c; \ - memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */ - c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c; assign_itx_all_fn84( 4, 4, ); assign_itx_all_fn84( 4, 8, R); @@ -249,7 +247,7 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) { #if HAVE_ASM #if ARCH_AARCH64 || ARCH_ARM - bitfn(dav1d_itx_dsp_init_arm)(c); + bitfn(dav1d_itx_dsp_init_arm)(c, bpc); #endif #if ARCH_X86 bitfn(dav1d_itx_dsp_init_x86)(c); diff --git a/ffmpeg/JNI/dav1d/src/lib.c b/ffmpeg/JNI/dav1d/src/lib.c index cda2e9df0..82af64a53 100644 --- a/ffmpeg/JNI/dav1d/src/lib.c +++ b/ffmpeg/JNI/dav1d/src/lib.c @@ -31,7 +31,7 @@ #include #include -#ifdef __linux__ +#if defined(__linux__) && defined(HAVE_DLSYM) #include #endif @@ -81,7 +81,7 @@ static void close_internal(Dav1dContext **const c_out, int flush); NO_SANITIZE("cfi-icall") // CFI is broken with dlsym() static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) { -#if defined(__linux__) && defined(HAVE_DLSYM) +#if defined(__linux__) && defined(HAVE_DLSYM) && defined(__GLIBC__) /* glibc has an issue where the size of the TLS is subtracted from the stack * size instead of allocated separately. 
As a result the specified stack * size may be insufficient when used in an application with large amounts diff --git a/ffmpeg/JNI/dav1d/src/log.c b/ffmpeg/JNI/dav1d/src/log.c index 999e3a2e8..de6776a61 100644 --- a/ffmpeg/JNI/dav1d/src/log.c +++ b/ffmpeg/JNI/dav1d/src/log.c @@ -36,13 +36,13 @@ #include "src/internal.h" #include "src/log.h" +#if CONFIG_LOG COLD void dav1d_log_default_callback(void *const cookie, const char *const format, va_list ap) { vfprintf(stderr, format, ap); } -#if CONFIG_LOG COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) { validate_input(c != NULL); diff --git a/ffmpeg/JNI/dav1d/src/log.h b/ffmpeg/JNI/dav1d/src/log.h index 8f6357cb6..df32de7f2 100644 --- a/ffmpeg/JNI/dav1d/src/log.h +++ b/ffmpeg/JNI/dav1d/src/log.h @@ -35,12 +35,12 @@ #include "common/attributes.h" -void dav1d_log_default_callback(void *cookie, const char *format, va_list ap); - #if CONFIG_LOG #define dav1d_log dav1d_log +void dav1d_log_default_callback(void *cookie, const char *format, va_list ap); void dav1d_log(Dav1dContext *c, const char *format, ...) ATTR_FORMAT_PRINTF(2, 3); #else +#define dav1d_log_default_callback NULL #define dav1d_log(...) 
do { } while(0) #endif diff --git a/ffmpeg/JNI/dav1d/src/lr_apply_tmpl.c b/ffmpeg/JNI/dav1d/src/lr_apply_tmpl.c index 62eee81ed..02413b913 100644 --- a/ffmpeg/JNI/dav1d/src/lr_apply_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/lr_apply_tmpl.c @@ -73,11 +73,11 @@ static void backup_lpf(const Dav1dFrameContext *const f, dst += 4 * PXSTRIDE(dst_stride); src += (stripe_h - 2) * PXSTRIDE(src_stride); - if (f->frame_hdr->super_res.enabled) { + if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) { while (row + stripe_h <= row_h) { const int n_lines = 4 - (row + stripe_h + 1 == h); f->dsp->mc.resize(dst, dst_stride, src, src_stride, - dst_w, src_w, n_lines, f->resize_step[ss_hor], + dst_w, n_lines, src_w, f->resize_step[ss_hor], f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX); row += stripe_h; // unmodified stripe_h for the 1st stripe stripe_h = 64 >> ss_ver; diff --git a/ffmpeg/JNI/dav1d/src/mc.h b/ffmpeg/JNI/dav1d/src/mc.h index 33baea6b2..784b58d22 100644 --- a/ffmpeg/JNI/dav1d/src/mc.h +++ b/ffmpeg/JNI/dav1d/src/mc.h @@ -110,7 +110,7 @@ typedef decl_emu_edge_fn(*emu_edge_fn); #define decl_resize_fn(name) \ void (name)(pixel *dst, ptrdiff_t dst_stride, \ const pixel *src, ptrdiff_t src_stride, \ - int dst_w, int src_w, int h, int dx, int mx HIGHBD_DECL_SUFFIX) + int dst_w, int h, int src_w, int dx, int mx HIGHBD_DECL_SUFFIX) typedef decl_resize_fn(*resize_fn); typedef struct Dav1dMCDSPContext { diff --git a/ffmpeg/JNI/dav1d/src/mc_tmpl.c b/ffmpeg/JNI/dav1d/src/mc_tmpl.c index 20bef0d7f..c4d9e14eb 100644 --- a/ffmpeg/JNI/dav1d/src/mc_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/mc_tmpl.c @@ -885,21 +885,21 @@ static void emu_edge_c(const intptr_t bw, const intptr_t bh, static void resize_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *src, const ptrdiff_t src_stride, - const int dst_w, const int src_w, int h, + const int dst_w, int h, const int src_w, const int dx, const int mx0 HIGHBD_DECL_SUFFIX) { do { int mx = mx0, src_x = -1; for (int x = 0; x < dst_w; x++) { - const int16_t *const 
F = dav1d_resize_filter[mx >> 8]; - dst[x] = iclip_pixel((F[0] * src[iclip(src_x - 3, 0, src_w - 1)] + - F[1] * src[iclip(src_x - 2, 0, src_w - 1)] + - F[2] * src[iclip(src_x - 1, 0, src_w - 1)] + - F[3] * src[iclip(src_x + 0, 0, src_w - 1)] + - F[4] * src[iclip(src_x + 1, 0, src_w - 1)] + - F[5] * src[iclip(src_x + 2, 0, src_w - 1)] + - F[6] * src[iclip(src_x + 3, 0, src_w - 1)] + - F[7] * src[iclip(src_x + 4, 0, src_w - 1)] + + const int8_t *const F = dav1d_resize_filter[mx >> 8]; + dst[x] = iclip_pixel((-(F[0] * src[iclip(src_x - 3, 0, src_w - 1)] + + F[1] * src[iclip(src_x - 2, 0, src_w - 1)] + + F[2] * src[iclip(src_x - 1, 0, src_w - 1)] + + F[3] * src[iclip(src_x + 0, 0, src_w - 1)] + + F[4] * src[iclip(src_x + 1, 0, src_w - 1)] + + F[5] * src[iclip(src_x + 2, 0, src_w - 1)] + + F[6] * src[iclip(src_x + 3, 0, src_w - 1)] + + F[7] * src[iclip(src_x + 4, 0, src_w - 1)]) + 64) >> 7); mx += dx; src_x += mx >> 14; diff --git a/ffmpeg/JNI/dav1d/src/meson.build b/ffmpeg/JNI/dav1d/src/meson.build index d4df49308..fd8ad0269 100644 --- a/ffmpeg/JNI/dav1d/src/meson.build +++ b/ffmpeg/JNI/dav1d/src/meson.build @@ -102,6 +102,8 @@ if is_asm_enabled ) if host_machine.cpu_family() == 'aarch64' libdav1d_sources += files( + # itx.S is used for both 8 and 16 bpc. 
+ 'arm/64/itx.S', 'arm/64/looprestoration_common.S', 'arm/64/msac.S', ) @@ -110,7 +112,6 @@ if is_asm_enabled libdav1d_sources += files( 'arm/64/cdef.S', 'arm/64/ipred.S', - 'arm/64/itx.S', 'arm/64/loopfilter.S', 'arm/64/looprestoration.S', 'arm/64/mc.S', @@ -120,6 +121,8 @@ if is_asm_enabled if dav1d_bitdepths.contains('16') libdav1d_sources += files( 'arm/64/cdef16.S', + 'arm/64/ipred16.S', + 'arm/64/itx16.S', 'arm/64/loopfilter16.S', 'arm/64/looprestoration16.S', 'arm/64/mc16.S', @@ -127,12 +130,14 @@ if is_asm_enabled endif elif host_machine.cpu_family().startswith('arm') libdav1d_sources += files( + 'arm/32/msac.S', ) if dav1d_bitdepths.contains('8') libdav1d_sources += files( 'arm/32/cdef.S', 'arm/32/ipred.S', + 'arm/32/itx.S', 'arm/32/loopfilter.S', 'arm/32/looprestoration.S', 'arm/32/mc.S', @@ -148,14 +153,9 @@ if is_asm_enabled libdav1d_sources += files( 'x86/cpu.c', + 'x86/msac_init.c', ) - if host_machine.cpu_family() == 'x86_64' - libdav1d_sources += files( - 'x86/msac_init.c', - ) - endif - libdav1d_tmpl_sources += files( 'x86/cdef_init_tmpl.c', 'x86/film_grain_init_tmpl.c', @@ -174,7 +174,8 @@ if is_asm_enabled if dav1d_bitdepths.contains('8') libdav1d_sources_asm += files( - 'x86/cdef.asm', + 'x86/cdef_avx512.asm', + 'x86/cdef_avx2.asm', 'x86/film_grain.asm', 'x86/ipred.asm', 'x86/itx.asm', @@ -187,7 +188,7 @@ if is_asm_enabled 'x86/itx_ssse3.asm', 'x86/loopfilter_ssse3.asm', 'x86/looprestoration_ssse3.asm', - 'x86/mc_ssse3.asm', + 'x86/mc_sse.asm', ) endif diff --git a/ffmpeg/JNI/dav1d/src/msac.c b/ffmpeg/JNI/dav1d/src/msac.c index 0a0ef04a1..8195977d5 100644 --- a/ffmpeg/JNI/dav1d/src/msac.c +++ b/ffmpeg/JNI/dav1d/src/msac.c @@ -38,7 +38,7 @@ #define EC_WIN_SIZE (sizeof(ec_win) << 3) -static inline void ctx_refill(MsacContext *s) { +static inline void ctx_refill(MsacContext *const s) { const uint8_t *buf_pos = s->buf_pos; const uint8_t *buf_end = s->buf_end; int c = EC_WIN_SIZE - s->cnt - 24; @@ -57,7 +57,9 @@ static inline void 
ctx_refill(MsacContext *s) { * necessary), and stores them back in the decoder context. * dif: The new value of dif. * rng: The new value of the range. */ -static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) { +static inline void ctx_norm(MsacContext *const s, const ec_win dif, + const unsigned rng) +{ const int d = 15 ^ (31 ^ clz(rng)); assert(rng <= 65535U); s->cnt -= d; @@ -68,16 +70,16 @@ static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) { } unsigned dav1d_msac_decode_bool_equi_c(MsacContext *const s) { - ec_win vw, dif = s->dif; - unsigned ret, v, r = s->rng; + const unsigned r = s->rng; + ec_win dif = s->dif; assert((dif >> (EC_WIN_SIZE - 16)) < r); // When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can // replace the multiply with a simple shift. - v = ((r >> 8) << 7) + EC_MIN_PROB; - vw = (ec_win)v << (EC_WIN_SIZE - 16); - ret = dif >= vw; - dif -= ret*vw; - v += ret*(r - 2*v); + unsigned v = ((r >> 8) << 7) + EC_MIN_PROB; + const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16); + const unsigned ret = dif >= vw; + dif -= ret * vw; + v += ret * (r - 2 * v); ctx_norm(s, dif, v); return !ret; } @@ -86,14 +88,14 @@ unsigned dav1d_msac_decode_bool_equi_c(MsacContext *const s) { * f: The probability that the bit is one * Return: The value decoded (0 or 1). 
*/ unsigned dav1d_msac_decode_bool_c(MsacContext *const s, const unsigned f) { - ec_win vw, dif = s->dif; - unsigned ret, v, r = s->rng; + const unsigned r = s->rng; + ec_win dif = s->dif; assert((dif >> (EC_WIN_SIZE - 16)) < r); - v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB; - vw = (ec_win)v << (EC_WIN_SIZE - 16); - ret = dif >= vw; - dif -= ret*vw; - v += ret*(r - 2*v); + unsigned v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB; + const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16); + const unsigned ret = dif >= vw; + dif -= ret * vw; + v += ret * (r - 2 * v); ctx_norm(s, dif, v); return !ret; } @@ -196,12 +198,11 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data, s->rng = 0x8000; s->cnt = -15; s->allow_update_cdf = !disable_cdf_update_flag; + ctx_refill(s); #if ARCH_X86_64 && HAVE_ASM s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c; dav1d_msac_init_x86(s); #endif - - ctx_refill(s); } diff --git a/ffmpeg/JNI/dav1d/src/obu.c b/ffmpeg/JNI/dav1d/src/obu.c index 4406f4bc2..ab9688c25 100644 --- a/ffmpeg/JNI/dav1d/src/obu.c +++ b/ffmpeg/JNI/dav1d/src/obu.c @@ -85,7 +85,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, hdr->time_scale = dav1d_get_bits(gb, 32); hdr->equal_picture_interval = dav1d_get_bits(gb, 1); if (hdr->equal_picture_interval) { - unsigned num_ticks_per_picture = dav1d_get_vlc(gb); + const unsigned num_ticks_per_picture = dav1d_get_vlc(gb); if (num_ticks_per_picture == 0xFFFFFFFFU) goto error; hdr->num_ticks_per_picture = num_ticks_per_picture + 1; @@ -111,8 +111,6 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, for (int i = 0; i < hdr->num_operating_points; i++) { struct Dav1dSequenceHeaderOperatingPoint *const op = &hdr->operating_points[i]; - struct Dav1dSequenceHeaderOperatingParameterInfo *const opi = - &hdr->operating_parameter_info[i]; op->idc = dav1d_get_bits(gb, 12); op->major_level = 2 + dav1d_get_bits(gb, 3); 
op->minor_level = dav1d_get_bits(gb, 2); @@ -120,6 +118,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, op->decoder_model_param_present = hdr->decoder_model_info_present && dav1d_get_bits(gb, 1); if (op->decoder_model_param_present) { + struct Dav1dSequenceHeaderOperatingParameterInfo *const opi = + &hdr->operating_parameter_info[i]; opi->decoder_buffer_delay = dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length); opi->encoder_buffer_delay = @@ -132,10 +132,9 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb, op->initial_display_delay = dav1d_get_bits(gb, 4) + 1; } } - if (c->operating_point < hdr->num_operating_points) - c->operating_point_idc = hdr->operating_points[c->operating_point].idc; - else - c->operating_point_idc = hdr->operating_points[0].idc; + const int op_idx = + c->operating_point < hdr->num_operating_points ? c->operating_point : 0; + c->operating_point_idc = hdr->operating_points[op_idx].idc; #if DEBUG_SEQ_HDR printf("SEQHDR: post-operating-points: off=%ld\n", dav1d_get_bits_pos(gb) - init_bit_pos); @@ -295,7 +294,7 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb, if (use_ref) { for (int i = 0; i < 7; i++) { if (dav1d_get_bits(gb, 1)) { - Dav1dThreadPicture *const ref = + const Dav1dThreadPicture *const ref = &c->refs[c->frame_hdr->refidx[i]].p; if (!ref->p.data[0]) return -1; hdr->width[1] = ref->p.p.w; @@ -343,7 +342,7 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb, return 0; } -static inline int tile_log2(int sz, int tgt) { +static inline int tile_log2(const int sz, const int tgt) { int k; for (k = 0; (sz << k) < tgt; k++) ; return k; @@ -362,7 +361,6 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { #endif const Dav1dSequenceHeader *const seqhdr = c->seq_hdr; Dav1dFrameHeader *const hdr = c->frame_hdr; - int res; hdr->show_existing_frame = !seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1); @@ -444,7 +442,7 @@ 
static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint) for (int i = 0; i < 8; i++) dav1d_get_bits(gb, seqhdr->order_hint_n_bits); - if ((res = read_frame_size(c, gb, 0)) < 0) goto error; + if (read_frame_size(c, gb, 0) < 0) goto error; hdr->allow_intrabc = hdr->allow_screen_content_tools && !hdr->super_res.enabled && dav1d_get_bits(gb, 1); hdr->use_ref_frame_mvs = 0; @@ -479,7 +477,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { int latest_frame_offset = -1; for (int i = 0; i < 8; i++) { - int hint = shifted_frame_offset[i]; + const int hint = shifted_frame_offset[i]; if (!used_frame[i] && hint >= current_frame_offset && hint >= latest_frame_offset) { @@ -492,7 +490,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { int earliest_frame_offset = INT_MAX; for (int i = 0; i < 8; i++) { - int hint = shifted_frame_offset[i]; + const int hint = shifted_frame_offset[i]; if (!used_frame[i] && hint >= current_frame_offset && hint < earliest_frame_offset) { @@ -505,7 +503,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { earliest_frame_offset = INT_MAX; for (int i = 0; i < 8; i++) { - int hint = shifted_frame_offset[i]; + const int hint = shifted_frame_offset[i]; if (!used_frame[i] && hint >= current_frame_offset && (hint < earliest_frame_offset)) { @@ -520,7 +518,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { if (hdr->refidx[i] < 0) { latest_frame_offset = -1; for (int j = 0; j < 8; j++) { - int hint = shifted_frame_offset[j]; + const int hint = shifted_frame_offset[j]; if (!used_frame[j] && hint < current_frame_offset && hint >= latest_frame_offset) { @@ -536,7 +534,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { earliest_frame_offset = INT_MAX; int ref = -1; for (int i = 0; i < 8; i++) { - int hint = shifted_frame_offset[i]; + const int 
hint = shifted_frame_offset[i]; if (hint < earliest_frame_offset) { ref = i; earliest_frame_offset = hint; @@ -555,7 +553,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { } const int use_ref = !hdr->error_resilient_mode && hdr->frame_size_override; - if ((res = read_frame_size(c, gb, use_ref)) < 0) goto error; + if (read_frame_size(c, gb, use_ref) < 0) goto error; hdr->hp = !hdr->force_integer_mv && dav1d_get_bits(gb, 1); hdr->subpel_filter_mode = dav1d_get_bits(gb, 1) ? DAV1D_FILTER_SWITCHABLE : dav1d_get_bits(gb, 2); @@ -579,15 +577,15 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { // tile data hdr->tiling.uniform = dav1d_get_bits(gb, 1); const int sbsz_min1 = (64 << seqhdr->sb128) - 1; - int sbsz_log2 = 6 + seqhdr->sb128; - int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2; - int sbh = (hdr->height + sbsz_min1) >> sbsz_log2; - int max_tile_width_sb = 4096 >> sbsz_log2; - int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2); + const int sbsz_log2 = 6 + seqhdr->sb128; + const int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2; + const int sbh = (hdr->height + sbsz_min1) >> sbsz_log2; + const int max_tile_width_sb = 4096 >> sbsz_log2; + const int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2); hdr->tiling.min_log2_cols = tile_log2(max_tile_width_sb, sbw); hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, DAV1D_MAX_TILE_COLS)); hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, DAV1D_MAX_TILE_ROWS)); - int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh), + const int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh), hdr->tiling.min_log2_cols); if (hdr->tiling.uniform) { for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols; @@ -621,7 +619,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { } hdr->tiling.log2_cols = tile_log2(1, hdr->tiling.cols); if (min_log2_tiles) max_tile_area_sb >>= min_log2_tiles + 1; - int max_tile_height_sb = imax(max_tile_area_sb / 
widest_tile, 1); + const int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1); hdr->tiling.rows = 0; for (int sby = 0; sby < sbh && hdr->tiling.rows < DAV1D_MAX_TILE_ROWS; hdr->tiling.rows++) { @@ -657,7 +655,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { // If the sequence header says that delta_q might be different // for U, V, we must check whether it actually is for this // frame. - int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 1) : 0; + const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 1) : 0; hdr->quant.udc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; hdr->quant.uac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0; if (diff_uv_delta) { @@ -1053,7 +1051,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { for (i = 0; i < 7; i++) if (hdr->refidx[i] == refidx) break; - if (i == 7 || !c->refs[refidx].p.p.frame_hdr) goto error; + if (i == 7 || !c->refs[refidx].p.p.frame_hdr) goto error; hdr->film_grain.data = c->refs[refidx].p.p.frame_hdr->film_grain.data; hdr->film_grain.data.seed = seed; } else { @@ -1133,10 +1131,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) { } static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) { - int have_tile_pos = 0; const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows; - if (n_tiles > 1) - have_tile_pos = dav1d_get_bits(gb, 1); + const int have_tile_pos = n_tiles > 1 ? dav1d_get_bits(gb, 1) : 0; if (have_tile_pos) { const int n_bits = c->frame_hdr->tiling.log2_cols + @@ -1151,9 +1147,9 @@ static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) { // Check that we haven't read more than obu_len bytes from the buffer // since init_bit_pos. 
-static int -check_for_overrun(Dav1dContext *const c, GetBits *const gb, - unsigned init_bit_pos, unsigned obu_len) +static int check_for_overrun(Dav1dContext *const c, GetBits *const gb, + const unsigned init_bit_pos, + const unsigned obu_len) { // Make sure we haven't actually read past the end of the gb buffer if (gb->error) { @@ -1161,7 +1157,7 @@ check_for_overrun(Dav1dContext *const c, GetBits *const gb, return 1; } - unsigned pos = dav1d_get_bits_pos(gb); + const unsigned pos = dav1d_get_bits_pos(gb); // We assume that init_bit_pos was the bit position of the buffer // at some point in the past, so cannot be smaller than pos. @@ -1175,7 +1171,7 @@ check_for_overrun(Dav1dContext *const c, GetBits *const gb, return 0; } -int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { +int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int global) { GetBits gb; int res; @@ -1196,11 +1192,8 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { } // obu length field - unsigned len = 0; - if (has_length_field) - len = dav1d_get_uleb128(&gb); - else - len = (int) in->sz - 1 - has_extension; + const unsigned len = has_length_field ? 
+ dav1d_get_uleb128(&gb) : (unsigned) in->sz - 1 - has_extension; if (gb.error) goto error; const unsigned init_bit_pos = dav1d_get_bits_pos(&gb); @@ -1442,7 +1435,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) { payload_size -= meta_type_len; int country_code_extension_byte = 0; - int country_code = dav1d_get_bits(&gb, 8); + const int country_code = dav1d_get_bits(&gb, 8); payload_size--; if (country_code == 0xFF) { country_code_extension_byte = dav1d_get_bits(&gb, 8); diff --git a/ffmpeg/JNI/dav1d/src/picture.c b/ffmpeg/JNI/dav1d/src/picture.c index 82197c34d..72af92e94 100644 --- a/ffmpeg/JNI/dav1d/src/picture.c +++ b/ffmpeg/JNI/dav1d/src/picture.c @@ -68,7 +68,7 @@ int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) { const size_t y_sz = y_stride * aligned_h; const size_t uv_sz = uv_stride * (aligned_h >> ss_ver); const size_t pic_size = y_sz + 2 * uv_sz + DAV1D_PICTURE_ALIGNMENT; - uint8_t *data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT); + uint8_t *const data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT); if (!data) return DAV1D_ERR(ENOMEM); p->data[0] = data; @@ -104,14 +104,16 @@ static void free_buffer(const uint8_t *const data, void *const user_data) { free(pic_ctx); } -static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p, +static int picture_alloc_with_edges(Dav1dContext *const c, + Dav1dPicture *const p, const int w, const int h, - Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref, - Dav1dFrameHeader *frame_hdr, Dav1dRef *frame_hdr_ref, - Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref, - Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref, - Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref, - const int bpc, const Dav1dDataProps *props, + Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref, + Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref, + Dav1dContentLightLevel *const 
content_light, Dav1dRef *const content_light_ref, + Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref, + Dav1dITUTT35 *const itut_t35, Dav1dRef *const itut_t35_ref, + const int bpc, + const Dav1dDataProps *const props, Dav1dPicAllocator *const p_allocator, const size_t extra, void **const extra_ptr) { @@ -122,9 +124,8 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p assert(bpc > 0 && bpc <= 16); struct pic_ctx_context *pic_ctx = malloc(extra + sizeof(struct pic_ctx_context)); - if (pic_ctx == NULL) { + if (pic_ctx == NULL) return DAV1D_ERR(ENOMEM); - } p->p.w = w; p->p.h = h; @@ -136,7 +137,7 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p p->p.layout = seq_hdr->layout; p->p.bpc = bpc; dav1d_data_props_set_defaults(&p->m); - int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie); + const int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie); if (res < 0) { free(pic_ctx); return res; @@ -250,8 +251,8 @@ void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) { memset(src, 0, sizeof(*src)); } -void dav1d_thread_picture_ref(Dav1dThreadPicture *dst, - const Dav1dThreadPicture *src) +void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst, + const Dav1dThreadPicture *const src) { dav1d_picture_ref(&dst->p, &src->p); dst->t = src->t; diff --git a/ffmpeg/JNI/dav1d/src/recon_tmpl.c b/ffmpeg/JNI/dav1d/src/recon_tmpl.c index 9feda96a6..8e96f8e16 100644 --- a/ffmpeg/JNI/dav1d/src/recon_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/recon_tmpl.c @@ -777,8 +777,8 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t, const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 }; for (int init_y = 0; init_y < h4; init_y += 16) { + const int sub_h4 = imin(h4, 16 + init_y); for (int init_x = 0; init_x < w4; init_x += 16) { - const int sub_h4 = imin(h4, 16 + init_y); const int sub_w4 = imin(w4, init_x + 16); int y_off = 
!!init_y, y, x; for (y = init_y, t->by += init_y; y < sub_h4; @@ -932,8 +932,8 @@ static int mc(Dav1dTileContext *const t, } else { assert(refp != &f->sr_cur); - int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver); - int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor); + const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver); + const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor); #define scale_mv(res, val, scale) do { \ const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \ res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32; \ @@ -1071,15 +1071,15 @@ static int warp_affine(Dav1dTileContext *const t, const int height = (refp->p.p.h + ss_ver) >> ss_ver; for (int y = 0; y < b_dim[1] * v_mul; y += 8) { + const int src_y = t->by * 4 + ((y + 4) << ss_ver); + const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0]; + const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1]; for (int x = 0; x < b_dim[0] * h_mul; x += 8) { // calculate transformation relative to center of 8x8 block in // luma pixel units const int src_x = t->bx * 4 + ((x + 4) << ss_hor); - const int src_y = t->by * 4 + ((y + 4) << ss_ver); - const int64_t mvx = ((int64_t) mat[2] * src_x + - (int64_t) mat[3] * src_y + mat[0]) >> ss_hor; - const int64_t mvy = ((int64_t) mat[4] * src_x + - (int64_t) mat[5] * src_y + mat[1]) >> ss_ver; + const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor; + const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver; const int dx = (int) (mvx >> 16) - 4; const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 - @@ -1147,6 +1147,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10; for (int init_y = 0; init_y < h4; init_y += 16) { + const int sub_h4 = imin(h4, 16 + init_y); + const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver); for (int init_x = 0; init_x < w4; init_x += 16) { if (b->pal_sz[0]) { 
pixel *dst = ((pixel *) f->cur.data[0]) + @@ -1177,7 +1179,6 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 : intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM; int y, x; - const int sub_h4 = imin(h4, 16 + init_y); const int sub_w4 = imin(w4, init_x + 16); for (y = init_y, t->by += init_y; y < sub_h4; y += t_dim->h, t->by += t_dim->h) @@ -1345,8 +1346,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred"); } } else if (b->pal_sz[1]) { - ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) + - (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); + const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) + + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1])); const uint16_t (*pal)[8]; const uint8_t *pal_idx; if (f->frame_thread.pass) { @@ -1384,7 +1385,6 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize const int uv_sb_has_bl = init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 
1 : intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1)); - const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver); const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor); for (int pl = 0; pl < 2; pl++) { for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4; @@ -1520,7 +1520,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize } int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs, - const Av1Block *const b) + const Av1Block *const b) { Dav1dTileState *const ts = t->ts; const Dav1dFrameContext *const f = t->f; @@ -2013,9 +2013,10 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) { const int src_w = (4 * f->bw + ss_hor) >> ss_hor; const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver; - f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, src_w, - imin(img_h, h_end) + h_start, f->resize_step[!!pl], - f->resize_start[!!pl] HIGHBD_CALL_SUFFIX); + f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, + imin(img_h, h_end) + h_start, src_w, + f->resize_step[!!pl], f->resize_start[!!pl] + HIGHBD_CALL_SUFFIX); } } if (f->lf.restore_planes) { diff --git a/ffmpeg/JNI/dav1d/src/ref.c b/ffmpeg/JNI/dav1d/src/ref.c index 89b158047..32cc96f08 100644 --- a/ffmpeg/JNI/dav1d/src/ref.c +++ b/ffmpeg/JNI/dav1d/src/ref.c @@ -37,25 +37,21 @@ static void default_free_callback(const uint8_t *const data, void *const user_da } Dav1dRef *dav1d_ref_create(const size_t size) { - Dav1dRef *res; void *data = dav1d_alloc_aligned(size, 32); - if (!data) { - return NULL; - } + if (!data) return NULL; - res = dav1d_ref_wrap(data, default_free_callback, data); - if (!res) { - dav1d_free_aligned(data); - } else { + Dav1dRef *const res = dav1d_ref_wrap(data, default_free_callback, data); + if (res) res->data = data; - } + else + dav1d_free_aligned(data); return res; } Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr, void (*free_callback)(const uint8_t *data, void 
*user_data), - void *user_data) + void *const user_data) { Dav1dRef *res = malloc(sizeof(Dav1dRef)); if (!res) return NULL; diff --git a/ffmpeg/JNI/dav1d/src/refmvs.c b/ffmpeg/JNI/dav1d/src/refmvs.c index 2039bed4f..1e113b4ea 100644 --- a/ffmpeg/JNI/dav1d/src/refmvs.c +++ b/ffmpeg/JNI/dav1d/src/refmvs.c @@ -182,10 +182,13 @@ static inline union mv mv_projection(const union mv mv, const int num, const int }; assert(den > 0 && den < 32); assert(num > -32 && num < 32); - const int dm = div_mult[den]; - const int y = mv.y * num * dm, x = mv.x * num * dm; - return (union mv) { .y = (y + 8192 + (y >> 31)) >> 14, - .x = (x + 8192 + (x >> 31)) >> 14 }; + const int frac = num * div_mult[den]; + const int y = mv.y * frac, x = mv.x * frac; + // Round and clip according to AV1 spec section 7.9.3 + return (union mv) { // 0x3fff == (1 << 14) - 1 + .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff), + .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff) + }; } static void add_temporal_candidate(const refmvs_frame *const rf, diff --git a/ffmpeg/JNI/dav1d/src/tables.c b/ffmpeg/JNI/dav1d/src/tables.c index 629deba8e..30d9fa6ae 100644 --- a/ffmpeg/JNI/dav1d/src/tables.c +++ b/ffmpeg/JNI/dav1d/src/tables.c @@ -442,7 +442,7 @@ const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = { 0 }; -const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = { +const int8_t ALIGN(dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8], 8) = { [DAV1D_FILTER_8TAP_REGULAR] = { { 0, 1, -3, 63, 4, -1, 0, 0 }, { 0, 1, -5, 61, 9, -2, 0, 0 }, @@ -524,6 +524,27 @@ const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = { { 0, 0, 2, 20, 31, 11, 0, 0 }, { 0, 0, 2, 18, 31, 13, 0, 0 }, { 0, 0, 1, 17, 31, 15, 0, 0 } +#if ARCH_X86_64 + /* Bilin scaled being very rarely used, add a new table entry + * and use the put/prep_8tap_scaled code, thus acting as a + * scaled bilinear filter. 
*/ + }, [5] = { + { 0, 0, 0, 60, 4, 0, 0, 0 }, + { 0, 0, 0, 56, 8, 0, 0, 0 }, + { 0, 0, 0, 52, 12, 0, 0, 0 }, + { 0, 0, 0, 48, 16, 0, 0, 0 }, + { 0, 0, 0, 44, 20, 0, 0, 0 }, + { 0, 0, 0, 40, 24, 0, 0, 0 }, + { 0, 0, 0, 36, 28, 0, 0, 0 }, + { 0, 0, 0, 32, 32, 0, 0, 0 }, + { 0, 0, 0, 28, 36, 0, 0, 0 }, + { 0, 0, 0, 24, 40, 0, 0, 0 }, + { 0, 0, 0, 20, 44, 0, 0, 0 }, + { 0, 0, 0, 16, 48, 0, 0, 0 }, + { 0, 0, 0, 12, 52, 0, 0, 0 }, + { 0, 0, 0, 8, 56, 0, 0, 0 }, + { 0, 0, 0, 4, 60, 0, 0, 0 } +#endif } }; @@ -636,39 +657,39 @@ const int8_t ALIGN(dav1d_mc_warp_filter[193][8], 8) = { W( 0, 0, 0, 0, 2, 127, - 1, 0 ), }; -const int16_t dav1d_resize_filter[64][8] = { - { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 }, - { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 }, - { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 }, - { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 }, - { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 }, - { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 }, - { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 }, - { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 }, - { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 }, - { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 }, - { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 }, - { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 }, - { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 }, - { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 }, - { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 }, - { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 }, - { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 }, - { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 }, - { -1, 6, -19, 69, 
88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 }, - { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 }, - { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 }, - { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 }, - { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 }, - { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 }, - { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 }, - { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 }, - { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 }, - { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 }, - { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 }, - { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 }, - { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 }, - { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 }, +const int8_t ALIGN(dav1d_resize_filter[64][8], 8) = { + { 0, 0, 0, -128, 0, 0, 0, 0 }, { 0, 0, 1, -128, -2, 1, 0, 0 }, + { 0, -1, 3, -127, -4, 2, -1, 0 }, { 0, -1, 4, -127, -6, 3, -1, 0 }, + { 0, -2, 6, -126, -8, 3, -1, 0 }, { 0, -2, 7, -125, -11, 4, -1, 0 }, + { 1, -2, 8, -125, -13, 5, -2, 0 }, { 1, -3, 9, -124, -15, 6, -2, 0 }, + { 1, -3, 10, -123, -18, 6, -2, 1 }, { 1, -3, 11, -122, -20, 7, -3, 1 }, + { 1, -4, 12, -121, -22, 8, -3, 1 }, { 1, -4, 13, -120, -25, 9, -3, 1 }, + { 1, -4, 14, -118, -28, 9, -3, 1 }, { 1, -4, 15, -117, -30, 10, -4, 1 }, + { 1, -5, 16, -116, -32, 11, -4, 1 }, { 1, -5, 16, -114, -35, 12, -4, 1 }, + { 1, -5, 17, -112, -38, 12, -4, 1 }, { 1, -5, 18, -111, -40, 13, -5, 1 }, + { 1, -5, 18, -109, -43, 14, -5, 1 }, { 1, -6, 19, -107, -45, 14, -5, 1 }, + { 1, -6, 19, -105, -48, 15, -5, 1 }, { 1, -6, 19, -103, -51, 16, -5, 1 }, + { 1, -6, 20, -101, -53, 16, -6, 1 }, { 1, -6, 20, -99, -56, 17, -6, 1 }, + { 1, -6, 20, -97, -58, 17, -6, 1 }, { 1, -6, 20, -95, -61, 
18, -6, 1 }, + { 2, -7, 20, -93, -64, 18, -6, 2 }, { 2, -7, 20, -91, -66, 19, -6, 1 }, + { 2, -7, 20, -88, -69, 19, -6, 1 }, { 2, -7, 20, -86, -71, 19, -6, 1 }, + { 2, -7, 20, -84, -74, 20, -7, 2 }, { 2, -7, 20, -81, -76, 20, -7, 1 }, + { 2, -7, 20, -79, -79, 20, -7, 2 }, { 1, -7, 20, -76, -81, 20, -7, 2 }, + { 2, -7, 20, -74, -84, 20, -7, 2 }, { 1, -6, 19, -71, -86, 20, -7, 2 }, + { 1, -6, 19, -69, -88, 20, -7, 2 }, { 1, -6, 19, -66, -91, 20, -7, 2 }, + { 2, -6, 18, -64, -93, 20, -7, 2 }, { 1, -6, 18, -61, -95, 20, -6, 1 }, + { 1, -6, 17, -58, -97, 20, -6, 1 }, { 1, -6, 17, -56, -99, 20, -6, 1 }, + { 1, -6, 16, -53, -101, 20, -6, 1 }, { 1, -5, 16, -51, -103, 19, -6, 1 }, + { 1, -5, 15, -48, -105, 19, -6, 1 }, { 1, -5, 14, -45, -107, 19, -6, 1 }, + { 1, -5, 14, -43, -109, 18, -5, 1 }, { 1, -5, 13, -40, -111, 18, -5, 1 }, + { 1, -4, 12, -38, -112, 17, -5, 1 }, { 1, -4, 12, -35, -114, 16, -5, 1 }, + { 1, -4, 11, -32, -116, 16, -5, 1 }, { 1, -4, 10, -30, -117, 15, -4, 1 }, + { 1, -3, 9, -28, -118, 14, -4, 1 }, { 1, -3, 9, -25, -120, 13, -4, 1 }, + { 1, -3, 8, -22, -121, 12, -4, 1 }, { 1, -3, 7, -20, -122, 11, -3, 1 }, + { 1, -2, 6, -18, -123, 10, -3, 1 }, { 0, -2, 6, -15, -124, 9, -3, 1 }, + { 0, -2, 5, -13, -125, 8, -2, 1 }, { 0, -1, 4, -11, -125, 7, -2, 0 }, + { 0, -1, 3, -8, -126, 6, -2, 0 }, { 0, -1, 3, -6, -127, 4, -1, 0 }, + { 0, -1, 2, -4, -127, 3, -1, 0 }, { 0, 0, 1, -2, -128, 1, 0, 0 }, }; const uint8_t dav1d_sm_weights[128] = { diff --git a/ffmpeg/JNI/dav1d/src/tables.h b/ffmpeg/JNI/dav1d/src/tables.h index 6f8dfd0e1..abcf26592 100644 --- a/ffmpeg/JNI/dav1d/src/tables.h +++ b/ffmpeg/JNI/dav1d/src/tables.h @@ -110,9 +110,9 @@ extern const int8_t dav1d_cdef_directions[12][2]; extern const int16_t dav1d_sgr_params[16][4]; extern const uint8_t dav1d_sgr_x_by_x[256]; -extern const int8_t dav1d_mc_subpel_filters[5][15][8]; +extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8]; extern const int8_t dav1d_mc_warp_filter[193][8]; -extern const int16_t 
dav1d_resize_filter[64][8]; +extern const int8_t dav1d_resize_filter[64][8]; extern const uint8_t dav1d_sm_weights[128]; extern const uint16_t dav1d_dr_intra_derivative[44]; diff --git a/ffmpeg/JNI/dav1d/src/thread_task.c b/ffmpeg/JNI/dav1d/src/thread_task.c index e05a18684..6c1c13907 100644 --- a/ffmpeg/JNI/dav1d/src/thread_task.c +++ b/ffmpeg/JNI/dav1d/src/thread_task.c @@ -42,8 +42,7 @@ void *dav1d_frame_task(void *const data) { if (f->frame_thread.die) break; pthread_mutex_unlock(&f->frame_thread.td.lock); - const int res = dav1d_decode_frame(f); - if (res) + if (dav1d_decode_frame(f)) memset(f->frame_thread.cf, 0, (size_t)f->frame_thread.cf_sz * 128 * 128 / 2); @@ -92,8 +91,8 @@ void *dav1d_tile_task(void *const data) { for (t->by = ts->tiling.row_start; t->by < ts->tiling.row_end; t->by += f->sb_step) { - int error = dav1d_decode_tile_sbrow(t); - int progress = error ? TILE_ERROR : 1 + (t->by >> f->sb_shift); + const int error = dav1d_decode_tile_sbrow(t); + const int progress = error ? TILE_ERROR : 1 + (t->by >> f->sb_shift); // signal progress pthread_mutex_lock(&ts->tile_thread.lock); @@ -128,7 +127,7 @@ void *dav1d_tile_task(void *const data) { // waiting for the post-filter to complete t->ts = ts; t->by = sby << f->sb_shift; - int error = dav1d_decode_tile_sbrow(t); + const int error = dav1d_decode_tile_sbrow(t); progress = error ? 
TILE_ERROR : 1 + sby; // signal progress diff --git a/ffmpeg/JNI/dav1d/src/wedge.c b/ffmpeg/JNI/dav1d/src/wedge.c index 2c292836e..6b14e9a44 100644 --- a/ffmpeg/JNI/dav1d/src/wedge.c +++ b/ffmpeg/JNI/dav1d/src/wedge.c @@ -83,39 +83,39 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = { { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 }, }; -static uint8_t ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 32); -static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 32); -static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 * 8], 32); -static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 32); -static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 32); -static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 * 8], 32); -static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 * 8 * 32], 32); -static uint8_t ALIGN(wedge_masks_444_8x16 [2 * 16 * 8 * 16], 32); -static uint8_t ALIGN(wedge_masks_444_8x8 [2 * 16 * 8 * 8], 32); - -static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 32); -static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 32); -static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 * 8], 32); -static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 * 8 * 32], 32); -static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 * 8 * 16], 32); -static uint8_t ALIGN(wedge_masks_422_8x8 [2 * 16 * 8 * 8], 32); -static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 * 4 * 32], 32); -static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 * 4 * 16], 32); +static uint8_t ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 64); +static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 64); +static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 * 8], 64); +static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 64); +static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 64); +static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 * 8], 64); +static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 * 8 * 32], 64); +static uint8_t 
ALIGN(wedge_masks_444_8x16 [2 * 16 * 8 * 16], 64); +static uint8_t ALIGN(wedge_masks_444_8x8 [2 * 16 * 8 * 8], 64); + +static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 64); +static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 64); +static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 * 8], 64); +static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 * 8 * 32], 64); +static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 * 8 * 16], 64); +static uint8_t ALIGN(wedge_masks_422_8x8 [2 * 16 * 8 * 8], 64); +static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 * 4 * 32], 64); +static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 * 4 * 16], 64); static uint8_t ALIGN(wedge_masks_422_4x8 [2 * 16 * 4 * 8], 32); -static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 32); -static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 * 8], 32); -static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 * 4], 32); -static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 * 8 * 16], 32); -static uint8_t ALIGN(wedge_masks_420_8x8 [2 * 16 * 8 * 8], 32); -static uint8_t ALIGN(wedge_masks_420_8x4 [2 * 16 * 8 * 4], 32); -static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 * 4 * 16], 32); +static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 64); +static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 * 8], 64); +static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 * 4], 64); +static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 * 8 * 16], 64); +static uint8_t ALIGN(wedge_masks_420_8x8 [2 * 16 * 8 * 8], 64); +static uint8_t ALIGN(wedge_masks_420_8x4 [2 * 16 * 8 * 4], 64); +static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 * 4 * 16], 64); static uint8_t ALIGN(wedge_masks_420_4x8 [2 * 16 * 4 * 8], 32); -static uint8_t ALIGN(wedge_masks_420_4x4 [2 * 16 * 4 * 4], 32); +static uint8_t ALIGN(wedge_masks_420_4x4 [2 * 16 * 4 * 4], 16); const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3][2][16]; -static void insert_border(uint8_t *const dst, const uint8_t *src, +static void 
insert_border(uint8_t *const dst, const uint8_t *const src, const int ctr) { if (ctr > 4) memset(dst, 0, ctr - 4); @@ -156,7 +156,8 @@ static void copy2d(uint8_t *dst, const uint8_t *src, } static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma, - const int sign, const int w, const int h, const int ss_ver) + const int sign, const int w, const int h, + const int ss_ver) { for (int y = 0; y < h; y += 1 + ss_ver) { for (int x = 0; x < w; x += 2) { @@ -273,16 +274,16 @@ COLD void dav1d_init_wedge_masks(void) { } #define N_II_PRED_MODES (N_INTER_INTRA_PRED_MODES - 1) -static uint8_t ALIGN(ii_dc_mask[32 * 32], 32); -static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 32); -static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 32); -static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 32); -static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 32); -static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 32); -static uint8_t ALIGN(ii_nondc_mask_8x8 [N_II_PRED_MODES][ 8 * 8], 32); -static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 32); +static uint8_t ALIGN(ii_dc_mask[32 * 32], 64); +static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 64); +static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 64); +static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 64); +static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 64); +static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 64); +static uint8_t ALIGN(ii_nondc_mask_8x8 [N_II_PRED_MODES][ 8 * 8], 64); +static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 64); static uint8_t ALIGN(ii_nondc_mask_4x8 [N_II_PRED_MODES][ 4 * 8], 32); -static uint8_t ALIGN(ii_nondc_mask_4x4 [N_II_PRED_MODES][ 4 * 4], 32); +static uint8_t ALIGN(ii_nondc_mask_4x4 [N_II_PRED_MODES][ 4 * 4], 16); #undef N_II_PRED_MODES #define set1(sz) \ diff --git 
a/ffmpeg/JNI/dav1d/src/x86/cdef.asm b/ffmpeg/JNI/dav1d/src/x86/cdef_avx2.asm similarity index 88% rename from ffmpeg/JNI/dav1d/src/x86/cdef.asm rename to ffmpeg/JNI/dav1d/src/x86/cdef_avx2.asm index cd632b133..643caa0cf 100644 --- a/ffmpeg/JNI/dav1d/src/x86/cdef.asm +++ b/ffmpeg/JNI/dav1d/src/x86/cdef_avx2.asm @@ -27,22 +27,6 @@ %if ARCH_X86_64 -%macro DUP4 1-* - %rep %0 - times 4 db %1 - %rotate 1 - %endrep -%endmacro - -%macro DIRS 16 ; cdef_directions[] - %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1 - ; masking away unused bits allows us to use a single vpaddd {1to16} - ; instruction instead of having to do vpbroadcastd + paddb - db %13 & 0x3f, -%13 & 0x3f - %rotate 1 - %endrep -%endmacro - %macro JMP_TABLE 2-* %xdefine %1_jmptable %%table %xdefine %%base mangle(private_prefix %+ _%1_avx2) @@ -61,30 +45,9 @@ JMP_TABLE cdef_filter_%1, \ d0k0, d0k1, d1k0, d1k1 %endmacro -SECTION_RODATA 64 - -lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 - db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 - db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 - db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 -edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 - dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 - dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 - dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111 - dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001 - dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011 - dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101 - dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111 -px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45 -cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15 -gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 - dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 - dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 - dq 0x2040800000000000, 
0x4080000000000000 ; >> 5, >> 6 -end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 -pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 -sec_tap: db 32, 32, 16, 16 -pd_268435568: dd 268435568 +SECTION_RODATA 32 + +pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00 dd 0x80, 0x00, 0x00 blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 @@ -96,7 +59,6 @@ blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080 dd 0x0000, 0x0000 blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80 blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000 -pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6 div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105 shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 @@ -497,14 +459,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ movifnidn prid, prim sub dampingd, 31 movifnidn secdmpd, secdmpm - or prid, 0 + test prid, prid jz .sec_only movd xm0, prid lzcnt pridmpd, prid add pridmpd, dampingd cmovs pridmpd, zerod mov [rsp+0], pridmpq ; pri_shift - or secdmpd, 0 + test secdmpd, secdmpd jz .pri_only movd xm1, secdmpd lzcnt secdmpd, secdmpd @@ -725,7 +687,7 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ movu xm9, [dstq+strideq*1-1] vinserti128 m5, [dstq+strideq*2-1], 1 vinserti128 m9, [dstq+stride3q -1], 1 - mova m10, [blend_8x8_0+16] + movu m10, [blend_8x8_0+16] punpcklqdq m6, m5, m9 vpblendvb m6, [rsp+gprsize+80+hq*8+64], m10 psrldq m5, 2 @@ -1506,14 +1468,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \ movifnidn prid, prim sub dampingd, 31 movifnidn secdmpd, secdmpm - or prid, 0 + test prid, prid jz .border_sec_only movd xm0, prid lzcnt pridmpd, prid add pridmpd, dampingd cmovs pridmpd, zerod mov [rsp+0], pridmpq ; pri_shift - or secdmpd, 0 + test secdmpd, 
secdmpd jz .border_pri_only movd xm1, secdmpd lzcnt secdmpd, secdmpd @@ -1833,169 +1795,4 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3 movd [varq], xm2 RET -%if WIN64 -DECLARE_REG_TMP 5, 6 -%else -DECLARE_REG_TMP 8, 5 -%endif - -; lut: -; t0 t1 t2 t3 t4 t5 t6 t7 -; T0 T1 T2 T3 T4 T5 T6 T7 -; L0 L1 00 01 02 03 04 05 -; L2 L3 10 11 12 13 14 15 -; L4 L5 20 21 22 23 24 25 -; L6 L7 30 31 32 33 34 35 -; 4e 4f 40 41 42 43 44 45 -; 5e 5f 50 51 52 53 54 55 - -INIT_ZMM avx512icl -cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge -%define base r7-edge_mask - movq xmm0, [dstq+strideq*0] - movhps xmm0, [dstq+strideq*1] - lea r7, [edge_mask] - movq xmm1, [topq+strideq*0-2] - movhps xmm1, [topq+strideq*1-2] - mov r6d, edgem - vinserti32x4 ym0, ymm0, [leftq], 1 - lea r2, [strideq*3] - vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1 - mova m5, [base+lut_perm_4x4] - vinserti32x4 m0, [dstq+r2], 2 - test r6b, 0x08 ; avoid buffer overread - jz .main - lea r3, [dstq+strideq*4-4] - vinserti32x4 m1, [r3+strideq*0], 2 - vinserti32x4 m0, [r3+strideq*1], 3 -.main: - movifnidn prid, prim - mov t0d, dirm - mova m3, [base+px_idx] - mov r3d, dampingm - vpermi2b m5, m0, m1 ; lut - vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) - pxor m7, m7 - lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8 - vpermb m6, m3, m5 ; px - cmp r6d, 0x0f - jne .mask_edges ; mask edges only if required - test prid, prid - jz .sec_only - vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir - vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1 -%macro CDEF_FILTER_4x4_PRI 0 - vpcmpub k1, m6, m1, 6 ; px > pN - psubb m2, m1, m6 - lzcnt r6d, prid - vpsubb m2{k1}, m6, m1 ; abs(diff) - vpbroadcastb m4, prim - and prid, 1 - vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift - movifnidn t1d, secm - vpbroadcastd m10, [base+pri_tap+priq*4] - vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) - psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) - pminub m2, m4 - 
vpdpbusd m0, m2, m10 ; sum -%endmacro - CDEF_FILTER_4x4_PRI - test t1d, t1d ; sec - jz .end_no_clip - call .sec -.end_clip: - pminub m4, m6, m1 - pmaxub m1, m6 - pminub m5, m2, m3 - pmaxub m2, m3 - pminub m4, m5 - pmaxub m2, m1 - psrldq m1, m4, 2 - psrldq m3, m2, 2 - pminub m1, m4 - vpcmpw k1, m0, m7, 1 - vpshldd m6, m0, 8 - pmaxub m2, m3 - pslldq m3, m1, 1 - psubw m7, m0 - paddusw m0, m6 ; clip >0xff - vpsubusw m0{k1}, m6, m7 ; clip <0x00 - pslldq m4, m2, 1 - pminub m1, m3 - pmaxub m2, m4 - pmaxub m0, m1 - pminub m0, m2 - jmp .end -.sec_only: - movifnidn t1d, secm - call .sec -.end_no_clip: - vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4) - paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) -.end: - mova xm1, [base+end_perm] - vpermb m0, m1, m0 ; output in bits 8-15 of each dword - movd [dstq+strideq*0], xm0 - pextrd [dstq+strideq*1], xm0, 1 - pextrd [dstq+strideq*2], xm0, 2 - pextrd [dstq+r2 ], xm0, 3 - RET -.mask_edges_sec_only: - movifnidn t1d, secm - call .mask_edges_sec - jmp .end_no_clip -ALIGN function_align -.mask_edges: - vpbroadcastq m8, [base+edge_mask+r6*8] - test prid, prid - jz .mask_edges_sec_only - vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} - vpshufbitqmb k1, m8, m2 ; index in-range - mova m1, m6 - vpermb m1{k1}, m2, m5 - CDEF_FILTER_4x4_PRI - test t1d, t1d - jz .end_no_clip - call .mask_edges_sec - jmp .end_clip -.mask_edges_sec: - vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} - vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} - vpshufbitqmb k1, m8, m4 - mova m2, m6 - vpermb m2{k1}, m4, m5 - vpshufbitqmb k1, m8, m9 - mova m3, m6 - vpermb m3{k1}, m9, m5 - jmp .sec_main -ALIGN function_align -.sec: - vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 - vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 - vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1 - vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3 -.sec_main: - vpbroadcastd m8, [base+sec_tap] - vpcmpub k1, m6, m2, 6 - psubb m4, m2, m6 - vpbroadcastb m12, t1d - lzcnt t1d, t1d - 
vpsubb m4{k1}, m6, m2 - vpcmpub k2, m6, m3, 6 - vpbroadcastq m11, [r3+t1*8] - gf2p8affineqb m10, m4, m11, 0 - psubb m5, m3, m6 - mova m9, m8 - vpsubb m8{k1}, m7, m8 - psubusb m10, m12, m10 - vpsubb m5{k2}, m6, m3 - pminub m4, m10 - vpdpbusd m0, m4, m8 - gf2p8affineqb m11, m5, m11, 0 - vpsubb m9{k2}, m7, m9 - psubusb m12, m11 - pminub m5, m12 - vpdpbusd m0, m5, m9 - ret - %endif ; ARCH_X86_64 diff --git a/ffmpeg/JNI/dav1d/src/x86/cdef_avx512.asm b/ffmpeg/JNI/dav1d/src/x86/cdef_avx512.asm new file mode 100644 index 000000000..e7eee9ebf --- /dev/null +++ b/ffmpeg/JNI/dav1d/src/x86/cdef_avx512.asm @@ -0,0 +1,867 @@ +; Copyright © 2020, VideoLAN and dav1d authors +; Copyright © 2020, Two Orioles, LLC +; All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions are met: +; +; 1. Redistributions of source code must retain the above copyright notice, this +; list of conditions and the following disclaimer. +; +; 2. Redistributions in binary form must reproduce the above copyright notice, +; this list of conditions and the following disclaimer in the documentation +; and/or other materials provided with the distribution. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +%include "ext/x86/x86inc.asm" + +%if HAVE_AVX512ICL && ARCH_X86_64 + +%macro DUP4 1-* + %rep %0 + times 4 db %1 + %rotate 1 + %endrep +%endmacro + +%macro DIRS 16 ; cdef_directions[] + %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1 + ; masking away unused bits allows us to use a single vpaddd {1to16} + ; instruction instead of having to do vpbroadcastd + paddb + db %13 & 0x3f, -%13 & 0x3f + %rotate 1 + %endrep +%endmacro + +SECTION_RODATA 64 + +lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 + db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13 + db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37 + db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57 +lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79 + db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13 +lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29 + db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45 + db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61 + db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95 +pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7 +lut_perm_8x8a: db 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55 + db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87 + db 96, 97, 98, 
99,100,101,102,103,112,113,114,115,116,117,118,119 +lut_perm_8x8b: db 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27 + db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59 + db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91 + db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123 +edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001 + dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011 + dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101 + dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111 + dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001 + dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011 + dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101 + dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111 +px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45 +cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15 +gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0 + dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2 + dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4 + dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6 + times 16 db 0 ; realign (introduced by cdef_dirs) +end_perm_w8clip:db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30 + db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62 + db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31 + db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63 +end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61 + db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 +pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4 +sec_tap: db 32, 32, 16, 16 +pd_268435568: dd 268435568 + +SECTION .text + +%if WIN64 +DECLARE_REG_TMP 5, 6 +%else +DECLARE_REG_TMP 8, 5 +%endif + +; lut: +; t0 t1 t2 t3 t4 t5 t6 t7 +; T0 T1 T2 T3 T4 T5 T6 T7 +; L0 L1 00 01 02 03 04 05 +; L2 L3 10 11 12 13 14 15 +; L4 L5 20 21 22 23 
24 25 +; L6 L7 30 31 32 33 34 35 +; 4e 4f 40 41 42 43 44 45 +; 5e 5f 50 51 52 53 54 55 + +INIT_ZMM avx512icl +cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge +%define base r7-edge_mask + movq xmm0, [dstq+strideq*0] + movhps xmm0, [dstq+strideq*1] + lea r7, [edge_mask] + movq xmm1, [topq+strideq*0-2] + movhps xmm1, [topq+strideq*1-2] + mov r6d, edgem + vinserti32x4 ym0, ymm0, [leftq], 1 + lea r2, [strideq*3] + vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1 + mova m5, [base+lut_perm_4x4] + vinserti32x4 m0, [dstq+r2], 2 + test r6b, 0x08 ; avoid buffer overread + jz .main + lea r3, [dstq+strideq*4-4] + vinserti32x4 m1, [r3+strideq*0], 2 + vinserti32x4 m0, [r3+strideq*1], 3 +.main: + movifnidn prid, prim + mov t0d, dirm + mova m3, [base+px_idx] + mov r3d, dampingm + vpermi2b m5, m0, m1 ; lut + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m7, m7 + lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m6, m3, m5 ; px + cmp r6d, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1 +%macro CDEF_FILTER_4x4_PRI 0 + vpcmpub k1, m6, m1, 6 ; px > pN + psubb m2, m1, m6 + lzcnt r6d, prid + vpsubb m2{k1}, m6, m1 ; abs(diff) + vpbroadcastb m4, prid + and prid, 1 + vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift + movifnidn t1d, secm + vpbroadcastd m10, [base+pri_tap+priq*4] + vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap) + psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))) + pminub m2, m4 + vpdpbusd m0, m2, m10 ; sum +%endmacro + CDEF_FILTER_4x4_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m4, m6, m1 + pmaxub m1, m6 + pminub m5, m2, m3 + pmaxub m2, m3 + pminub m4, m5 + pmaxub m2, m1 + psrldq m1, m4, 2 + psrldq m3, m2, 2 + pminub m1, m4 + vpcmpw k1, m0, m7, 1 + vpshldd m6, m0, 8 + pmaxub m2, m3 + pslldq m3, m1, 1 + psubw m7, m0 + 
paddusw m0, m6 ; clip >0xff + vpsubusw m0{k1}, m6, m7 ; clip <0x00 + pslldq m4, m2, 1 + pminub m1, m3 + pmaxub m2, m4 + pmaxub m0, m1 + pminub m0, m2 + jmp .end +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4) + paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) +.end: + mova xm1, [base+end_perm] + vpermb m0, m1, m0 ; output in bits 8-15 of each dword + movd [dstq+strideq*0], xm0 + pextrd [dstq+strideq*1], xm0, 1 + pextrd [dstq+strideq*2], xm0, 2 + pextrd [dstq+r2 ], xm0, 3 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + vpbroadcastq m8, [base+edge_mask+r6*8] + test prid, prid + jz .mask_edges_sec_only + vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m8, m2 ; index in-range + mova m1, m6 + vpermb m1{k1}, m2, m5 + CDEF_FILTER_4x4_PRI + test t1d, t1d + jz .end_no_clip + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m8, m4 + mova m2, m6 + vpermb m2{k1}, m4, m5 + vpshufbitqmb k1, m8, m9 + mova m3, m6 + vpermb m3{k1}, m9, m5 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1 + vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3 +.sec_main: + vpbroadcastd m8, [base+sec_tap] + vpcmpub k1, m6, m2, 6 + psubb m4, m2, m6 + vpbroadcastb m12, t1d + lzcnt t1d, t1d + vpsubb m4{k1}, m6, m2 + vpcmpub k2, m6, m3, 6 + vpbroadcastq m11, [r3+t1*8] + gf2p8affineqb m10, m4, m11, 0 + psubb m5, m3, m6 + mova m9, m8 + vpsubb m8{k1}, m7, m8 + psubusb m10, m12, m10 + vpsubb m5{k2}, m6, m3 + pminub m4, m10 + vpdpbusd m0, m4, m8 + gf2p8affineqb m11, m5, m11, 0 + vpsubb m9{k2}, m7, m9 + psubusb m12, m11 + pminub m5, m12 + vpdpbusd m0, m5, m9 + ret + 
+DECLARE_REG_TMP 2, 7 + +; lut top lut bottom +; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25 +; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35 +; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45 +; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55 +; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65 +; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75 +; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85 +; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95 + +cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \ + pri, sec, dir, damping, edge +%define base r8-edge_mask + vpbroadcastd ym21, strided + mov r6d, edgem + lea r8, [edge_mask] + movq xm1, [topq+strideq*0-2] + pmulld ym21, [base+pd_01234567] + kxnorb k1, k1, k1 + movq xm2, [topq+strideq*1-2] + vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7 + mova m14, [base+lut_perm_4x8a] + movu m15, [base+lut_perm_4x8b] + test r6b, 0x08 ; avoid buffer overread + jz .main + lea r7, [dstq+strideq*8-2] + vinserti32x4 ym1, [r7+strideq*0], 1 + vinserti32x4 ym2, [r7+strideq*1], 1 +.main: + punpcklqdq ym1, ym2 + vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____ + movifnidn prid, prim + mov t0d, dirm + mova m16, [base+px_idx] + mov r3d, dampingm + vpermi2b m14, m0, m1 ; lut top + vpermi2b m15, m0, m1 ; lut bottom + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m20, m20 + lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m2, m16, m14 ; pxt + vpermb m3, m16, m15 ; pxb + mova m1, m0 + cmp r6b, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1 + vpermb m5, m6, m15 ; pNb +%macro CDEF_FILTER_4x8_PRI 0 + vpcmpub k1, m2, m4, 6 ; pxt > pNt + vpcmpub k2, m3, m5, 6 ; pxb > pNb + psubb m6, m4, m2 + psubb m7, m5, m3 + lzcnt r6d, prid + vpsubb m6{k1}, m2, m4 ; abs(diff_top) + vpsubb m7{k2}, m3, m5 ; abs(diff_bottom) + vpbroadcastb m13, prid + vpbroadcastq m9, [r3+r6*8] + 
and prid, 1 + vpbroadcastd m11, [base+pri_tap+priq*4] + vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift + vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift + mova m10, m11 + movifnidn t1d, secm + vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top) + vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom) + psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))) + psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))) + pminub m6, m12 + pminub m7, m13 + vpdpbusd m0, m6, m10 ; sum top + vpdpbusd m1, m7, m11 ; sum bottom +%endmacro + CDEF_FILTER_4x8_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m10, m4, m2 + pminub m12, m6, m8 + pminub m11, m5, m3 + pminub m13, m7, m9 + pmaxub m4, m2 + pmaxub m6, m8 + pmaxub m5, m3 + pmaxub m7, m9 + pminub m10, m12 + pminub m11, m13 + pmaxub m4, m6 + pmaxub m5, m7 + mov r2d, 0xAAAAAAAA + kmovd k1, r2d + kxnorb k2, k2, k2 ; hw lw + vpshrdd m12, m0, m1, 16 ; m1lw m0hw + vpshrdd m6, m10, m11, 16 ; m11lw m10hw + vpshrdd m8, m4, m5, 16 ; m5lw m4hw + vpblendmw m7{k1}, m10, m11 ; m11hw m10lw + vpblendmw m9{k1}, m4, m5 ; m5hw m4lw + vpblendmw m4{k1}, m0, m12 ; m1lw m0lw + vpblendmw m5{k1}, m12, m1 ; m1hw m0hw + vpshrdd m2, m3, 16 + pminub m6, m7 + pmaxub m8, m9 + mova ym14, [base+end_perm] + vpcmpw k1, m4, m20, 1 + vpshldw m2, m5, 8 + pslldq m7, m6, 1 + pslldq m9, m8, 1 + psubw m5, m20, m4 + paddusw m0, m4, m2 ; clip >0xff + pminub m6, m7 + pmaxub m8, m9 + psubusw m0{k1}, m2, m5 ; clip <0x00 + pmaxub m0, m6 + pminub m0, m8 + vpermb m0, m14, m0 + vpscatterdd [dstq+ym21]{k2}, ym0 + RET +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + mova ym4, [base+end_perm] + kxnorb k1, k1, k1 + vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m3, m1, 8 + paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddw m1, m3 + pslld m0, 16 + vpshrdd m0, m1, 16 + vpermb m0, m4, m0 ; output in bits 8-15 of each word + vpscatterdd [dstq+ym21]{k1}, ym0 + RET +.mask_edges_sec_only: + 
movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + mov t1d, r6d + or r6d, 8 ; top 4x4 has bottom + or t1d, 4 ; bottom 4x4 has top + vpbroadcastq m17, [base+edge_mask+r6*8] + vpbroadcastq m18, [base+edge_mask+t1*8] + test prid, prid + jz .mask_edges_sec_only + vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m17, m6 ; index in-range + vpshufbitqmb k2, m18, m6 + mova m4, m2 + mova m5, m3 + vpermb m4{k1}, m6, m14 + vpermb m5{k2}, m6, m15 + CDEF_FILTER_4x8_PRI + test t1d, t1d + jz .end_no_clip + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m17, m10 + vpshufbitqmb k2, m18, m10 + vpshufbitqmb k3, m17, m11 + vpshufbitqmb k4, m18, m11 + mova m6, m2 + mova m7, m3 + mova m8, m2 + mova m9, m3 + vpermb m6{k1}, m10, m14 + vpermb m7{k2}, m10, m15 + vpermb m8{k3}, m11, m14 + vpermb m9{k4}, m11, m15 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1 + vpermb m7, m8, m15 ; pNb + vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3 + vpermb m9, m9, m15 ; pNb +.sec_main: + vpbroadcastb m18, t1d + lzcnt t1d, t1d + vpcmpub k1, m2, m6, 6 + vpcmpub k2, m3, m7, 6 + vpcmpub k3, m2, m8, 6 + vpcmpub k4, m3, m9, 6 + vpbroadcastq m17, [r3+t1*8] + psubb m10, m6, m2 + psubb m11, m7, m3 + psubb m12, m8, m2 + psubb m13, m9, m3 + vpsubb m10{k1}, m2, m6 ; abs(dt0) + vpsubb m11{k2}, m3, m7 ; abs(db0) + vpsubb m12{k3}, m2, m8 ; abs(dt1) + vpsubb m13{k4}, m3, m9 ; abs(db1) + vpbroadcastd m19, [base+sec_tap] + gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift + gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift + gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift + gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift + psubusb m14, m18, m14 ; imax(0, 
sec_strength - (abs(dt0) >> shift))) + psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))) + psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))) + psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))) + pminub m10, m14 + pminub m11, m15 + pminub m12, m16 + pminub m13, m17 + mova m14, m19 + mova m15, m19 + mova m16, m19 + vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0) + vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0) + vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1) + vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1) + vpdpbusd m0, m10, m14 + vpdpbusd m1, m11, m15 + vpdpbusd m0, m12, m16 + vpdpbusd m1, m13, m19 + ret + +; lut tl lut tr +; t0 t1 t2 t3 t4 t5 t6 t7 t6 t7 t8 t9 ta tb tc td +; T0 T1 T2 T3 T4 T5 T6 T7 T6 T7 T8 T9 TA TB TC TD +; L0 L1 00 01 02 03 04 05 04 05 06 07 08 09 0a 0b +; L2 L3 10 11 12 13 14 15 14 15 16 17 18 19 1a 1b +; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b +; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b +; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b +; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b +; lut bl lut br +; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b +; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b +; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b +; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b +; Lc Ld 60 61 62 63 64 65 64 65 66 67 68 69 6a 6b +; Le Lf 70 71 72 73 74 75 74 75 76 77 78 79 7a 7b +; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b +; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b + +cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \ + pri, sec, dir, damping, edge +%define base r8-edge_mask + mov r6d, edgem + lea r10, [dstq+strideq*4-2] + movu xmm0, [topq+strideq*0-2] + movu xmm1, [dstq+strideq*2-2] + movu xmm2, [r10 +strideq*2 ] + lea r8, [edge_mask] + lea r9, [strideq*3] + pmovzxwq m10, [leftq-4] + vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1 + vinserti32x4 ym1, ymm1, [dstq+r9 -2], 1 + vinserti32x4 
ym2, ymm2, [r10 +r9 ], 1 + lea r7, [r10 +strideq*4 ] + pmovzxwq m11, [leftq+4] + vinserti32x4 m0, [dstq+strideq*0-2], 2 + vinserti32x4 m1, [r10 +strideq*0 ], 2 + mova m12, [base+lut_perm_8x8a] + movu m13, [base+lut_perm_8x8b] + vinserti32x4 m0, [dstq+strideq*1-2], 3 + vinserti32x4 m1, [r10 +strideq*1 ], 3 + test r6b, 0x08 ; avoid buffer overread + jz .main + vinserti32x4 m2, [r7 +strideq*0], 2 + vinserti32x4 m2, [r7 +strideq*1], 3 +.main: + mov t1d, 0x11111100 + mova m14, m12 + mova m15, m13 + kmovd k1, t1d + kshiftrd k2, k1, 8 + movifnidn prid, prim + mov t0d, dirm + mova m30, [base+px_idx] + mov r3d, dampingm + vpermi2b m12, m0, m1 ; lut tl + vpermi2b m14, m1, m2 ; lut bl + vpermi2b m13, m0, m1 ; lut tr + vpermi2b m15, m1, m2 ; lut br + vpblendmw m12{k1}, m12, m10 + vpblendmw m14{k2}, m14, m11 + vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4) + pxor m31, m31 + lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8 + vpermb m4, m30, m12 ; pxtl + vpermb m5, m30, m13 ; pxtr + vpermb m6, m30, m14 ; pxbl + vpermb m7, m30, m15 ; pxbr + mova m1, m0 + mova m2, m0 + mova m3, m0 + cmp r6b, 0x0f + jne .mask_edges ; mask edges only if required + test prid, prid + jz .sec_only + vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir + vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1 + vpermb m9, m11, m13 ; pNtr + vpermb m10, m11, m14 ; pNbl + vpermb m11, m11, m15 ; pNbr +%macro CDEF_FILTER_8x8_PRI 0 + vpcmpub k1, m4, m8, 6 ; pxtl > pNtl + vpcmpub k2, m5, m9, 6 ; pxtr > pNtr + vpcmpub k3, m6, m10, 6 ; pxbl > pNbl + vpcmpub k4, m7, m11, 6 ; pxbr > pNbr + psubb m16, m8, m4 + psubb m17, m9, m5 + psubb m18, m10, m6 + psubb m19, m11, m7 + lzcnt r6d, prid + vpsubb m16{k1}, m4, m8 ; abs(diff_tl) + vpsubb m17{k2}, m5, m9 ; abs(diff_tr) + vpsubb m18{k3}, m6, m10 ; abs(diff_bl) + vpsubb m19{k4}, m7, m11 ; abs(diff_br) + vpbroadcastq m28, [r3+r6*8] + vpbroadcastb m29, prid + and prid, 1 + vpbroadcastd m27, [base+pri_tap+priq*4] + vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift + 
vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift + vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift + vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift + mova m24, m27 + mova m25, m27 + mova m26, m27 + movifnidn t1d, secm + vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl) + vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr) + vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_bl) + vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_br) + psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))) + psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))) + psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))) + psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))) + pminub m16, m20 + pminub m17, m21 + pminub m18, m22 + pminub m19, m23 + vpdpbusd m0, m16, m24 ; sum tl + vpdpbusd m1, m17, m25 ; sum tr + vpdpbusd m2, m18, m26 ; sum bl + vpdpbusd m3, m19, m27 ; sum br +%endmacro + CDEF_FILTER_8x8_PRI + test t1d, t1d ; sec + jz .end_no_clip + call .sec +.end_clip: + pminub m20, m8, m4 + pminub m24, m12, m16 + pminub m21, m9, m5 + pminub m25, m13, m17 + pminub m22, m10, m6 + pminub m26, m14, m18 + pminub m23, m11, m7 + pminub m27, m15, m19 + pmaxub m8, m4 + pmaxub m12, m16 + pmaxub m9, m5 + pmaxub m13, m17 + pmaxub m10, m6 + pmaxub m14, m18 + pmaxub m11, m7 + pmaxub m15, m19 + pminub m20, m24 + pminub m21, m25 + pminub m22, m26 + pminub m23, m27 + pmaxub m8, m12 + pmaxub m9, m13 + pmaxub m10, m14 + pmaxub m11, m15 + mov r2d, 0xAAAAAAAA + kmovd k1, r2d + vpshrdd m24, m0, m1, 16 + vpshrdd m25, m2, m3, 16 + vpshrdd m12, m20, m21, 16 + vpshrdd m14, m22, m23, 16 + vpshrdd m16, m8, m9, 16 + vpshrdd m18, m10, m11, 16 + vpblendmw m13{k1}, m20, m21 + vpblendmw m15{k1}, m22, m23 + vpblendmw m17{k1}, m8, m9 + vpblendmw m19{k1}, m10, m11 + vpblendmw m20{k1}, m0, m24 + vpblendmw m21{k1}, m24, m1 + vpblendmw m22{k1}, m2, m25 + vpblendmw m23{k1}, m25, m3 + vpshrdd m4, m5, 16 + vpshrdd m6, m7, 16 + pminub m12, m13 + pminub m14, m15 +
pmaxub m16, m17 + pmaxub m18, m19 + mova m8, [base+end_perm_w8clip] + vpcmpw k2, m20, m31, 1 + vpcmpw k3, m22, m31, 1 + vpshldw m4, m21, 8 + vpshldw m6, m23, 8 + kunpckdq k1, k1, k1 + kxnorb k4, k4, k4 + vpshrdw m11, m12, m14, 8 + vpshrdw m15, m16, m18, 8 + vpblendmb m13{k1}, m12, m14 + vpblendmb m17{k1}, m16, m18 + psubw m21, m31, m20 + psubw m23, m31, m22 + paddusw m0, m20, m4 ; clip >0xff + paddusw m1, m22, m6 + pminub m11, m13 + pmaxub m15, m17 + psubusw m0{k2}, m4, m21 ; clip <0x00 + psubusw m1{k3}, m6, m23 + psrlw m0, 8 + vmovdqu8 m0{k1}, m1 + pmaxub m0, m11 + pminub m0, m15 + vpermb m0, m8, m0 + add r10, 2 + vextracti32x4 xm1, m0, 1 + vextracti32x4 xm2, m0, 2 + vextracti32x4 xm3, m0, 3 + movq [dstq+strideq*0], xm0 + movq [dstq+strideq*2], xm1 + movq [r10 +strideq*0], xm2 + movq [r10 +strideq*2], xm3 + movhps [dstq+strideq*1], xm0 + movhps [dstq+r9 ], xm1 + movhps [r10 +strideq*1], xm2 + movhps [r10 +r9 ], xm3 + RET +.sec_only: + movifnidn t1d, secm + call .sec +.end_no_clip: + mova xm8, [base+end_perm] + kxnorb k1, k1, k1 + vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4) + vpshldd m5, m1, 8 + vpshldd m6, m2, 8 + vpshldd m7, m3, 8 + paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4) + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + vpermb m0, m8, m0 + vpermb m1, m8, m1 + vpermb m2, m8, m2 + vpermb m3, m8, m3 + add r10, 2 + punpckldq m4, m0, m1 + punpckhdq m0, m1 + punpckldq m5, m2, m3 + punpckhdq m2, m3 + movq [dstq+strideq*0], xm4 + movq [dstq+strideq*2], xm0 + movq [r10 +strideq*0], xm5 + movq [r10 +strideq*2], xm2 + movhps [dstq+strideq*1], xm4 + movhps [dstq+r9 ], xm0 + movhps [r10 +strideq*1], xm5 + movhps [r10 +r9 ], xm2 + RET +.mask_edges_sec_only: + movifnidn t1d, secm + call .mask_edges_sec + jmp .end_no_clip +ALIGN function_align +.mask_edges: + mov t0d, r6d + mov t1d, r6d + or t0d, 0xA ; top-left 4x4 has bottom and right + or t1d, 0x9 ; top-right 4x4 has bottom and left + vpbroadcastq m26, [base+edge_mask+t0*8] + vpbroadcastq m27, 
[base+edge_mask+t1*8] + mov t1d, r6d + or r6d, 0x6 ; bottom-left 4x4 has top and right + or t1d, 0x5 ; bottom-right 4x4 has top and left + vpbroadcastq m28, [base+edge_mask+r6*8] + vpbroadcastq m29, [base+edge_mask+t1*8] + mov t0d, dirm + test prid, prid + jz .mask_edges_sec_only + vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16} + vpshufbitqmb k1, m26, m20 ; index in-range + vpshufbitqmb k2, m27, m20 + vpshufbitqmb k3, m28, m20 + vpshufbitqmb k4, m29, m20 + mova m8, m4 + mova m9, m5 + mova m10, m6 + mova m11, m7 + vpermb m8{k1}, m20, m12 + vpermb m9{k2}, m20, m13 + vpermb m10{k3}, m20, m14 + vpermb m11{k4}, m20, m15 + mova [rsp+0x00], m26 + mova [rsp+0x40], m27 + mova [rsp+0x80], m28 + mova [rsp+0xC0], m29 + CDEF_FILTER_8x8_PRI + test t1d, t1d + jz .end_no_clip + mova m26, [rsp+0x00] + mova m27, [rsp+0x40] + mova m28, [rsp+0x80] + mova m29, [rsp+0xC0] + call .mask_edges_sec + jmp .end_clip +.mask_edges_sec: + vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} + vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} + vpshufbitqmb k1, m26, m20 + vpshufbitqmb k2, m27, m20 + vpshufbitqmb k3, m28, m20 + vpshufbitqmb k4, m29, m20 + mova m16, m4 + mova m17, m5 + mova m18, m6 + mova m19, m7 + vpermb m16{k1}, m20, m12 + vpermb m17{k2}, m20, m13 + vpermb m18{k3}, m20, m14 + vpermb m19{k4}, m20, m15 + vpshufbitqmb k1, m26, m21 + vpshufbitqmb k2, m27, m21 + vpshufbitqmb k3, m28, m21 + vpshufbitqmb k4, m29, m21 + vpermb m12, m21, m12 + vpermb m13, m21, m13 + vpermb m14, m21, m14 + vpermb m15, m21, m15 + vpblendmb m12{k1}, m4, m12 + vpblendmb m13{k2}, m5, m13 + vpblendmb m14{k3}, m6, m14 + vpblendmb m15{k4}, m7, m15 + jmp .sec_main +ALIGN function_align +.sec: + vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2 + vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2 + vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1 + vpermb m17, m20, m13 ; pNtr + vpermb m18, m20, m14 ; pNbl + vpermb m19, m20, m15 ; pNbr + vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3 + vpermb 
m13, m21, m13 ; pNtr + vpermb m14, m21, m14 ; pNbl + vpermb m15, m21, m15 ; pNbr +.sec_main: +%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants + vpcmpub k1, m4, %1, 6 + vpcmpub k2, m5, %2, 6 + vpcmpub k3, m6, %3, 6 + vpcmpub k4, m7, %4, 6 + psubb m20, %1, m4 + psubb m21, %2, m5 + psubb m22, %3, m6 + psubb m23, %4, m7 +%if %5 + vpbroadcastb m28, t1d + lzcnt t1d, t1d + vpbroadcastq m29, [r3+t1*8] +%endif + vpsubb m20{k1}, m4, %1 + vpsubb m21{k2}, m5, %2 + vpsubb m22{k3}, m6, %3 + vpsubb m23{k4}, m7, %4 + gf2p8affineqb m24, m20, m29, 0 + gf2p8affineqb m25, m21, m29, 0 + gf2p8affineqb m26, m22, m29, 0 + gf2p8affineqb m27, m23, m29, 0 +%if %5 + vpbroadcastd m30, [base+sec_tap] +%endif + psubusb m24, m28, m24 + psubusb m25, m28, m25 + psubusb m26, m28, m26 + psubusb m27, m28, m27 + pminub m20, m24 + pminub m21, m25 + pminub m22, m26 + pminub m23, m27 + mova m24, m30 + mova m25, m30 + mova m26, m30 + mova m27, m30 + vpsubb m24{k1}, m31, m30 + vpsubb m25{k2}, m31, m30 + vpsubb m26{k3}, m31, m30 + vpsubb m27{k4}, m31, m30 + vpdpbusd m0, m20, m24 + vpdpbusd m1, m21, m25 + vpdpbusd m2, m22, m26 + vpdpbusd m3, m23, m27 +%endmacro + CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1 + CDEF_FILTER_8x8_SEC m12, m13, m14, m15 + ret + +%endif ; HAVE_AVX512ICL && ARCH_X86_64 diff --git a/ffmpeg/JNI/dav1d/src/x86/cdef_init_tmpl.c b/ffmpeg/JNI/dav1d/src/x86/cdef_init_tmpl.c index e9077fc7e..edc3b5d4b 100644 --- a/ffmpeg/JNI/dav1d/src/x86/cdef_init_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/x86/cdef_init_tmpl.c @@ -84,7 +84,9 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) { if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; -#if BITDEPTH == 8 +#if HAVE_AVX512ICL && BITDEPTH == 8 + c->fb[0] = dav1d_cdef_filter_8x8_avx512icl; + c->fb[1] = dav1d_cdef_filter_4x8_avx512icl; c->fb[2] = dav1d_cdef_filter_4x4_avx512icl; #endif diff --git a/ffmpeg/JNI/dav1d/src/x86/film_grain.asm b/ffmpeg/JNI/dav1d/src/x86/film_grain.asm index 5b596aba0..94ee123a9 100644 --- 
a/ffmpeg/JNI/dav1d/src/x86/film_grain.asm +++ b/ffmpeg/JNI/dav1d/src/x86/film_grain.asm @@ -28,6 +28,8 @@ %if ARCH_X86_64 SECTION_RODATA 32 +pb_8x_27_17_8x_17_27: times 8 db 27, 17 + times 8 db 17, 27 pw_1024: times 16 dw 1024 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058 @@ -60,6 +62,8 @@ pw_1: dw 1 ALIGN 4 JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3 struc FGData .seed: resd 1 @@ -413,8 +417,9 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data jg .y_loop_ar3 RET +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM avx2 -cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv +cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv lea r4, [pb_mask] %define base r4-pb_mask movq xm1, [base+rnd_next_upperbit_mask] @@ -428,11 +433,17 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv pxor xm0, xm9 vpbroadcastd xm9, [base+pd_m65536] lea r6, [gaussian_sequence] - mov r7d, 38 +%if %2 + mov r7d, 73-35*%3 add bufq, 44 .loop_y: mov r5, -44 .loop_x: +%else + mov r5, -73*82 + sub bufq, r5 +.loop: +%endif pand xm2, xm0, xm1 psrlw xm3, xm2, 10 por xm2, xm3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set @@ -455,15 +466,19 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv packsswb xm2, xm2 movd [bufq+r5], xm2 add r5, 4 +%if %2 jl .loop_x add bufq, 82 dec r7d jg .loop_y +%else + jl .loop +%endif ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_420_avx2_table+r5*4] - lea r5, [r5+base+generate_grain_uv_420_avx2_table] + movsxd r5, [base+generate_grain_uv_%1_avx2_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_avx2_table] jmp r5 .ar0: @@ -475,63 +490,126 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv movd xm3, 
[base+hmul_bits+shiftq*2] DEFINE_ARGS buf, bufy, h pmovsxbw xm4, xm4 +%if %2 vpbroadcastd m7, [pb_1] - vpbroadcastw m6, [hmul_bits+4] + vpbroadcastw m6, [hmul_bits+2+%3*2] +%endif vpbroadcastw m4, xm4 vpbroadcastw m3, xm3 - sub bufq, 82*38+82-(82*3+41) + pxor m12, m12 +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif add bufyq, 3+82*3 - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar0: +%if %2 ; first 32 pixels movu xm8, [bufyq] +%if %3 movu xm9, [bufyq+82] +%endif movu xm10, [bufyq+16] +%if %3 movu xm11, [bufyq+82+16] +%endif vinserti128 m8, [bufyq+32], 1 +%if %3 vinserti128 m9, [bufyq+82+32], 1 +%endif vinserti128 m10, [bufyq+48], 1 +%if %3 vinserti128 m11, [bufyq+82+48], 1 +%endif pmaddubsw m8, m7, m8 +%if %3 pmaddubsw m9, m7, m9 +%endif pmaddubsw m10, m7, m10 +%if %3 pmaddubsw m11, m7, m11 paddw m8, m9 paddw m10, m11 +%endif pmulhrsw m8, m6 pmulhrsw m10, m6 +%else + xor r3d, r3d + ; first 32x2 pixels +.x_loop_ar0: + movu m8, [bufyq+r3] + pcmpgtb m9, m12, m8 + punpckhbw m10, m8, m9 + punpcklbw m8, m9 +%endif pmullw m8, m4 pmullw m10, m4 pmulhrsw m8, m3 pmulhrsw m10, m3 - packsswb m8, m10 +%if %2 movu m0, [bufq] - punpckhbw m1, m0, m8 - punpcklbw m0, m8 - pmaddubsw m1, m7, m1 - pmaddubsw m0, m7, m0 - packsswb m0, m1 +%else + movu m0, [bufq+r3] +%endif + pcmpgtb m1, m12, m0 + punpckhbw m9, m0, m1 + punpcklbw m0, m1 + paddw m0, m8 + paddw m9, m10 + packsswb m0, m9 +%if %2 movu [bufq], m0 +%else + movu [bufq+r3], m0 + add r3d, 32 + cmp r3d, 64 + jl .x_loop_ar0 +%endif - ; last 6 pixels + ; last 6/12 pixels movu xm8, [bufyq+32*2] +%if %2 +%if %3 movu xm9, [bufyq+32*2+82] +%endif pmaddubsw xm8, xm7, xm8 +%if %3 pmaddubsw xm9, xm7, xm9 paddw xm8, xm9 +%endif pmulhrsw xm8, xm6 pmullw xm8, xm4 pmulhrsw xm8, xm3 - packsswb xm8, xm8 movq xm0, [bufq+32] - punpcklbw xm8, xm0 - pmaddubsw xm8, xm7, xm8 + pcmpgtb xm9, xm12, xm0 + punpcklbw xm9, xm0, xm9 + paddw xm8, xm9 packsswb xm8, xm8 vpblendw xm0, xm8, xm0, 1000b movq [bufq+32], xm0 +%else + pcmpgtb 
xm9, xm12, xm8 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + pmullw xm10, xm4 + pmullw xm8, xm4 + pmulhrsw xm10, xm3 + pmulhrsw xm8, xm3 + movu xm0, [bufq+64] + pcmpgtb xm9, xm12, xm0 + punpcklbw xm1, xm0, xm9 + punpckhbw xm9, xm0, xm9 + paddw xm1, xm8 + paddw xm9, xm10 + packsswb xm1, xm9 + vpblendw xm0, xm1, xm0, 11000000b + movu [bufq+64], xm0 +%endif add bufq, 82 - add bufyq, 82*2 + add bufyq, 82<<%3 dec hd jg .y_loop_ar0 RET @@ -549,27 +627,43 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv pshufd xm5, xm4, q1111 pshufd xm4, xm4, q0000 pmovsxwd xm3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 vpbroadcastd xm7, [pb_1] - vpbroadcastw xm6, [hmul_bits+4] + vpbroadcastw xm6, [hmul_bits+2+%3*2] +%endif vpbroadcastd xm3, xm3 - sub bufq, 82*38+44-(82*3+41) +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif add bufyq, 79+82*3 - mov hd, 35 + mov hd, 70-35*%3 mov mind, -128 mov maxd, 127 .y_loop_ar1: - mov xq, -38 + mov xq, -(76>>%2) movsx val3d, byte [bufq+xq-1] .x_loop_ar1: pmovsxbw xm0, [bufq+xq-82-1] ; top/left +%if %2 movq xm8, [bufyq+xq*2] +%if %3 movq xm9, [bufyq+xq*2+82] +%endif +%endif psrldq xm2, xm0, 2 ; top psrldq xm1, xm0, 4 ; top/right +%if %2 pmaddubsw xm8, xm7, xm8 +%if %3 pmaddubsw xm9, xm7, xm9 paddw xm8, xm9 +%endif pmulhrsw xm8, xm6 +%else + pmovsxbw xm8, [bufyq+xq] +%endif punpcklwd xm0, xm2 punpcklwd xm1, xm8 pmaddwd xm0, xm4 @@ -598,7 +692,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv .x_loop_ar1_end: add bufq, 82 - add bufyq, 82*2 + add bufyq, 82<<%3 dec hd jg .y_loop_ar1 RET @@ -611,8 +705,10 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv pmovsxbw xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0] ; cf0-7 pmovsxbw xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8] ; cf8-12 pinsrw xm9, [base+pw_1], 5 - vpbroadcastw xm7, [base+hmul_bits+4] +%if %2 + vpbroadcastw xm7, [base+hmul_bits+2+%3*2] vpbroadcastd xm6, [base+pb_1] +%endif DEFINE_ARGS buf, bufy, fg_data, h, 
unused, x pshufd xm12, xm9, q0000 pshufd xm13, xm9, q1111 @@ -621,11 +717,15 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv pshufd xm10, xm8, q2222 pshufd xm9, xm8, q1111 pshufd xm8, xm8, q0000 - sub bufq, 82*38+44-(82*3+41) +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif add bufyq, 79+82*3 - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar2: - mov xq, -38 + mov xq, -(76>>%2) .x_loop_ar2: pmovsxbw xm0, [bufq+xq-82*2-2] ; y=-2,x=[-2,+5] @@ -654,12 +754,20 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv paddd xm2, xm3 paddd xm2, xm4 +%if %2 movq xm0, [bufyq+xq*2] +%if %3 movq xm3, [bufyq+xq*2+82] +%endif pmaddubsw xm0, xm6, xm0 +%if %3 pmaddubsw xm3, xm6, xm3 paddw xm0, xm3 +%endif pmulhrsw xm0, xm7 +%else + pmovsxbw xm0, [bufyq+xq] +%endif punpcklwd xm0, xm15 pmaddwd xm0, xm14 paddd xm2, xm0 @@ -685,7 +793,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv .x_loop_ar2_end: add bufq, 82 - add bufyq, 82*2 + add bufyq, 82<<%3 dec hd jg .y_loop_ar2 RET @@ -730,14 +838,20 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv mova [rsp+ 9*16], xm3 mova [rsp+10*16], xm4 mova [rsp+11*16], xm5 +%if %2 vpbroadcastd xm13, [base+pb_1] - vpbroadcastw xm15, [base+hmul_bits+4] + vpbroadcastw xm15, [base+hmul_bits+2+%3*2] +%endif DEFINE_ARGS buf, bufy, fg_data, h, unused, x - sub bufq, 82*38+44-(82*3+41) +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*70-(82-3) +%endif add bufyq, 79+82*3 - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar3: - mov xq, -38 + mov xq, -(76>>%2) .x_loop_ar3: movu xm0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] @@ -800,12 +914,20 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv palignr xm9, xm5, xm2, 10 palignr xm5, xm5, xm2, 12 +%if %2 movq xm1, [bufyq+xq*2] +%if %3 movq xm2, [bufyq+xq*2+82] +%endif pmaddubsw xm1, xm13, xm1 +%if %3 pmaddubsw xm2, xm13, xm2 paddw xm1, xm2 +%endif pmulhrsw xm1, xm15 +%else + pmovsxbw xm1, 
[bufyq+xq] +%endif punpcklwd xm6, xm7 punpcklwd xm8, xm9 @@ -841,10 +963,15 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv .x_loop_ar3_end: add bufq, 82 - add bufyq, 82*2 + add bufyq, 82<<%3 dec hd jg .y_loop_ar3 RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 INIT_YMM avx2 cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut @@ -1188,9 +1315,8 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut jz .end_y_v_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines - xor hd, 0x10000 - test hd, 0x10000 - jnz .loop_y_v_overlap + btc hd, 16 + jnc .loop_y_v_overlap jmp .loop_y .end_y_v_overlap: @@ -1321,9 +1447,8 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut jz .end_y_hv_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines - xor hd, 0x10000 - test hd, 0x10000 - jnz .loop_y_hv_overlap + btc hd, 16 + jnc .loop_y_hv_overlap jmp .loop_y_h_overlap .end_y_hv_overlap: @@ -1334,8 +1459,9 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut .end_hv: RET -cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ - grain_lut, h, sby, luma, lstride, uv_pl, is_id +%macro FGUV_FN 3 ; name, ss_hor, ss_ver +cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, h, sby, luma, lstride, uv_pl, is_id pcmpeqw m10, m10 psrld m10, 24 mov r7d, [fg_dataq+FGData.scaling_shift] @@ -1351,7 +1477,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl -%macro FGUV_32x32xN_LOOP 1 ; not-csfl +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap %if 
%1 @@ -1362,7 +1488,11 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ vpbroadcastw m15, [fg_dataq+FGData.uv_offset+r7*4] %else vpbroadcastd m14, [pw_1024] +%if %2 vpbroadcastd m15, [pb_23_22] +%else + vpbroadcastd xm15, [pb_27_17_17_27] +%endif %endif mov overlapd, [fg_dataq+FGData.overlap_flag] @@ -1384,7 +1514,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov lumaq, r9mp lea r12, [srcq+wq] lea r13, [dstq+wq] - lea r14, [lumaq+wq*2] + lea r14, [lumaq+wq*(1+%2)] mov r11mp, r12 mov r12mp, r13 mov lstrideq, r10mp @@ -1405,8 +1535,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ rorx offyd, seed, 8 shr offxd, 12 and offyd, 0xf - imul offyd, 82 - lea offyq, [offyq+offxq+498] ; offy*stride+offx + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, overlap, unused1, unused2, lstride @@ -1415,21 +1545,29 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov grain_lutq, grain_lutmp %%loop_y: ; src +%if %2 mova xm4, [lumaq+lstrideq*0+ 0] mova xm6, [lumaq+lstrideq*0+16] mova xm0, [srcq] vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*2 +0], 1 - vinserti128 m6, [lumaq+lstrideq*2+16], 1 + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 vinserti128 m0, [srcq+strideq], 1 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 +%else + pxor m2, m2 + mova m4, [lumaq] + mova m0, [srcq] +%endif %if %1 +%if %2 packuswb m4, m6 ; luma +%endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 @@ -1441,6 +1579,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 %endif 
punpckhwd m5, m4, m2 @@ -1469,8 +1610,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ punpcklbw m0, m2 ; m0-1: src as word ; grain = grain_lut[offy+y][offx+x] +%if %2 movu xm3, [grain_lutq+offxyq+ 0] vinserti128 m3, [grain_lutq+offxyq+82], 1 +%else + movu m3, [grain_lutq+offxyq] +%endif pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 punpckhbw m3, m7 @@ -1489,21 +1634,31 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 +%if %2 mova [dstq], xm0 vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif +%if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] - add grain_lutq, 82*2 - sub hb, 2 + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub hb, 1+%2 jg %%loop_y - add wq, 16 + add wq, 32>>%2 jge %%end mov srcq, r11mp mov dstq, r12mp - lea lumaq, [r14+wq*2] + lea lumaq, [r14+wq*(1+%2)] add srcq, wq add dstq, wq test overlapd, overlapd @@ -1525,13 +1680,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, lstride - lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx + lea left_offxyd, [offyd+(32>>%2)] ; previous column's offy*stride+offx mov offxd, seed rorx offyd, seed, 8 shr offxd, 12 and offyd, 0xf - imul offyd, 82 - lea offyq, [offyq+offxq+498] ; offy*stride+offx + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, left_offxy, unused1, unused2, lstride @@ -1540,21 +1695,29 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov grain_lutq, grain_lutmp %%loop_y_h_overlap: ; src +%if %2 mova xm4, [lumaq+lstrideq*0+ 0] 
mova xm6, [lumaq+lstrideq*0+16] mova xm0, [srcq] vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*2 +0], 1 - vinserti128 m6, [lumaq+lstrideq*2+16], 1 + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 vinserti128 m0, [srcq+strideq], 1 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif %if %1 +%if %2 packuswb m4, m6 ; luma +%endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 @@ -1566,6 +1729,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 %endif punpckhwd m5, m4, m2 @@ -1594,6 +1760,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ punpcklbw m0, m2 ; m0-1: src as word ; grain = grain_lut[offy+y][offx+x] +%if %2 %if %1 vpbroadcastd m6, [pb_23_22] ; FIXME %endif @@ -1613,6 +1780,25 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ pcmpeqw m6, m6 ; FIXME psrldq m6, 15 ; FIXME vpblendvb m3, m3, m4, m6 +%else +%if %1 + vpbroadcastd xm6, [pb_27_17_17_27] +%endif + movu m3, [grain_lutq+offxyq] + movd xm4, [grain_lutq+left_offxyq] + punpcklbw xm4, xm3 +%if %1 + pmaddubsw xm4, xm6, xm4 + pmulhrsw xm4, [pw_1024] +%else + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 +%endif + packsswb xm4, xm4 + pcmpeqw xm6, xm6 + psrldq xm6, 14 + vpblendvb m3, m3, m4, m6 +%endif pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 punpckhbw m3, m7 @@ -1631,21 +1817,31 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ pminsw m0, m12 pminsw m1, m12 packuswb m0, m1 +%if %2 mova [dstq], xm0 vextracti128 [dstq+strideq], m0, 1 +%else + mova [dstq], m0 +%endif +%if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] - add grain_lutq, 82*2 - sub 
hb, 2 + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82*(1+%2) + sub hb, 1+%2 jg %%loop_y_h_overlap - add wq, 16 + add wq, 32>>%2 jge %%end mov srcq, r11mp mov dstq, r12mp - lea lumaq, [r14+wq*2] + lea lumaq, [r14+wq*(1+%2)] add srcq, wq add dstq, wq @@ -1678,7 +1874,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov lumaq, r9mp lea r12, [srcq+wq] lea r13, [dstq+wq] - lea r14, [lumaq+wq*2] + lea r14, [lumaq+wq*(1+%2)] mov r11mp, r12 mov r12mp, r13 mov lstrideq, r10mp @@ -1705,9 +1901,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f - imul offyd, 82 + imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq+0x10001*498+16*82] + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, overlap, top_offxy, unused, lstride @@ -1717,23 +1913,34 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov hd, hm mov grain_lutq, grain_lutmp +%if %2 == 0 + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] +%endif %%loop_y_v_overlap: ; src +%if %2 mova xm4, [lumaq+lstrideq*0+ 0] mova xm6, [lumaq+lstrideq*0+16] mova xm0, [srcq] vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*2 +0], 1 - vinserti128 m6, [lumaq+lstrideq*2+16], 1 + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 vinserti128 m0, [srcq+strideq], 1 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif %if %1 +%if %2 packuswb m4, m6 ; luma +%endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 @@ -1745,6 +1952,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, 
scaling, \ packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 %endif punpckhwd m5, m4, m2 @@ -1768,11 +1978,42 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packusdw m8, m4 packusdw m5, m6 +%if %2 ; unpack chroma_source punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word +%endif ; grain = grain_lut[offy+y][offx+x] +%if %3 == 0 +%if %2 + mova m6, [pb_8x_27_17_8x_17_27] + movu xm3, [grain_lutq+offxyq] + movu xm4, [grain_lutq+top_offxyq] + vinserti128 m3, [grain_lutq+offxyq+82], 1 + vinserti128 m4, [grain_lutq+top_offxyq+82], 1 +%else + movu m3, [grain_lutq+offxyq] + movu m4, [grain_lutq+top_offxyq] +%endif + punpckhbw m9, m4, m3 + punpcklbw m4, m3 +%if %2 + pmaddubsw m9, m6, m9 + pmaddubsw m4, m6, m4 +%else + pmaddubsw m9, m1, m9 + pmaddubsw m4, m1, m4 +%endif +%if %1 + pmulhrsw m9, [pw_1024] + pmulhrsw m4, [pw_1024] +%else + pmulhrsw m9, m14 + pmulhrsw m4, m14 +%endif + packsswb m3, m4, m9 +%else %if %1 vpbroadcastd m6, [pb_23_22] %endif @@ -1792,6 +2033,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ vpermq m4, m4, q3120 ; only interpolate first line, insert second line unmodified vinserti128 m3, m4, [grain_lutq+offxyq+82], 1 +%endif pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 punpckhbw m3, m7 @@ -1803,6 +2045,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ pmulhrsw m3, m11 ; dst = clip_pixel(src, noise) +%if %2 paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 @@ -1812,21 +2055,46 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packuswb m0, m1 mova [dstq], xm0 vextracti128 [dstq+strideq], m0, 1 +%else + pxor m6, m6 + punpckhbw m9, m0, m6 + punpcklbw m0, m6 ; m0-1: src as word - sub hb, 2 + paddw m0, m2 + paddw m9, m3 + pmaxsw m0, m13 + pmaxsw m9, m13 + pminsw m0, m12 + pminsw m9, m12 + packuswb m0, m9 + mova [dstq], m0 +%endif + + sub hb, 1+%2 jl 
%%end_y_v_overlap +%if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] - add grain_lutq, 82*2 + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 +%if %2 == 0 + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] + btc hd, 16 + jnc %%loop_y_v_overlap +%endif jmp %%loop_y %%end_y_v_overlap: - add wq, 16 + add wq, 32>>%2 jge %%end_hv mov srcq, r11mp mov dstq, r12mp - lea lumaq, [r14+wq*2] + lea lumaq, [r14+wq*(1+%2)] add srcq, wq add dstq, wq @@ -1851,15 +2119,15 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride - lea topleft_offxyq, [top_offxyq+16] - lea left_offxyq, [offyq+16] + lea topleft_offxyq, [top_offxyq+(32>>%2)] + lea left_offxyq, [offyq+(32>>%2)] rorx offyd, seed, 8 rorx offxd, seed, 12 and offyd, 0xf000f and offxd, 0xf000f - imul offyd, 82 + imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq+0x10001*498+16*82] + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride @@ -1869,23 +2137,34 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov hd, hm mov grain_lutq, grain_lutmp +%if %2 == 0 + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27] +%endif %%loop_y_hv_overlap: ; src +%if %2 mova xm4, [lumaq+lstrideq*0+ 0] mova xm6, [lumaq+lstrideq*0+16] mova xm0, [srcq] vpbroadcastd m7, [pb_1] - vinserti128 m4, [lumaq+lstrideq*2 +0], 1 - vinserti128 m6, [lumaq+lstrideq*2+16], 1 + vinserti128 m4, [lumaq+lstrideq*(1+%3) +0], 1 + vinserti128 m6, [lumaq+lstrideq*(1+%3)+16], 1 vinserti128 m0, [srcq+strideq], 1 pxor m2, m2 pmaddubsw m4, m7 pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 
+%else + mova m4, [lumaq] + mova m0, [srcq] + pxor m2, m2 +%endif %if %1 +%if %2 packuswb m4, m6 ; luma +%endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 @@ -1897,6 +2176,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 %endif punpckhwd m5, m4, m2 @@ -1920,44 +2202,94 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packusdw m8, m4 packusdw m5, m6 +%if %2 ; unpack chroma source punpckhbw m1, m0, m2 punpcklbw m0, m2 ; m0-1: src as word +%endif ; grain = grain_lut[offy+y][offx+x] %if %1 +%if %2 vpbroadcastd m9, [pb_23_22] +%else + vpbroadcastd xm9, [pb_27_17_17_27] %endif +%endif + +%if %2 movu xm3, [grain_lutq+offxyq] +%if %3 movq xm6, [grain_lutq+top_offxyq] +%else + movu xm6, [grain_lutq+top_offxyq] +%endif vinserti128 m3, [grain_lutq+offxyq+82], 1 +%if %3 vinserti128 m6, [grain_lutq+top_offxyq+8], 1 +%else + vinserti128 m6, [grain_lutq+top_offxyq+82], 1 +%endif +%else + movu m3, [grain_lutq+offxyq] + movu m6, [grain_lutq+top_offxyq] +%endif movd xm4, [grain_lutq+left_offxyq] movd xm7, [grain_lutq+topleft_offxyq] +%if %2 vinserti128 m4, [grain_lutq+left_offxyq+82], 1 +%if %3 == 0 + vinserti128 m7, [grain_lutq+topleft_offxyq+82], 1 +%endif +%endif + ; do h interpolation first (so top | top/left -> top, left | cur -> cur) +%if %2 punpcklbw m4, m3 +%if %3 punpcklbw xm7, xm6 +%else + punpcklbw m7, m6 +%endif + punpcklwd m4, m7 %if %1 pmaddubsw m4, m9, m4 - pmaddubsw xm7, xm9, xm7 pmulhrsw m4, [pw_1024] - pmulhrsw xm7, [pw_1024] %else pmaddubsw m4, m15, m4 - pmaddubsw xm7, xm15, xm7 pmulhrsw m4, m14 - pmulhrsw xm7, xm14 %endif packsswb m4, m4 - packsswb xm7, xm7 pcmpeqw m9, m9 ; this is kind of ugly psrldq m9, 15 vpblendvb m3, m3, m4, m9 - shufpd m9, m9, m9, 1110b - vpblendvb m6, m6, m7, m9 - vpermq m9, m3, q3120 + psrldq m4, 1 +%if %3 + 
shufpd m9, m9, m9, 1110b ; clear upper lane +%endif + vpblendvb m6, m6, m4, m9 +%else + punpcklbw xm4, xm3 + punpcklbw xm7, xm6 + punpckldq xm4, xm7 +%if %1 + pmaddubsw xm4, xm9, xm4 + pmulhrsw xm4, [pw_1024] +%else + pmaddubsw xm4, xm15, xm4 + pmulhrsw xm4, xm14 +%endif + packsswb xm4, xm4 + pcmpeqw xm9, xm9 ; this is kind of ugly + psrldq xm9, 14 + vpblendvb m3, m3, m4, m9 + psrldq xm4, 2 + vpblendvb m6, m6, m4, m9 +%endif + ; followed by v interpolation (top | cur -> cur) +%if %3 + vpermq m9, m3, q3120 punpcklbw m6, m9 %if %1 vpbroadcastd m9, [pb_23_22] @@ -1970,6 +2302,26 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packsswb m6, m6 vpermq m6, m6, q3120 vpblendd m3, m3, m6, 00001111b +%else + punpckhbw m9, m6, m3 + punpcklbw m6, m3 +%if %2 + mova m3, [pb_8x_27_17_8x_17_27] + pmaddubsw m9, m3, m9 + pmaddubsw m6, m3, m6 +%else + pmaddubsw m9, m1, m9 + pmaddubsw m6, m1, m6 +%endif +%if %1 + pmulhrsw m9, [pw_1024] + pmulhrsw m6, [pw_1024] +%else + pmulhrsw m9, m14 + pmulhrsw m6, m14 +%endif + packsswb m3, m6, m9 +%endif pcmpgtb m7, m2, m3 punpcklbw m2, m3, m7 punpckhbw m3, m7 @@ -1981,6 +2333,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ pmulhrsw m3, m11 ; dst = clip_pixel(src, noise) +%if %2 paddw m0, m2 paddw m1, m3 pmaxsw m0, m13 @@ -1990,20 +2343,47 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packuswb m0, m1 mova [dstq], xm0 vextracti128 [dstq+strideq], m0, 1 +%else + pxor m6, m6 + punpckhbw m9, m0, m6 + punpcklbw m0, m6 ; m0-1: src as word + paddw m0, m2 + paddw m9, m3 + pmaxsw m0, m13 + pmaxsw m9, m13 + pminsw m0, m12 + pminsw m9, m12 + packuswb m0, m9 + mova [dstq], m0 +%endif +%if %2 lea srcq, [srcq+strideq*2] lea dstq, [dstq+strideq*2] - lea lumaq, [lumaq+lstrideq*4] - add grain_lutq, 82*2 - sub hb, 2 + lea lumaq, [lumaq+lstrideq*(2<<%3)] +%else + add srcq, strideq + add dstq, strideq + add lumaq, lstrideq +%endif + add grain_lutq, 82<<%2 + sub 
hb, 1+%2 +%if %2 jg %%loop_y_h_overlap +%else + je %%end_y_hv_overlap + vbroadcasti128 m1, [pb_8x_27_17_8x_17_27+16] + btc hd, 16 + jnc %%loop_y_hv_overlap + jmp %%loop_y_h_overlap +%endif %%end_y_hv_overlap: - add wq, 16 + add wq, 32>>%2 jge %%end_hv mov srcq, r11mp mov dstq, r12mp - lea lumaq, [r14+wq*2] + lea lumaq, [r14+wq*(1+%2)] add srcq, wq add dstq, wq jmp %%loop_x_hv_overlap @@ -2012,8 +2392,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ RET %endmacro - FGUV_32x32xN_LOOP 1 + %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: - FGUV_32x32xN_LOOP 0 + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 +FGUV_FN 422, 1, 0 +FGUV_FN 444, 0, 0 %endif ; ARCH_X86_64 diff --git a/ffmpeg/JNI/dav1d/src/x86/film_grain_init_tmpl.c b/ffmpeg/JNI/dav1d/src/x86/film_grain_init_tmpl.c index 30bb52d06..25e8ef99e 100644 --- a/ffmpeg/JNI/dav1d/src/x86/film_grain_init_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/x86/film_grain_init_tmpl.c @@ -30,13 +30,21 @@ decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3); decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3); +decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3); +decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3); decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3); decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3); +decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3); +decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3); decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2); decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2); +decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2); +decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2); decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2); decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2); +decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2); +decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2); COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) { const unsigned flags = 
dav1d_get_cpu_flags(); @@ -46,8 +54,12 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c #if BITDEPTH == 8 c->generate_grain_y = dav1d_generate_grain_y_ssse3; c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_ssse3; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3; c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3; c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3; #endif if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return; @@ -55,7 +67,11 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c #if BITDEPTH == 8 && ARCH_X86_64 c->generate_grain_y = dav1d_generate_grain_y_avx2; c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_avx2; + c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2; c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2; c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2; + c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2; #endif } diff --git a/ffmpeg/JNI/dav1d/src/x86/film_grain_ssse3.asm b/ffmpeg/JNI/dav1d/src/x86/film_grain_ssse3.asm index 6402cec51..8212846f2 100644 --- a/ffmpeg/JNI/dav1d/src/x86/film_grain_ssse3.asm +++ b/ffmpeg/JNI/dav1d/src/x86/film_grain_ssse3.asm @@ -60,6 +60,8 @@ pw_1: dw 1 JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3 JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3 +JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3 
struc FGData .seed: resd 1 @@ -502,8 +504,9 @@ cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data jg .y_loop_ar3 RET +%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y INIT_XMM ssse3 -cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv +cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv movifnidn r2, r2mp movifnidn r3, r3mp LEA r4, $$ @@ -520,15 +523,21 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u pshuflw m6, m6, q0000 pshuflw m0, m0, q0000 lea r6, [base+gaussian_sequence] +%if %2 %if ARCH_X86_64 - mov r7d, 38 + mov r7d, 73-35*%3 %else - mov r3mp, 38 + mov r3mp, 73-35*%3 %endif add bufq, 44 .loop_y: mov r5, -44 .loop_x: +%else + mov r5, -82*73 + sub bufq, r5 +.loop: +%endif pand m2, m0, m1 psrlw m3, m2, 10 por m2, m3 ; bits 0xf, 0x1e, 0x3c and 0x78 are set @@ -577,6 +586,7 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u packsswb m3, m3 movd [bufq+r5], m3 add r5, 4 +%if %2 jl .loop_x add bufq, 82 %if ARCH_X86_64 @@ -585,6 +595,9 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u dec r3mp %endif jg .loop_y +%else + jl .loop +%endif %if ARCH_X86_32 mov r2, r2mp @@ -592,8 +605,8 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u ; auto-regression code movsxd r5, [fg_dataq+FGData.ar_coeff_lag] - movsxd r5, [base+generate_grain_uv_420_ssse3_table+r5*4] - lea r5, [r5+base+generate_grain_uv_420_ssse3_table] + movsxd r5, [base+generate_grain_uv_%1_ssse3_table+r5*4] + lea r5, [r5+base+generate_grain_uv_%1_ssse3_table] jmp r5 .ar0: @@ -607,79 +620,130 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m5, [fg_dataq+FGData.ar_coeffs_uv+uvq] movd m4, [base+hmul_bits+shiftq*2] - movd m1, [base+byte_blend] - DEFINE_ARGS buf, bufy, h + DEFINE_ARGS buf, bufy, h, x pxor m0, m0 pcmpgtb 
m0, m5 punpcklbw m5, m0 movd m7, [base+pb_1] - movd m6, [base+hmul_bits+4] +%if %2 + movd m6, [base+hmul_bits+2+%3*2] +%endif pshuflw m5, m5, q0000 pshuflw m4, m4, q0000 pshufd m7, m7, q0000 +%if %2 pshuflw m6, m6, q0000 +%endif punpcklqdq m5, m5 punpcklqdq m4, m4 +%if %2 punpcklqdq m6, m6 - punpcklbw m1, m1 +%endif + pcmpeqw m1, m1 + pslldq m1, 12>>%2 SCRATCH 1, 8, 0 SCRATCH 4, 9, 1 - sub bufq, 82*38+82-(82*3+41) +%if %2 + sub bufq, 82*(73-35*%3)+82-(82*3+41) +%else + sub bufq, 82*70-3 +%endif add bufyq, 3+82*3 - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar0: + xor xd, xd +.x_loop_ar0: ; first 32 pixels - movu m1, [bufyq] - movu m2, [bufyq+82] - movu m3, [bufyq+16] - movu m4, [bufyq+82+16] +%if %2 + movu m1, [bufyq+xq*2] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif + movu m3, [bufyq+xq*2+16] +%if %3 + movu m4, [bufyq+xq*2+82+16] +%endif pmaddubsw m0, m7, m1 +%if %3 pmaddubsw m1, m7, m2 +%endif pmaddubsw m2, m7, m3 +%if %3 pmaddubsw m3, m7, m4 paddw m0, m1 paddw m2, m3 +%endif pmulhrsw m0, m6 pmulhrsw m2, m6 +%else + movu m0, [bufyq+xq] + pxor m6, m6 + pcmpgtb m6, m0 + punpckhbw m2, m0, m6 + punpcklbw m0, m6 +%endif pmullw m0, m5 pmullw m2, m5 pmulhrsw m0, m9 pmulhrsw m2, m9 + movu m1, [bufq+xq] + pxor m4, m4 + pcmpgtb m4, m1 + punpckhbw m3, m1, m4 +%if %2 + punpcklbw m1, m4 + paddw m2, m3 + paddw m0, m1 +%else + punpcklbw m6, m1, m4 + paddw m2, m3 + paddw m0, m6 +%endif packsswb m0, m2 - movu m1, [bufq] - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pmaddubsw m1, m7, m2 - pmaddubsw m2, m7, m0 - packsswb m2, m1 - movu [bufq], m2 - add bufyq, 32 - add bufq, 16 - xor hd, 0x10000 - test hd, 0x10000 - jnz .y_loop_ar0 - - ; last 6 pixels - movu m1, [bufyq] - movu m2, [bufyq+82] +%if %2 + movu [bufq+xq], m0 + add xd, 16 + cmp xd, 32 + jl .x_loop_ar0 + + ; last 6/12 pixels + movu m1, [bufyq+xq*(1+%2)] +%if %3 + movu m2, [bufyq+xq*2+82] +%endif pmaddubsw m0, m7, m1 +%if %3 pmaddubsw m1, m7, m2 paddw m0, m1 +%endif pmulhrsw m0, m6 pmullw m0, m5 pmulhrsw m0, m9 + movq m1, [bufq+xq] + 
pxor m4, m4 + pcmpgtb m4, m1 + punpcklbw m2, m1, m4 + paddw m0, m2 packsswb m0, m0 - movq m1, [bufq] - punpcklbw m0, m1 - pmaddubsw m2, m7, m0 - packsswb m2, m2 - pandn m0, m8, m2 + pandn m2, m8, m0 + pand m1, m8 + por m2, m1 + movq [bufq+xq], m2 +%else + add xd, 16 + cmp xd, 80 + je .y_loop_final_ar0 + movu [bufq+xq-16], m0 + jmp .x_loop_ar0 +.y_loop_final_ar0: + pandn m2, m8, m0 pand m1, m8 - por m0, m1 - movq [bufq], m0 + por m2, m1 + movu [bufq+xq-16], m2 +%endif - add bufq, 82-32 - add bufyq, 82*2-64 + add bufq, 82 + add bufyq, 82<<%3 dec hd jg .y_loop_ar0 RET @@ -706,8 +770,10 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u %endif mov shiftd, [fg_dataq+FGData.ar_coeff_shift] movd m3, [base+round_vals+shiftq*2-12] ; rnd +%if %2 movd m7, [base+pb_1] - movd m6, [base+hmul_bits+4] + movd m6, [base+hmul_bits+2+%3*2] +%endif psrldq m4, 1 %if ARCH_X86_32 DEFINE_ARGS buf, shift, val0, val3, min, max, x @@ -718,40 +784,64 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u %endif pxor m5, m5 punpcklwd m3, m5 +%if %2 punpcklwd m6, m6 +%endif pcmpgtb m5, m4 punpcklbw m4, m5 pshufd m5, m4, q1111 pshufd m4, m4, q0000 pshufd m3, m3, q0000 +%if %2 pshufd m7, m7, q0000 pshufd m6, m6, q0000 - sub bufq, 82*38+44-(82*3+41) + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif %if ARCH_X86_32 add r1mp, 79+82*3 - mov r0mp, 35 + mov r0mp, 70-35*%3 %else add bufyq, 79+82*3 - mov hd, 35 + mov hd, 70-35*%3 %endif mov mind, -128 mov maxd, 127 .y_loop_ar1: - mov xq, -38 + mov xq, -(76>>%2) movsx val3d, byte [bufq+xq-1] .x_loop_ar1: +%if %2 %if ARCH_X86_32 mov r2, r1mp movq m0, [r2+xq*2] +%if %3 movq m1, [r2+xq*2+82] +%endif %else movq m0, [bufyq+xq*2] +%if %3 movq m1, [bufyq+xq*2+82] +%endif %endif pmaddubsw m2, m7, m0 +%if %3 pmaddubsw m0, m7, m1 paddw m2, m0 +%endif pmulhrsw m2, m6 +%else +%if ARCH_X86_32 + mov r2, r1mp + movd m2, [r2+xq] +%else + movd m2, [bufyq+xq] +%endif + pxor m0, m0 + 
pcmpgtb m0, m2 + punpcklbw m2, m0 +%endif movq m0, [bufq+xq-82-1] ; top/left pxor m1, m1 @@ -792,10 +882,10 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u .x_loop_ar1_end: add bufq, 82 %if ARCH_X86_32 - add r1mp, 82*2 + add r1mp, 82<<%3 dec r0mp %else - add bufyq, 82*2 + add bufyq, 82<<%3 dec hd %endif jg .y_loop_ar1 @@ -837,16 +927,20 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u SCRATCH 5, 13, 5 SCRATCH 6, 14, 6 SCRATCH 7, 15, 7 - movd m7, [base+hmul_bits+4] +%if %2 + movd m7, [base+hmul_bits+2+%3*2] movd m6, [base+pb_1] punpcklwd m7, m7 pshufd m6, m6, q0000 pshufd m7, m7, q0000 - sub bufq, 82*38+44-(82*3+41) + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif add bufyq, 79+82*3 - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar2: - mov xq, -38 + mov xq, -(76>>%2) .x_loop_ar2: pxor m2, m2 @@ -879,12 +973,23 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u paddd m2, m3 paddd m2, m4 - movq m0, [bufyq+xq*2] +%if %2 + movq m1, [bufyq+xq*2] +%if %3 movq m3, [bufyq+xq*2+82] - pmaddubsw m1, m6, m0 - pmaddubsw m0, m6, m3 +%endif + pmaddubsw m0, m6, m1 +%if %3 + pmaddubsw m1, m6, m3 paddw m0, m1 +%endif pmulhrsw m0, m7 +%else + movd m0, [bufyq+xq] + pxor m1, m1 + pcmpgtb m1, m0 + punpcklbw m0, m1 +%endif punpcklwd m0, m15 pmaddwd m0, m14 paddd m2, m0 @@ -914,7 +1019,7 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u .x_loop_ar2_end: add bufq, 82 - add bufyq, 82*2 + add bufyq, 82<<%3 dec hd jg .y_loop_ar2 RET @@ -977,24 +1082,36 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u SCRATCH 5, 12, 11 movd m2, [base+round_vals-12+shiftq*2] +%if %2 movd m1, [base+pb_1] - movd m3, [base+hmul_bits+4] + movd m3, [base+hmul_bits+2+%3*2] +%endif pxor m0, m0 punpcklwd m2, m0 +%if %2 punpcklwd m3, m3 +%endif pshufd m2, m2, q0000 +%if %2 pshufd m1, m1, q0000 pshufd m3, m3, q0000 SCRATCH 1, 13, 12 
+%endif SCRATCH 2, 14, 13 +%if %2 SCRATCH 3, 15, 14 +%endif DEFINE_ARGS buf, bufy, fg_data, h, unused, x - sub bufq, 82*38+44-(82*3+41) +%if %2 + sub bufq, 82*(73-35*%3)+44-(82*3+41) +%else + sub bufq, 82*69+3 +%endif add bufyq, 79+82*3 - mov hd, 35 + mov hd, 70-35*%3 .y_loop_ar3: - mov xq, -38 + mov xq, -(76>>%2) .x_loop_ar3: movu m0, [bufq+xq-82*3-3] ; y=-3,x=[-3,+12] @@ -1058,12 +1175,23 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u paddd m3, m5 paddd m0, m3 +%if %2 movq m1, [bufyq+xq*2] +%if %3 movq m3, [bufyq+xq*2+82] - pmaddubsw m5, m13, m1 - pmaddubsw m7, m13, m3 +%endif + pmaddubsw m7, m13, m1 +%if %3 + pmaddubsw m5, m13, m3 paddw m7, m5 +%endif pmulhrsw m7, m15 +%else + movd m7, [bufyq+xq] + pxor m1, m1 + pcmpgtb m1, m7 + punpcklbw m7, m1 +%endif psrldq m1, m2, 4 psrldq m3, m2, 6 @@ -1110,10 +1238,15 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u .x_loop_ar3_end: add bufq, 82 - add bufyq, 82*2 + add bufyq, 82<<%3 dec hd jg .y_loop_ar3 RET +%endmacro + +generate_grain_uv_fn 420, 1, 1 +generate_grain_uv_fn 422, 1, 0 +generate_grain_uv_fn 444, 0, 0 %macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg %assign %%idx 0 @@ -1359,13 +1492,11 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut %if ARCH_X86_32 mov srcq, r1mp add srcq, r4mp - xor r8mp, 4 - test r8mp, 4 %else lea srcq, [src_bakq+wq] - test srcq, 16 ; this relies on buffer alignment... 
%endif - jz .next_blk + btc dword r8m, 2 + jc .next_blk add offxyd, 16 test dword r8m, 2 ; r8m & 2 = have_top_overlap @@ -1507,11 +1638,10 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut %if ARCH_X86_32 mov srcq, r1m add srcq, r4m - xor r8mp, 4 %else lea srcq, [src_bakq+wq] %endif - ; assert(srcq & 16) != 0 + xor dword r8m, 4 add offxyd, 16 ; since this half-block had left-overlap, the next does not @@ -1712,9 +1842,8 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut jz .end_y_v_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines - xor hd, 0x10000 - test hd, 0x10000 - jnz .loop_y_v_overlap + btc hd, 16 + jnc .loop_y_v_overlap jmp .loop_y .end_y_v_overlap: @@ -1727,13 +1856,11 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut %if ARCH_X86_32 mov srcq, r1mp add srcq, r4mp - xor r8mp, 4 - test r8mp, 4 %else lea srcq, [src_bakq+wq] - test srcq, 16 %endif - jz .loop_x_hv_overlap + btc dword r8m, 2 + jc .loop_x_hv_overlap add offxyd, 16 %if ARCH_X86_32 add dword [rsp+6*mmsize+1*gprsize], 16 @@ -1915,9 +2042,8 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut jz .end_y_hv_overlap ; 2 lines get vertical overlap, then fall back to non-overlap code for ; remaining (up to) 30 lines - xor hd, 0x10000 - test hd, 0x10000 - jnz .loop_y_hv_overlap + btc hd, 16 + jnc .loop_y_hv_overlap jmp .loop_y_h_overlap .end_y_hv_overlap: @@ -1930,11 +2056,10 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut %if ARCH_X86_32 mov srcq, r1m add srcq, r4m - xor r8mp, 4 %else lea srcq, [src_bakq+wq] %endif - ; assert(srcq & 16) != 0 + xor dword r8m, 4 add offxyd, 16 %if ARCH_X86_32 add dword [rsp+6*mmsize+1*gprsize], 16 @@ -1946,13 +2071,14 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut .end_hv: RET +%macro FGUV_FN 3 ; name, ss_hor, ss_ver 
INIT_XMM ssse3 %if ARCH_X86_32 ; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h, ; sby, luma, lstride, uv_pl, is_id) %if STACK_ALIGNMENT < mmsize DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8 -cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ +cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ tmp, src, scaling, h, fg_data, picptr, unused mov r0, r0m mov r1, r2m @@ -1975,7 +2101,7 @@ cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \ mov [rsp+8*mmsize+13*gprsize], r2 mov [rsp+8*mmsize+14*gprsize], r4 %else -cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ +cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ tmp, src, scaling, h, fg_data, picptr, unused %endif mov srcq, srcm @@ -2000,13 +2126,13 @@ cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \ %define base r5-pb_mask mov r5m, r5 %else -cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ - grain_lut, tmp, sby, luma, lstride, uv_pl, is_id +cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ + grain_lut, tmp, sby, luma, lstride, uv_pl, is_id lea r8, [pb_mask] %define base r8-pb_mask %endif mov r6d, [fg_dataq+FGData.scaling_shift] - movd m2, [base+byte_blend+3] + pcmpeqw m2, m2 movd m3, [base+mul_bits+r6*2-14] mov r6d, [fg_dataq+FGData.clip_to_restricted_range] lea tmpd, [r6d*2] @@ -2018,6 +2144,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ movd m5, [base+min+r6*2] cmovne r6d, tmpd movd m4, [base+max+r6*2] + psrldq m2, 14+%2 punpcklwd m3, m3 punpcklwd m5, m5 punpcklwd m4, m4 @@ -2032,7 +2159,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0 jne .csfl -%macro FGUV_32x32xN_LOOP 1 ; not-csfl +%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver %if ARCH_X86_32 DEFINE_ARGS dst, src, scaling, sby, 
fg_data, picptr, overlap %else @@ -2058,10 +2185,18 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ test overlapd, overlapd jz %%no_vertical_overlap %if ARCH_X86_32 +%if %2 movd m1, [base+pb_23_22] +%else + movd m1, [base+pb_27_17_17_27] +%endif mova m0, [base+pw_1024] %else +%if %2 movd m1, [pb_23_22] +%else + movd m1, [pb_27_17_17_27] +%endif mova m0, [pw_1024] %endif pshufd m1, m1, q0000 @@ -2091,7 +2226,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ %define luma_bakq lumaq mov wq, r4m +%if %3 shl r10mp, 1 +%endif %else DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak @@ -2101,7 +2238,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov lumaq, r9mp lea src_bakq, [srcq+wq] - lea luma_bakq, [lumaq+wq*2] + lea luma_bakq, [lumaq+wq*(1+%2)] neg wq sub r0mp, srcq %if ARCH_X86_32 @@ -2112,7 +2249,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 %else mov r11mp, src_bakq - mov r10mp, strideq + mov r12mp, strideq %endif %%loop_x: @@ -2141,8 +2278,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ ror offyd, 8 shr offxd, 12 and offyd, 0xf - imul offyd, 82 - lea offyq, [offyq+offxq+498] ; offy*stride+offx + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut @@ -2151,6 +2288,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ h, offxy, see, overlap, unused1, unused2, lstride, luma_bak %endif +%%loop_x_odd: mov hd, r7m mov grain_lutq, grain_lutmp %%loop_y: @@ -2158,6 +2296,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ %if ARCH_X86_32 mov lumaq, r9mp %endif +%if %2 mova m4, 
[lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] @@ -2175,9 +2314,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif %if %1 +%if %2 packuswb m4, m6 ; luma +%endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 @@ -2189,6 +2339,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 %endif ; scaling[luma_src] @@ -2239,8 +2392,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ add srcq, r2mp ; we already incremented lumaq above %else - add srcq, r10mp + add srcq, r12mp +%if %3 lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif %endif add grain_lutq, 82 dec hw @@ -2259,11 +2416,26 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ %else mov srcq, r11mp %endif - lea lumaq, [luma_bakq+wq*2] + lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq +%endif +%if %2 == 0 + ; adjust top_offxy +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jc %%loop_x_even + test dword r8m, 2 + jz %%loop_x_odd + jmp %%loop_x_odd_v_overlap +%%loop_x_even: %endif test dword r8m, 1 jz %%loop_x @@ -2275,8 +2447,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ ; horizontal overlap (without vertical overlap) %%loop_x_h_overlap: %if ARCH_X86_32 +%if %2 lea r6, [offxyd+16] mov [rsp+8*mmsize+0*gprsize], r6 +%else + mov [rsp+8*mmsize+0*gprsize], offxyd +%endif DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut @@ -2285,7 +2461,11 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, 
stride, fg_data, w, scaling, \ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ offx, offy, see, left_offxy, unused1, unused2, lstride +%if %2 lea left_offxyd, [offyd+16] ; previous column's offy*stride+offx +%else + mov left_offxyd, offyd +%endif %endif mov r6d, seed or seed, 0xEFF4 @@ -2310,8 +2490,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ ror offyd, 8 shr offxd, 12 and offyd, 0xf - imul offyd, 82 - lea offyq, [offyq+offxq+498] ; offy*stride+offx + imul offyd, 164>>%3 + lea offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)] ; offy*stride+offx %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut @@ -2327,6 +2507,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ %if ARCH_X86_32 mov lumaq, r9mp %endif +%if %2 mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] @@ -2344,9 +2525,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif %if %1 +%if %2 packuswb m4, m6 ; luma +%endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 @@ -2358,6 +2550,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 %endif ; scaling[luma_src] @@ -2422,8 +2617,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ add srcq, r2mp ; lumaq has already been incremented above %else - add srcq, r10mp + add srcq, r12mp +%if %3 lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif %endif add grain_lutq, 82 dec hw @@ -2442,17 +2641,32 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ %else mov srcq, r11mp %endif - lea lumaq, 
[luma_bakq+wq*2] + lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif +%if %2 == 0 + xor dword r8m, 4 + ; adjust top_offxyd +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add r11d, 16 +%endif + add offxyd, 16 +%endif ; r8m = sbym test dword r8m, 2 +%if %2 jne %%loop_x_hv_overlap jmp %%loop_x_h_overlap +%else + jne %%loop_x_odd_v_overlap + jmp %%loop_x_odd +%endif %%end: RET @@ -2487,7 +2701,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov r3m, seed mov wq, r4m +%if %3 shl r10mp, 1 +%endif %else xor seed, sbyd ; (cur_seed << 16) | top_seed @@ -2499,7 +2715,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov lumaq, r9mp lea src_bakq, [srcq+wq] - lea luma_bakq, [lumaq+wq*2] + lea luma_bakq, [lumaq+wq*(1+%2)] neg wq sub r0mp, srcq %if ARCH_X86_32 @@ -2510,7 +2726,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2 %else mov r11mp, src_bakq - mov r10mp, strideq + mov r12mp, strideq %endif %%loop_x_v_overlap: @@ -2549,9 +2765,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f - imul offyd, 82 + imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq+0x10001*498+16*82] + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy @@ -2568,12 +2784,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut %endif +%%loop_x_odd_v_overlap: mov hd, r7m mov grain_lutq, grain_lutmp +%if ARCH_X86_32 + mov r5, r5m + mova m1, [base+pb_27_17] +%else + mova m1, [pb_27_17] +%endif %%loop_y_v_overlap: %if ARCH_X86_32 mov lumaq, r9mp %endif +%if %2 
mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] @@ -2591,9 +2815,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif %if %1 +%if %2 packuswb m4, m6 ; luma +%endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 @@ -2605,6 +2840,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 %endif ; scaling[luma_src] @@ -2615,10 +2853,10 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ vpgatherdw m7, m4, scalingq, r12, r2 vpgatherdw m5, m6, scalingq, r12, r2 %endif - pcmpeqw m1, m1 - psrlw m1, 8 - pand m7, m1 - pand m5, m1 + pcmpeqw m4, m4 + psrlw m4, 8 + pand m7, m4 + pand m5, m4 ; grain = grain_lut[offy+y][offx+x] movu m3, [grain_lutq+offxyq] @@ -2628,17 +2866,22 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ %else movu m4, [grain_lutq+top_offxyq] %endif - punpckhbw m1, m4, m3 + punpckhbw m6, m4, m3 punpcklbw m4, m3 - pmaddubsw m2, m9, m1 +%if %3 + pmaddubsw m2, m9, m6 pmaddubsw m3, m9, m4 +%else + pmaddubsw m2, m1, m6 + pmaddubsw m3, m1, m4 +%endif pmulhrsw m2, m8 pmulhrsw m3, m8 packsswb m3, m2 - pxor m1, m1 - pcmpgtb m1, m3 - punpcklbw m2, m3, m1 - punpckhbw m3, m1 + pxor m6, m6 + pcmpgtb m6, m3 + punpcklbw m2, m3, m6 + punpckhbw m3, m6 ; noise = round2(scaling[luma_src] * grain, scaling_shift) pmullw m2, m7 @@ -2648,7 +2891,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ ; unpack chroma_source pxor m4, m4 - punpckhbw m1, m0, m4 + punpckhbw m6, m0, m4 punpcklbw m0, m4 ; m0-1: src as word %if ARCH_X86_32 @@ -2657,12 +2900,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, 
stride, fg_data, w, scaling, \ ; dst = clip_pixel(src, noise) paddw m0, m2 - paddw m1, m3 + paddw m6, m3 pmaxsw m0, m13 - pmaxsw m1, m13 + pmaxsw m6, m13 pminsw m0, m12 - pminsw m1, m12 - packuswb m0, m1 + pminsw m6, m12 + packuswb m0, m6 movifnidn dstq, dstmp mova [dstq+srcq], m0 @@ -2672,10 +2915,24 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ add srcq, r2mp ; lumaq has already been incremented above %else - add srcq, r10mp + add srcq, r12mp +%if %3 lea lumaq, [lumaq+lstrideq*2] +%else + add lumaq, lstrideq +%endif %endif add grain_lutq, 82 +%if %3 == 0 + btc hd, 16 +%if ARCH_X86_32 + mov r5, r5m + mova m1, [base+pb_17_27] +%else + mova m1, [pb_17_27] +%endif + jnc %%loop_y_v_overlap +%endif jmp %%loop_y %%end_y_v_overlap: @@ -2692,25 +2949,40 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ %else mov srcq, r11mp %endif - lea lumaq, [luma_bakq+wq*2] + lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif +%if %2 ; since fg_dataq.overlap is guaranteed to be set, we never jump ; back to .loop_x_v_overlap, and instead always fall-through to ; h+v overlap +%else +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + btc dword r8m, 2 + jnc %%loop_x_odd_v_overlap +%endif %%loop_x_hv_overlap: %if ARCH_X86_32 DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused mov r6, [rsp+8*mmsize+1*gprsize] +%if %2 lea r0, [r3d+16] add r6, 16 mov [rsp+8*mmsize+0*gprsize], r0 ; left_offxy +%else + mov [rsp+8*mmsize+0*gprsize], r3 ; left_offxy +%endif mov [rsp+8*mmsize+2*gprsize], r6 ; topleft_offxy DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused @@ -2721,8 +2993,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \ tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride +%if %2 lea topleft_offxyq, [top_offxyq+16] 
lea left_offxyq, [offxyq+16] +%else + mov topleft_offxyq, top_offxyq + mov left_offxyq, offxyq +%endif ; we assume from the block above that bits 8-15 of tmpd are zero'ed %endif @@ -2756,9 +3033,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ ror offxd, 12 and offyd, 0xf000f and offxd, 0xf000f - imul offyd, 82 + imul offyd, 164>>%3 ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy - lea offyq, [offyq+offxq+0x10001*498+16*82] + lea offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82] %if ARCH_X86_32 DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut @@ -2775,6 +3052,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov hd, r7m mov grain_lutq, grain_lutmp +%if ARCH_X86_32 + mov r5, r5m + mova m3, [base+pb_27_17] +%else + mova m3, [pb_27_17] +%endif %%loop_y_hv_overlap: ; src %if ARCH_X86_32 @@ -2782,6 +3065,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ mov lumaq, r9mp %endif +%if %2 mova m4, [lumaq+ 0] mova m6, [lumaq+16] mova m0, [srcq] @@ -2799,9 +3083,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ pmaddubsw m6, m7 pavgw m4, m2 pavgw m6, m2 +%else + mova m4, [lumaq] + mova m0, [srcq] +%if ARCH_X86_32 + add lumaq, r10mp + mov r9mp, lumaq +%endif + pxor m2, m2 +%endif %if %1 +%if %2 packuswb m4, m6 ; luma +%endif punpckhbw m6, m4, m0 punpcklbw m4, m0 ; { luma, chroma } pmaddubsw m6, m14 @@ -2813,6 +3108,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ packuswb m4, m6 ; pack+unpack = clip punpckhbw m6, m4, m2 punpcklbw m4, m2 +%elif %2 == 0 + punpckhbw m6, m4, m2 + punpcklbw m4, m2 %endif ; scaling[src] @@ -2821,8 +3119,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ vpgatherdw m5, m6, scalingq, r0, r5 %else movd m1, [grain_lutq+topleft_offxyq] +%if %3 vpgatherdw m7, m4, scalingq, r2, r12 vpgatherdw m5, m6, 
scalingq, r2, r12 +%else + vpgatherdw m7, m4, scalingq, r2, r13 + vpgatherdw m5, m6, scalingq, r2, r13 +%endif %endif pcmpeqw m2, m2 psrlw m2, 8 @@ -2836,7 +3139,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ movd m1, [grain_lutq+r0] mov r0, [rsp+8*mmsize+0*gprsize] ; left_offxy %endif - movu m3, [grain_lutq+offxyq] + movu m2, [grain_lutq+offxyq] %if ARCH_X86_32 movu m6, [grain_lutq+r5] movd m4, [grain_lutq+r0] @@ -2846,23 +3149,32 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ %endif ; do h interpolation first (so top | top/left -> top, left | cur -> cur) punpcklbw m1, m6 - punpcklbw m4, m3 + punpcklbw m4, m2 +%if %2 punpcklwd m4, m1 +%else + punpckldq m4, m1 +%endif pmaddubsw m1, m9, m4 pmulhrsw m1, m8 packsswb m1, m1 - pandn m4, m10, m3 - pandn m3, m10, m6 - psrldq m6, m1, 1 + pandn m4, m10, m2 + pandn m2, m10, m6 + psrldq m6, m1, 2-%2 pand m1, m10 pand m6, m10 por m4, m1 - por m3, m6 + por m2, m6 ; followed by v interpolation (top | cur -> cur) - punpckhbw m1, m3, m4 - punpcklbw m3, m4 + punpckhbw m1, m2, m4 + punpcklbw m2, m4 +%if %3 pmaddubsw m4, m9, m1 - pmaddubsw m1, m9, m3 + pmaddubsw m1, m9, m2 +%else + pmaddubsw m4, m3, m1 + pmaddubsw m1, m3, m2 +%endif pmulhrsw m4, m8 pmulhrsw m1, m8 packsswb m1, m4 @@ -2883,17 +3195,17 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ ; unpack chroma source pxor m4, m4 - punpckhbw m3, m0, m4 + punpckhbw m5, m0, m4 punpcklbw m0, m4 ; m0-1: src as word ; dst = clip_pixel(src, noise) paddw m0, m2 - paddw m3, m1 + paddw m5, m1 pmaxsw m0, m13 - pmaxsw m3, m13 + pmaxsw m5, m13 pminsw m0, m12 - pminsw m3, m12 - packuswb m0, m3 + pminsw m5, m12 + packuswb m0, m5 movifnidn dstq, dstmp mova [dstq+srcq], m0 @@ -2901,12 +3213,36 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ add srcq, r2mp ; lumaq has been adjusted above already %else - add srcq, r10mp - lea lumaq, [lumaq+lstrideq*2] + add 
srcq, r12mp +%if %3 + lea lumaq, [lumaq+lstrideq*(1+%2)] +%else + add lumaq, r10mp +%endif %endif add grain_lutq, 82 dec hw +%if %3 jg %%loop_y_h_overlap +%else + jle %%end_y_hv_overlap +%if ARCH_X86_32 + mov r5, r5m + mova m3, [base+pb_17_27] +%else + mova m3, [pb_17_27] +%endif + btc hd, 16 + jnc %%loop_y_hv_overlap +%if ARCH_X86_64 + mov lstrideq, r10mp +%endif + jmp %%loop_y_h_overlap +%%end_y_hv_overlap: +%if ARCH_X86_64 + mov lstrideq, r10mp +%endif +%endif %if ARCH_X86_32 DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut @@ -2921,18 +3257,44 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \ %else mov srcq, r11mp %endif - lea lumaq, [luma_bakq+wq*2] + lea lumaq, [luma_bakq+wq*(1+%2)] add srcq, wq %if ARCH_X86_32 mov r4m, wq mov r9m, lumaq %endif +%if %2 jmp %%loop_x_hv_overlap +%else +%if ARCH_X86_32 + add dword [rsp+8*mmsize+1*gprsize], 16 +%else + add top_offxyd, 16 +%endif + add offxyd, 16 + xor dword r8m, 4 + jmp %%loop_x_odd_v_overlap +%endif %%end_hv: RET %endmacro - FGUV_32x32xN_LOOP 1 + %%FGUV_32x32xN_LOOP 1, %2, %3 .csfl: - FGUV_32x32xN_LOOP 0 + %%FGUV_32x32xN_LOOP 0, %2, %3 +%endmacro + +FGUV_FN 420, 1, 1 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif + +FGUV_FN 422, 1, 0 + +%if STACK_ALIGNMENT < mmsize +DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 +%endif + +FGUV_FN 444, 0, 0 diff --git a/ffmpeg/JNI/dav1d/src/x86/ipred.asm b/ffmpeg/JNI/dav1d/src/x86/ipred.asm index 155f49004..ad05b3b1f 100644 --- a/ffmpeg/JNI/dav1d/src/x86/ipred.asm +++ b/ffmpeg/JNI/dav1d/src/x86/ipred.asm @@ -100,6 +100,8 @@ ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1 db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4; 0, 0, 0, 0 pw_64: times 2 dw 64 +cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1 + times 9 db 7, -1 cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1 db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ; w=8, w_pad=1 as well as second 
half of previous one @@ -166,6 +168,7 @@ JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \ JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32 JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3 JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3 +JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32 JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64 cextern dr_intra_derivative @@ -1409,7 +1412,6 @@ ALIGN function_align mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases pcmpgtb m1, m2 pmovmskb r5d, m1 - popcnt r5d, r5d ; sets ZF which can be used by caller ret .w4_no_upsample: %assign stack_offset org_stack_offset @@ -1420,7 +1422,9 @@ ALIGN function_align lea maxbased, [hq+3] call .filter_strength mov maxbased, 7 + test r5d, r5d jz .w4_main ; filter_strength == 0 + popcnt r5d, r5d vpbroadcastd m7, [base+pb_8] vbroadcasti128 m2, [tlq-1] pminub m1, m7, [base+z_filter_s] @@ -1593,7 +1597,9 @@ ALIGN function_align test angled, 0x400 jnz .w8_no_intra_edge_filter call .filter_strength + test r5d, r5d jz .w8_main ; filter_strength == 0 + popcnt r5d, r5d movu xm2, [tlq] pminub xm1, xm0, [base+z_filter_s+14] vinserti128 m2, [tlq-1], 1 @@ -1695,7 +1701,9 @@ ALIGN function_align test angled, 0x400 jnz .w16_no_intra_edge_filter call .filter_strength + test r5d, r5d jz .w16_main ; filter_strength == 0 + popcnt r5d, r5d vpbroadcastd m1, [base+pb_12] vbroadcasti128 m6, [base+z_filter_s+8] vinserti128 m2, m6, [base+z_filter_s], 0 @@ -2202,7 +2210,6 @@ ALIGN function_align pand m0, m8, m7 pcmpgtb m0, m9 pmovmskb r3d, m0 - popcnt r3d, r3d ret ALIGN function_align .upsample_above: ; w4/w8 @@ -2252,7 +2259,9 @@ ALIGN function_align lea r3d, [hq+3] sub angled, 1112 ; angle - 90 call .filter_strength + test r3d, r3d jz .w4_no_filter_above + popcnt r3d, r3d vpbroadcastd xm2, [base+pb_4] pminub xm2, [base+z_filter_s] vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] @@ -2287,9 +2296,10 @@ ALIGN function_align pand xm0, xm8 
; reuse from previous filter_strength call pcmpgtb xm0, xm9 pmovmskb r3d, xm0 - popcnt r3d, r3d .w4_filter_left: + test r3d, r3d jz .w4_main + popcnt r3d, r3d mov r5d, 10 cmp hd, 16 movu xm2, [rsp+49] @@ -2440,7 +2450,9 @@ ALIGN function_align lea r3d, [hq+7] sub angled, 90 ; angle - 90 call .filter_strength + test r3d, r3d jz .w8_no_filter_above + popcnt r3d, r3d vpbroadcastd xm3, [base+pb_8] pminub xm3, [base+z_filter_s+8] vpbroadcastd xm0, [base+z_filter_k-4+r3*4+12*0] @@ -2473,9 +2485,10 @@ ALIGN function_align pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 - popcnt r3d, r3d .w8_filter_left: + test r3d, r3d jz .w8_main + popcnt r3d, r3d vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] @@ -2647,7 +2660,9 @@ ALIGN function_align lea r3d, [hq+15] sub angled, 90 call .filter_strength + test r3d, r3d jz .w16_no_filter_above + popcnt r3d, r3d vbroadcasti128 m6, [tlq+1] mova xm2, [base+z_filter_s] vinserti128 m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67 67 78 89 9a ab bc cd de @@ -2680,8 +2695,9 @@ ALIGN function_align pand m0, m8 pcmpgtb m0, m9 pmovmskb r3d, m0 - popcnt r3d, r3d + test r3d, r3d jz .w16_main + popcnt r3d, r3d vpbroadcastd m7, [base+z_filter_k-4+r3*4+12*0] vpbroadcastd m8, [base+z_filter_k-4+r3*4+12*1] vpbroadcastd m9, [base+z_filter_k-4+r3*4+12*2] @@ -3083,7 +3099,6 @@ ALIGN function_align mova xm2, [r4+angleq*8] pcmpgtb m1, m2 pmovmskb r5d, m1 - popcnt r5d, r5d ret .h4_no_upsample: %assign stack_offset org_stack_offset @@ -3094,7 +3109,9 @@ ALIGN function_align lea maxbased, [wq+3] call .filter_strength mov maxbased, 7 + test r5d, r5d jz .h4_main ; filter_strength == 0 + popcnt r5d, r5d vpbroadcastd m7, [base+pb_7] vbroadcasti128 m2, [tlq-14] pmaxub m1, m7, [base+z_filter_s-4] @@ -3285,7 +3302,9 @@ ALIGN function_align test angled, 0x400 jnz .h8_no_intra_edge_filter call .filter_strength + test r5d, r5d jz .h8_main ; filter_strength == 0 + popcnt r5d, 
r5d vpbroadcastd xm6, [base+pb_15] pcmpeqb xm1, xm1 psubusb xm6, xm0 @@ -3441,7 +3460,9 @@ ALIGN function_align test angled, 0x400 jnz .h16_no_intra_edge_filter call .filter_strength + test r5d, r5d jz .h16_main ; filter_strength == 0 + popcnt r5d, r5d vpbroadcastd m11, [base+pb_27] vpbroadcastd m1, [base+pb_1] vbroadcasti128 m6, [base+z_filter_s+12] @@ -5054,6 +5075,236 @@ cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak jg .sub_loop RET +cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak + movifnidn hpadd, hpadm + movifnidn wd, wm + mov hd, hm + mov szd, wd + imul szd, hd + shl hpadd, 2 + sub hd, hpadd + pxor m4, m4 + vpbroadcastd m5, [pw_1] + tzcnt r8d, wd + lea r5, [ipred_cfl_ac_444_avx2_table] + movsxd r8, [r5+r8*4+12] + add r5, r8 + + DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak + mov ac_bakq, acq + jmp r5 + +.w4: + lea stride3q, [strideq*3] + pxor xm2, xm2 +.w4_loop: + movd xm1, [yq] + movd xm0, [yq+strideq*2] + pinsrd xm1, [yq+strideq], 1 + pinsrd xm0, [yq+stride3q], 1 + punpcklbw xm1, xm2 + punpcklbw xm0, xm2 + psllw xm1, 3 + psllw xm0, 3 + mova [acq], xm1 + mova [acq+16], xm0 + paddw xm1, xm0 + paddw xm4, xm1 + lea yq, [yq+strideq*4] + add acq, 32 + sub hd, 4 + jg .w4_loop + test hpadd, hpadd + jz .calc_avg_mul + pshufd xm0, xm0, q3232 + paddw xm1, xm0, xm0 +.w4_hpad_loop: + mova [acq], xm0 + mova [acq+16], xm0 + paddw xm4, xm1 + add acq, 32 + sub hpadd, 4 + jg .w4_hpad_loop + jmp .calc_avg_mul + +.w8: + lea stride3q, [strideq*3] + pxor m2, m2 +.w8_loop: + movq xm1, [yq] + movq xm0, [yq+strideq*2] + vinserti128 m1, [yq+strideq], 1 + vinserti128 m0, [yq+stride3q], 1 + punpcklbw m1, m2 + punpcklbw m0, m2 + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + paddw m4, m1 + lea yq, [yq+strideq*4] + add acq, 64 + sub hd, 4 + jg .w8_loop + test hpadd, hpadd + jz .calc_avg_mul + vpermq m0, m0, q3232 + paddw m1, m0, m0 +.w8_hpad_loop: + mova [acq], m0 + mova 
[acq+32], m0 + paddw m4, m1 + add acq, 64 + sub hpadd, 4 + jg .w8_hpad_loop + jmp .calc_avg_mul + +.w16: + test wpadd, wpadd + jnz .w16_wpad +.w16_loop: + pmovzxbw m1, [yq] + pmovzxbw m0, [yq+strideq] + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + pmaddwd m1, m5 + paddd m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_loop + test hpadd, hpadd + jz .calc_avg + jmp .w16_hpad +.w16_wpad: + mova m3, [cfl_ac_444_w16_pad1_shuffle] +.w16_wpad_loop: + vpbroadcastq m1, [yq] + vpbroadcastq m0, [yq+strideq] + pshufb m1, m3 + pshufb m0, m3 + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m1, m0 + pmaddwd m1, m5 + paddd m4, m1 + lea yq, [yq+strideq*2] + add acq, 64 + sub hd, 2 + jg .w16_wpad_loop + test hpadd, hpadd + jz .calc_avg +.w16_hpad: + paddw m1, m0, m0 + pmaddwd m1, m5 +.w16_hpad_loop: + mova [acq], m0 + mova [acq+32], m0 + paddd m4, m1 + add acq, 64 + sub hpadd, 2 + jg .w16_hpad_loop + jmp .calc_avg + +.w32: + test wpadd, wpadd + jnz .w32_wpad +.w32_loop: + pmovzxbw m1, [yq] + pmovzxbw m0, [yq+16] + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m2, m1, m0 + pmaddwd m2, m5 + paddd m4, m2 + add yq, strideq + add acq, 64 + dec hd + jg .w32_loop + test hpadd, hpadd + jz .calc_avg + jmp .w32_hpad_loop +.w32_wpad: + DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak + lea iptrq, [ipred_cfl_ac_444_avx2_table] + add wpadd, wpadd + mova m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table] + movsxd wpadq, [iptrq+wpadq+4] + add iptrq, wpadq + jmp iptrq +.w32_pad3: + vpbroadcastq m1, [yq] + pshufb m1, m3 + vpermq m0, m1, q3232 + jmp .w32_wpad_end +.w32_pad2: + pmovzxbw m1, [yq] + pshufhw m0, m1, q3333 + vpermq m0, m0, q3333 + jmp .w32_wpad_end +.w32_pad1: + pmovzxbw m1, [yq] + vpbroadcastq m0, [yq+16] + pshufb m0, m3 + ; fall-through +.w32_wpad_end: + psllw m1, 3 + psllw m0, 3 + mova [acq], m1 + mova [acq+32], m0 + paddw m2, m1, m0 + pmaddwd m2, m5 + paddd 
m4, m2 + add yq, strideq + add acq, 64 + dec hd + jz .w32_wpad_done + jmp iptrq +.w32_wpad_done: + test hpadd, hpadd + jz .calc_avg +.w32_hpad_loop: + mova [acq], m1 + mova [acq+32], m0 + paddd m4, m2 + add acq, 64 + dec hpadd + jg .w32_hpad_loop + jmp .calc_avg + +.calc_avg_mul: + pmaddwd m4, m5 +.calc_avg: + vextracti128 xm1, m4, 1 + tzcnt r1d, szd + paddd xm0, xm4, xm1 + movd xm2, r1d + movd xm3, szd + punpckhqdq xm1, xm0, xm0 + paddd xm0, xm1 + psrad xm3, 1 + psrlq xm1, xm0, 32 + paddd xm0, xm3 + paddd xm0, xm1 + psrad xm0, xm2 + vpbroadcastw m0, xm0 +.sub_loop: + mova m1, [ac_bakq] + psubw m1, m0 + mova [ac_bakq], m1 + add ac_bakq, 32 + sub szd, 16 + jg .sub_loop + RET + cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h vbroadcasti128 m4, [palq] lea r2, [pal_pred_avx2_table] diff --git a/ffmpeg/JNI/dav1d/src/x86/ipred_init_tmpl.c b/ffmpeg/JNI/dav1d/src/x86/ipred_init_tmpl.c index 28f6f324d..4219ab8b1 100644 --- a/ffmpeg/JNI/dav1d/src/x86/ipred_init_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/x86/ipred_init_tmpl.c @@ -50,6 +50,7 @@ decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2); decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_avx2); decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_avx2); +decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_avx2); decl_pal_pred_fn(dav1d_pal_pred_avx2); @@ -131,6 +132,7 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_avx2; c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_avx2; + c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_avx2; c->pal_pred = dav1d_pal_pred_avx2; #endif diff --git a/ffmpeg/JNI/dav1d/src/x86/itx.asm b/ffmpeg/JNI/dav1d/src/x86/itx.asm index e964070b1..f27b90032 100644 --- a/ffmpeg/JNI/dav1d/src/x86/itx.asm +++ b/ffmpeg/JNI/dav1d/src/x86/itx.asm @@ -27,15 +27,10 @@ %if ARCH_X86_64 -SECTION_RODATA 32 +SECTION_RODATA 16 ; Note: The order of (at least some of) those constants matter! 
-iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856 -iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424 -iadst4_dconly1a: dw 10568, 19856, 26752, 30424 -iadst4_dconly1b: dw 30424, 26752, 19856, 10568 - deint_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 %macro COEF_PAIR 2 @@ -132,7 +127,7 @@ SECTION .text ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a ; single rip-relative lea and then address things relative from that with ; 1-byte offsets as long as data is within +-128 bytes of the base pointer. -%define o_base iadst4_dconly2a + 128 +%define o_base deint_shuf + 128 %define o(x) (rax - (o_base) + (x)) %macro REPX 2-* @@ -180,16 +175,16 @@ SECTION .text vpbroadcastd m%3, [o(pw_%8_%9)] vpbroadcastd m%4, [o(pw_m%9_%8)] vpbroadcastd xm%2, [o(pw_%6_%7)] - vpblendd m%2, m%2, m%3, 0xf0 + vpblendd m%2, m%3, 0xf0 vpbroadcastd xm%3, [o(pw_m%7_%6)] %else vpbroadcastd m%3, [o(pw_m%9_%8)] vpbroadcastd m%4, [o(pw_%8_%9)] vpbroadcastd xm%2, [o(pw_m%7_%6)] - vpblendd m%2, m%2, m%3, 0xf0 + vpblendd m%2, m%3, 0xf0 vpbroadcastd xm%3, [o(pw_%6_%7)] %endif - vpblendd m%3, m%3, m%4, 0xf0 + vpblendd m%3, m%4, 0xf0 ITX_MUL2X_PACK %1, %4, _, %5, %2, %3, (4|%10) %endmacro @@ -360,21 +355,17 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c punpckhdq m1, m0, m3 punpckldq m0, m3 IWHT4_1D_PACKED - vpblendd m0, m0, m2, 0x03 + vpblendd m0, m2, 0x03 ITX4_END 3, 0, 2, 1, 0 -%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size -cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, c, eob, tx2 - %undef cmp - %define %%p1 m(i%1_%4_internal) +%macro INV_TXFM_FN 3 ; type1, type2, size +cglobal inv_txfm_add_%1_%2_%3, 4, 5, 0, dst, stride, c, eob, tx2 + %define %%p1 m(i%1_%3_internal) lea rax, [o_base] ; Jump to the 1st txfm function if we're not taking the fast path, which ; in turn performs an indirect jump to the 2nd txfm function. 
- lea tx2q, [m(i%2_%4_internal).pass2] -%if %3 > 0 - cmp eobd, %3 - jg %%p1 -%elif %3 == 0 + lea tx2q, [m(i%2_%3_internal).pass2] +%ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else @@ -385,54 +376,16 @@ ALIGN function_align %endif %endmacro -%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 4x4 -%ifidn %1_%2, dct_identity - vpbroadcastd m0, [o(pw_2896x8)] - pmulhrsw m0, [cq] - vpbroadcastd m1, [o(pw_1697x8)] - pmulhrsw m1, m0 - paddsw m0, m1 - punpcklwd m0, m0 - punpckhdq m1, m0, m0 - punpckldq m0, m0 - jmp m(iadst_4x4_internal).end -%elifidn %1_%2, identity_dct - mova m0, [cq+16*0] - packusdw m0, [cq+16*1] - vpbroadcastd m1, [o(pw_1697x8)] - vpbroadcastd m2, [o(pw_2896x8)] - packusdw m0, m0 - pmulhrsw m1, m0 - paddsw m0, m1 - pmulhrsw m0, m2 - mova m1, m0 - jmp m(iadst_4x4_internal).end -%elif %3 >= 0 +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4 +%ifidn %1_%2, dct_dct vpbroadcastw m0, [cq] -%ifidn %1, dct vpbroadcastd m1, [o(pw_2896x8)] pmulhrsw m0, m1 -%elifidn %1, adst - movddup m1, [o(iadst4_dconly1a)] - pmulhrsw m0, m1 -%elifidn %1, flipadst - movddup m1, [o(iadst4_dconly1b)] - pmulhrsw m0, m1 -%endif mov [cq], eobd ; 0 -%ifidn %2, dct -%ifnidn %1, dct - vpbroadcastd m1, [o(pw_2896x8)] -%endif pmulhrsw m0, m1 mova m1, m0 jmp m(iadst_4x4_internal).end2 -%else ; adst / flipadst - pmulhrsw m1, m0, [o(iadst4_dconly2b)] - pmulhrsw m0, [o(iadst4_dconly2a)] - jmp m(i%2_4x4_internal).end2 -%endif %endif %endmacro @@ -477,10 +430,10 @@ ALIGN function_align packssdw m1, m2 ; out2 out3 %endmacro -INV_TXFM_4X4_FN dct, dct, 0 -INV_TXFM_4X4_FN dct, adst, 0 -INV_TXFM_4X4_FN dct, flipadst, 0 -INV_TXFM_4X4_FN dct, identity, 3 +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 mova m0, [cq+16*0] @@ -488,7 +441,7 @@ cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 IDCT4_1D_PACKED mova 
m2, [o(deint_shuf)] shufps m3, m0, m1, q1331 - shufps m0, m0, m1, q0220 + shufps m0, m1, q0220 pshufb m0, m2 pshufb m1, m3, m2 jmp tx2q @@ -499,9 +452,9 @@ cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 mova [cq+16*1], m2 ITX4_END 0, 1, 3, 2 -INV_TXFM_4X4_FN adst, dct, 0 -INV_TXFM_4X4_FN adst, adst, 0 -INV_TXFM_4X4_FN adst, flipadst, 0 +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 @@ -526,9 +479,9 @@ ALIGN function_align IADST4_1D_PACKED ret -INV_TXFM_4X4_FN flipadst, dct, 0 -INV_TXFM_4X4_FN flipadst, adst, 0 -INV_TXFM_4X4_FN flipadst, flipadst, 0 +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 @@ -549,7 +502,7 @@ cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 .end2: ITX4_END 3, 2, 1, 0 -INV_TXFM_4X4_FN identity, dct, 3 +INV_TXFM_4X4_FN identity, dct INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity @@ -600,38 +553,9 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 pextrd [r2 +r3 ], xm5, 3 %endmacro -%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 4x8 -%if %3 >= 0 -%ifidn %1_%2, dct_identity - vpbroadcastd xm0, [o(pw_2896x8)] - pmulhrsw xm1, xm0, [cq] - vpbroadcastd xm2, [o(pw_4096)] - pmulhrsw xm1, xm0 - pmulhrsw xm1, xm2 - vpermq m1, m1, q1100 - punpcklwd m1, m1 - punpckldq m0, m1, m1 - punpckhdq m1, m1 - jmp m(iadst_4x8_internal).end3 -%elifidn %1_%2, identity_dct - movd xm0, [cq+16*0] - punpcklwd xm0, [cq+16*1] - movd xm1, [cq+16*2] - punpcklwd xm1, [cq+16*3] - vpbroadcastd xm2, [o(pw_2896x8)] - vpbroadcastd xm3, [o(pw_1697x8)] - vpbroadcastd xm4, [o(pw_2048)] - punpckldq xm0, xm1 - pmulhrsw xm0, xm2 - pmulhrsw xm3, xm0 - paddsw xm0, xm3 - 
pmulhrsw xm0, xm2 - pmulhrsw xm0, xm4 - vpbroadcastq m0, xm0 - mova m1, m0 - jmp m(iadst_4x8_internal).end3 -%elifidn %1_%2, dct_dct +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8 +%ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_2048)] @@ -641,24 +565,7 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mova m1, m0 - jmp m(iadst_4x8_internal).end4 -%else ; adst_dct / flipadst_dct - vpbroadcastw xm0, [cq] - vpbroadcastd xm1, [o(pw_2896x8)] - pmulhrsw xm0, xm1 - pmulhrsw xm0, [o(iadst4_dconly1a)] - vpbroadcastd xm2, [o(pw_2048)] - mov [cq], eobd - pmulhrsw xm0, xm1 - pmulhrsw xm0, xm2 -%ifidn %1, adst - vpbroadcastq m0, xm0 -%else ; flipadst - vpermq m0, m0, q1111 -%endif - mova m1, m0 - jmp m(iadst_4x8_internal).end4 -%endif + jmp m(iadst_4x8_internal).end3 %endif %endmacro @@ -760,9 +667,9 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 paddsw m4, m5 ; out6 -out1 vpbroadcastd m5, [o(pw_2896x8)] vpblendd m3, m0, m4, 0x33 ; out6 -out7 - vpblendd m0, m0, m4, 0xcc ; out0 -out1 + vpblendd m0, m4, 0xcc ; out0 -out1 shufps m4, m2, m1, q1032 ; t3 t7 - vpblendd m1, m2, m1, 0xcc ; t2 t6 + vpblendd m1, m2, 0x33 ; t2 t6 psubsw m2, m1, m4 ; t2-t3 t6-t7 paddsw m1, m4 ; t2+t3 t6+t7 pmulhrsw m2, m5 ; out4 -out5 @@ -772,10 +679,10 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2 %endmacro INIT_YMM avx2 -INV_TXFM_4X8_FN dct, dct, 0 -INV_TXFM_4X8_FN dct, identity, 7 +INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst +INV_TXFM_4X8_FN dct, identity cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 @@ -786,7 +693,7 @@ cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 IDCT4_1D_PACKED vbroadcasti128 m2, [o(deint_shuf)] shufps m3, m0, m1, q1331 - shufps m0, m0, m1, q0220 + shufps m0, m1, q0220 pshufb m0, m2 pshufb m1, m3, m2 jmp tx2q @@ -795,8 +702,8 @@ 
cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vextracti128 xm3, m1, 1 call .main vpbroadcastd m4, [o(pw_2048)] - vinserti128 m0, m0, xm2, 1 - vinserti128 m1, m1, xm3, 1 + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 pshufd m1, m1, q1032 jmp m(iadst_4x8_internal).end2 ALIGN function_align @@ -804,7 +711,7 @@ ALIGN function_align WRAP_XMM IDCT8_1D_PACKED ret -INV_TXFM_4X8_FN adst, dct, 0 +INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst INV_TXFM_4X8_FN adst, identity @@ -828,21 +735,20 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 pshufd xm5, xm1, q1032 call .main_pass2 vpbroadcastd m4, [o(pw_2048)] - vinserti128 m0, m0, xm2, 1 - vinserti128 m1, m1, xm3, 1 + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 pxor m5, m5 psubw m5, m4 .end: - vpblendd m4, m4, m5, 0xcc + vpblendd m4, m5, 0xcc .end2: pmulhrsw m0, m4 pmulhrsw m1, m4 WIN64_RESTORE_XMM -.end3: pxor m2, m2 mova [cq+32*0], m2 mova [cq+32*1], m2 -.end4: +.end3: lea r2, [dstq+strideq*4] lea r3, [strideq*3] WRITE_4X8 0, 1 @@ -856,7 +762,7 @@ ALIGN function_align WRAP_XMM IADST8_1D_PACKED 2 ret -INV_TXFM_4X8_FN flipadst, dct, 0 +INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity @@ -880,15 +786,15 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 pshufd xm5, xm1, q1032 call m(iadst_4x8_internal).main_pass2 vpbroadcastd m5, [o(pw_2048)] - vinserti128 m3, m3, xm1, 1 - vinserti128 m2, m2, xm0, 1 + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 pxor m4, m4 psubw m4, m5 pshufd m0, m3, q1032 pshufd m1, m2, q1032 jmp m(iadst_4x8_internal).end -INV_TXFM_4X8_FN identity, dct, 3 +INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity @@ -913,49 +819,9 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd m4, [o(pw_4096)] jmp m(iadst_4x8_internal).end2 
-%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 4x16 -%if %3 >= 0 -%ifidn %1_%2, dct_identity - vpbroadcastd m0, [o(pw_2896x8)] - pmulhrsw m0, [cq] - vpbroadcastd m1, [o(pw_16384)] - vpbroadcastd m2, [o(pw_1697x16)] - vpbroadcastd m3, [o(pw_2048)] - pmulhrsw m0, m1 - pmulhrsw m2, m0 - paddsw m0, m0 - paddsw m0, m2 - pmulhrsw m3, m0 - punpcklwd m1, m3, m3 - punpckhwd m3, m3 - punpckldq m0, m1, m1 - punpckhdq m1, m1 - punpckldq m2, m3, m3 - punpckhdq m3, m3 - jmp m(iadst_4x16_internal).end3 -%elifidn %1_%2, identity_dct - movd xm0, [cq+32*0] - punpcklwd xm0, [cq+32*1] - movd xm1, [cq+32*2] - punpcklwd xm1, [cq+32*3] - vpbroadcastd xm2, [o(pw_1697x8)] - vpbroadcastd xm3, [o(pw_2896x8)] - vpbroadcastd xm4, [o(pw_2048)] - punpckldq xm0, xm1 - pcmpeqw xm1, xm1 - pmulhrsw xm2, xm0 - pcmpeqw xm1, xm0 - pxor xm0, xm1 - pavgw xm0, xm2 - pmulhrsw xm0, xm3 - pmulhrsw xm0, xm4 - vpbroadcastq m0, xm0 - mova m1, m0 - mova m2, m0 - mova m3, m0 - jmp m(iadst_4x16_internal).end3 -%elifidn %1_%2, dct_dct +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16 +%ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] @@ -968,27 +834,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 mova m1, m0 mova m2, m0 mova m3, m0 - jmp m(iadst_4x16_internal).end4 -%else ; adst_dct / flipadst_dct - vpbroadcastw xm0, [cq] - pmulhrsw xm0, [o(iadst4_dconly1a)] - vpbroadcastd xm1, [o(pw_16384)] - vpbroadcastd xm2, [o(pw_2896x8)] - mov [cq], eobd - pmulhrsw xm0, xm1 - psrlw xm1, 3 ; pw_2048 - pmulhrsw xm0, xm2 - pmulhrsw xm0, xm1 -%ifidn %1, adst - vpbroadcastq m0, xm0 -%else ; flipadst - vpermq m0, m0, q1111 -%endif - mova m1, m0 - mova m2, m0 - mova m3, m0 - jmp m(iadst_4x16_internal).end4 -%endif + jmp m(iadst_4x16_internal).end3 %endif %endmacro @@ -1038,7 +884,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd m5, [o(pw_2896_2896)] ITX_MUL2X_PACK 1, 0, _, 
10, 0, 5, 4 ; t6 t5 vpbroadcastd m0, [o(pw_m2896_2896)] - ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4, ; t13a t10a + ITX_MUL2X_PACK 2, 0, _, 10, 0, 5, 4 ; t13a t10a punpckhqdq m0, m8, m3 ; t15a t14 punpcklqdq m8, m3 ; t8a t9 shufps m5, m4, m2, q1032 ; t12 t13a @@ -1061,10 +907,10 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 paddsw m3, m8 ; out7 out6 %endmacro -INV_TXFM_4X16_FN dct, dct, 0 -INV_TXFM_4X16_FN dct, identity, 15 +INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst +INV_TXFM_4X16_FN dct, identity cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 mova m0, [cq+32*0] @@ -1089,11 +935,11 @@ cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 vextracti128 xm6, m2, 1 vextracti128 xm7, m3, 1 call .main - vinserti128 m0, m0, xm4, 1 - vinserti128 m1, m1, xm5, 1 + vinserti128 m0, xm4, 1 + vinserti128 m1, xm5, 1 vpbroadcastd m5, [o(pw_2048)] - vinserti128 m2, m2, xm6, 1 - vinserti128 m3, m3, xm7, 1 + vinserti128 m2, xm6, 1 + vinserti128 m3, xm7, 1 pshufd m1, m1, q1032 pshufd m3, m3, q1032 jmp m(iadst_4x16_internal).end2 @@ -1102,7 +948,7 @@ ALIGN function_align WRAP_XMM IDCT16_1D_PACKED ret -INV_TXFM_4X16_FN adst, dct, 0 +INV_TXFM_4X16_FN adst, dct INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity @@ -1134,26 +980,25 @@ cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 vpbroadcastd m5, [o(pw_2048)] pshufd m1, m1, q1032 vpblendd m4, m1, m0, 0x33 - vpblendd m0, m0, m2, 0x33 - vpblendd m2, m2, m3, 0x33 - vpblendd m3, m3, m1, 0x33 + vpblendd m0, m2, 0x33 + vpblendd m2, m3, 0x33 + vpblendd m3, m1, 0x33 vpermq m0, m0, q2031 vpermq m1, m2, q1302 vpermq m2, m3, q3120 vpermq m3, m4, q0213 psubw m6, m7, m5 .end: - vpblendd m5, m5, m6, 0xcc + vpblendd m5, m6, 0xcc .end2: REPX {pmulhrsw x, m5}, m0, m1, m2, m3 WIN64_RESTORE_XMM -.end3: pxor m4, m4 mova [cq+32*0], m4 mova [cq+32*1], m4 mova [cq+32*2], m4 mova [cq+32*3], m4 -.end4: +.end3: lea r2, 
[dstq+strideq*8] lea r3, [strideq*3] WRITE_4X8 0, 1 @@ -1164,9 +1009,9 @@ cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 ALIGN function_align .main: vpblendd m4, m1, m0, 0xcc - vpblendd m1, m1, m0, 0x33 + vpblendd m1, m0, 0x33 vpblendd m5, m2, m3, 0xcc - vpblendd m2, m2, m3, 0x33 + vpblendd m2, m3, 0x33 vperm2i128 m3, m5, m2, 0x31 vinserti128 m0, m1, xm4, 1 ; in0 in3 in2 in1 vperm2i128 m4, m1, m4, 0x31 @@ -1198,23 +1043,23 @@ ALIGN function_align psubsw m1, m2, m3 ; t13a t12a t15a t14a paddsw m2, m3 ; t9a t8a t11a t10a psubw m3, m7, m6 ; pw_3784_m1567 - vpblendd m6, m6, m3, 0xf0 + vpblendd m6, m3, 0xf0 ITX_MUL2X_PACK 4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a ITX_MUL2X_PACK 1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14 vbroadcasti128 m5, [o(deint_shuf)] pshufb m0, m5 pshufb m2, m5 vperm2i128 m3, m0, m2, 0x31 ; t3 t2 t11a t10a - vinserti128 m0, m0, xm2, 1 ; t1 t0 t9a t8a + vinserti128 m0, xm2, 1 ; t1 t0 t9a t8a vperm2i128 m2, m4, m1, 0x31 ; t7a t6a t15 t14 - vinserti128 m4, m4, xm1, 1 ; t4a t5a t12 t13 + vinserti128 m4, xm1, 1 ; t4a t5a t12 t13 pshufd m2, m2, q1032 ; t6a t7a t14 t15 psubsw m1, m0, m3 ; t3a t2a t11 t10 paddsw m0, m3 ; -out15 out0 out14 -out1 paddsw m3, m4, m2 ; -out3 out12 out2 -out13 psubsw m4, m2 ; t6 t7 t14a t15a shufps m2, m1, m4, q1032 ; t2a t6 t10 t14a - vpblendd m4, m4, m1, 0x33 ; t3a t7 t11 t15a + vpblendd m4, m1, 0x33 ; t3a t7 t11 t15a ret ALIGN function_align .main_pass1_end: @@ -1232,7 +1077,7 @@ ALIGN function_align packssdw m1, m4 ; -out7 out4 out6 -out5 ret -INV_TXFM_4X16_FN flipadst, dct, 0 +INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity @@ -1264,9 +1109,9 @@ cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 vpbroadcastd m6, [o(pw_2048)] pshufd m1, m1, q1032 vpblendd m4, m0, m2, 0x33 - vpblendd m0, m0, m1, 0xcc - vpblendd m1, m1, m3, 0xcc - vpblendd m2, m2, m3, 0x33 + vpblendd m0, m1, 0xcc + vpblendd m1, m3, 0xcc + vpblendd m2, 
m3, 0x33 vpermq m0, m0, q3120 vpermq m1, m1, q0213 vpermq m2, m2, q2031 @@ -1274,7 +1119,7 @@ cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 psubw m5, m7, m6 jmp m(iadst_4x16_internal).end -INV_TXFM_4X16_FN identity, dct, 3 +INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity @@ -1325,7 +1170,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 paddsw m3, m8 jmp m(iadst_4x16_internal).end2 -%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3] +%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3] movq xm%3, [dstq ] movhps xm%3, [dstq+%5] movq xm%4, [dstq+%6] @@ -1350,69 +1195,25 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2 movhps [dstq+%7], xm%4 %endmacro -%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 8x4 -%if %3 >= 0 -%ifidn %1_%2, dct_identity - vpbroadcastd xm0, [o(pw_2896x8)] - pmulhrsw xm1, xm0, [cq] - vpbroadcastd xm2, [o(pw_1697x8)] - vpbroadcastd xm3, [o(pw_2048)] - pmulhrsw xm1, xm0 - pmulhrsw xm2, xm1 - paddsw xm1, xm2 - pmulhrsw xm1, xm3 - punpcklwd xm1, xm1 - punpckldq xm0, xm1, xm1 - punpckhdq xm1, xm1 - vpermq m0, m0, q1100 - vpermq m1, m1, q1100 -%elifidn %1_%2, identity_dct - mova xm0, [cq+16*0] - packusdw xm0, [cq+16*1] - mova xm1, [cq+16*2] - packusdw xm1, [cq+16*3] - vpbroadcastd xm2, [o(pw_2896x8)] - vpbroadcastd xm3, [o(pw_2048)] - packusdw xm0, xm1 - pmulhrsw xm0, xm2 - paddsw xm0, xm0 - pmulhrsw xm0, xm2 - pmulhrsw xm0, xm3 - vinserti128 m0, m0, xm0, 1 - mova m1, m0 -%else +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4 +%ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] pmulhrsw xm0, xm1 -%ifidn %2, dct movd xm2, [o(pw_2048)] pmulhrsw xm0, xm1 pmulhrsw xm0, xm2 vpbroadcastw m0, xm0 mova m1, m0 -%else ; adst / flipadst - vpbroadcastw m0, xm0 - pmulhrsw m0, 
[o(iadst4_dconly2a)] - vpbroadcastd m1, [o(pw_2048)] - pmulhrsw m1, m0 -%ifidn %2, adst - vpermq m0, m1, q1100 - vpermq m1, m1, q3322 -%else ; flipadst - vpermq m0, m1, q2233 - vpermq m1, m1, q0011 -%endif -%endif -%endif jmp m(iadst_8x4_internal).end3 %endif %endmacro -INV_TXFM_8X4_FN dct, dct, 0 -INV_TXFM_8X4_FN dct, adst, 0 -INV_TXFM_8X4_FN dct, flipadst, 0 -INV_TXFM_8X4_FN dct, identity, 3 +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd xm3, [o(pw_2896x8)] @@ -1425,7 +1226,7 @@ cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 vinserti128 m3, m1, xm3, 1 vinserti128 m1, m0, xm2, 1 shufps m0, m1, m3, q0220 - shufps m1, m1, m3, q1331 + shufps m1, m3, q1331 pshufb m0, m4 pshufb m1, m4 jmp tx2q @@ -1449,8 +1250,8 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 call m(iadst_4x8_internal).main_pass1 - vinserti128 m0, m0, xm2, 1 - vinserti128 m1, m1, xm3, 1 + vinserti128 m0, xm2, 1 + vinserti128 m1, xm3, 1 punpckhwd m2, m0, m1 punpcklwd m0, m1 pxor m3, m3 @@ -1494,8 +1295,8 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw xm4, xm0 pmulhrsw xm5, xm0 call m(iadst_4x8_internal).main_pass1 - vinserti128 m3, m3, xm1, 1 - vinserti128 m2, m2, xm0, 1 + vinserti128 m3, xm1, 1 + vinserti128 m2, xm0, 1 punpckhwd m1, m3, m2 punpcklwd m3, m2 pxor m0, m0 @@ -1510,16 +1311,16 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, m2, q2031 jmp m(iadst_8x4_internal).end2 -INV_TXFM_8X4_FN identity, dct, 7 +INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 - mova xm2, [cq+16*0] - mova xm0, [cq+16*1] - vinserti128 m2, m2, [cq+16*2], 1 - vinserti128 m0, m0, [cq+16*3], 1 + mova xm2, 
[cq+16*0] + mova xm0, [cq+16*1] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m0, [cq+16*3], 1 vpbroadcastd m3, [o(pw_2896x8)] punpcklwd m1, m2, m0 punpckhwd m2, m0 @@ -1538,25 +1339,9 @@ cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 paddsw m1, m3 jmp m(iadst_8x4_internal).end -%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 8x8 -%ifidn %1_%2, dct_identity - vpbroadcastd xm0, [o(pw_2896x8)] - pmulhrsw xm0, [cq] - vpbroadcastd xm1, [o(pw_16384)] - pmulhrsw xm0, xm1 - psrlw xm1, 2 ; pw_4096 - pmulhrsw xm0, xm1 - pshufb xm0, [o(deint_shuf)] - vpermq m3, m0, q1100 - punpcklwd m3, m3 - pshufd m0, m3, q0000 - pshufd m1, m3, q1111 - pshufd m2, m3, q2222 - pshufd m3, m3, q3333 - jmp m(iadst_8x8_internal).end4 -%elif %3 >= 0 -%ifidn %1, dct +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8 +%ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] movd xm2, [o(pw_16384)] @@ -1576,33 +1361,13 @@ cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2 dec r2d jg .loop RET -%else ; identity - mova m0, [cq+32*0] - punpcklwd m0, [cq+32*1] - mova m1, [cq+32*2] - punpcklwd m1, [cq+32*3] - vpbroadcastd m2, [o(pw_2896x8)] - vpbroadcastd m3, [o(pw_2048)] - pxor m4, m4 - mova [cq+32*0], m4 - mova [cq+32*1], m4 - mova [cq+32*2], m4 - mova [cq+32*3], m4 - punpckldq m0, m1 - vpermq m1, m0, q3232 - vpermq m0, m0, q1010 - punpcklwd m0, m1 - pmulhrsw m0, m2 - pmulhrsw m0, m3 - jmp m(inv_txfm_add_dct_dct_8x8).end -%endif %endif %endmacro -INV_TXFM_8X8_FN dct, dct, 0 -INV_TXFM_8X8_FN dct, identity, 7 +INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpermq m0, [cq+32*0], q3120 ; 0 1 @@ -1749,20 +1514,20 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw m0, m5, m4 jmp m(iadst_8x8_internal).end3 -INV_TXFM_8X8_FN identity, dct, 7 +INV_TXFM_8X8_FN 
identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 - mova xm3, [cq+16*0] - mova xm2, [cq+16*1] - vinserti128 m3, m3, [cq+16*4], 1 - vinserti128 m2, m2, [cq+16*5], 1 - mova xm4, [cq+16*2] - mova xm0, [cq+16*3] - vinserti128 m4, m4, [cq+16*6], 1 - vinserti128 m0, m0, [cq+16*7], 1 + mova xm3, [cq+16*0] + mova xm2, [cq+16*1] + vinserti128 m3, [cq+16*4], 1 + vinserti128 m2, [cq+16*5], 1 + mova xm4, [cq+16*2] + mova xm0, [cq+16*3] + vinserti128 m4, [cq+16*6], 1 + vinserti128 m0, [cq+16*7], 1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m0 @@ -1776,8 +1541,8 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastd m4, [o(pw_4096)] jmp m(iadst_8x8_internal).end -%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 8x16 +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] @@ -1791,66 +1556,6 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 vpbroadcastw m0, xm0 mov r2d, 4 jmp m(inv_txfm_add_dct_dct_8x8).end2 -%elifidn %1_%2, dct_identity - WIN64_SPILL_XMM 13 - vpbroadcastd m0, [o(pw_2896x8)] - pmulhrsw m7, m0, [cq] - vpbroadcastd m1, [o(pw_16384)] - vpbroadcastd m2, [o(pw_1697x16)] - pxor m3, m3 - mova [cq], m3 - pmulhrsw m7, m0 - pmulhrsw m7, m1 - psrlw m1, 3 ; pw_2048 - pmulhrsw m2, m7 - paddsw m7, m7 - paddsw m7, m2 - pmulhrsw m7, m1 - punpcklwd m5, m7, m7 - punpckhwd m7, m7 - punpcklwd m4, m5, m5 - punpckhwd m5, m5 - punpcklwd m6, m7, m7 - punpckhwd m7, m7 - vpermq m0, m4, q1100 - vpermq m1, m5, q1100 - vpermq m2, m6, q1100 - vpermq m3, m7, q1100 - vpermq m4, m4, q3322 - vpermq m5, m5, q3322 - vpermq m6, m6, q3322 - vpermq m7, m7, q3322 - jmp m(idct_8x16_internal).end4 -%elifidn %1_%2, identity_dct - movd xm0, [cq+32*0] - punpcklwd xm0, [cq+32*1] - movd xm2, [cq+32*2] - 
punpcklwd xm2, [cq+32*3] - add cq, 32*4 - movd xm1, [cq+32*0] - punpcklwd xm1, [cq+32*1] - movd xm3, [cq+32*2] - punpcklwd xm3, [cq+32*3] - vpbroadcastd xm4, [o(pw_2896x8)] - vpbroadcastd xm5, [o(pw_2048)] - xor eax, eax - mov [cq-32*4], eax - mov [cq-32*3], eax - mov [cq-32*2], eax - mov [cq-32*1], eax - punpckldq xm0, xm2 - punpckldq xm1, xm3 - punpcklqdq xm0, xm1 - pmulhrsw xm0, xm4 - pmulhrsw xm0, xm4 - pmulhrsw xm0, xm5 - mov [cq+32*0], eax - mov [cq+32*1], eax - mov [cq+32*2], eax - mov [cq+32*3], eax - vinserti128 m0, m0, xm0, 1 - mov r2d, 4 - jmp m(inv_txfm_add_dct_dct_8x8).end2 %endif %endmacro @@ -1867,10 +1572,10 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2 pmulhrsw m4, [cq+32*0] %endmacro -INV_TXFM_8X16_FN dct, dct, 0 -INV_TXFM_8X16_FN dct, identity, 15 +INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst +INV_TXFM_8X16_FN dct, identity cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_8X16_LOAD_COEFS @@ -1878,13 +1583,13 @@ cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 vpbroadcastd m10, [o(pw_16384)] .pass1_end: vperm2i128 m9, m3, m7, 0x31 - vinserti128 m3, m3, xm7, 1 + vinserti128 m3, xm7, 1 vperm2i128 m8, m2, m6, 0x31 - vinserti128 m2, m2, xm6, 1 + vinserti128 m2, xm6, 1 vperm2i128 m6, m1, m5, 0x31 - vinserti128 m1, m1, xm5, 1 + vinserti128 m1, xm5, 1 vperm2i128 m5, m0, m4, 0x31 - vinserti128 m0, m0, xm4, 1 + vinserti128 m0, xm4, 1 punpckhwd m4, m2, m3 punpcklwd m2, m3 punpckhwd m3, m0, m1 @@ -1915,7 +1620,6 @@ cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 .end3: pxor m8, m8 REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 -.end4: lea r3, [strideq*3] WRITE_8X4 0, 1, 8, 9 lea dstq, [dstq+strideq*4] @@ -2120,7 +1824,7 @@ cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 pmulhrsw m7, m9, m8 jmp m(idct_8x16_internal).end3 -INV_TXFM_8X16_FN identity, dct, 7 +INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst 
INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity @@ -2136,24 +1840,24 @@ INV_TXFM_8X16_FN identity, identity %endmacro cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 - mova xm3, [cq+16*0] - mova xm2, [cq+16*2] + mova xm3, [cq+16*0] + mova xm2, [cq+16*2] add cq, 16*8 - vinserti128 m3, m3, [cq+16*0], 1 - vinserti128 m2, m2, [cq+16*2], 1 + vinserti128 m3, [cq+16*0], 1 + vinserti128 m2, [cq+16*2], 1 vpbroadcastd m9, [o(pw_2896x8)] - mova xm4, [cq-16*4] - mova xm5, [cq-16*2] - vinserti128 m4, m4, [cq+16*4], 1 - vinserti128 m5, m5, [cq+16*6], 1 - mova xm7, [cq-16*7] - mova xm6, [cq-16*5] - vinserti128 m7, m7, [cq+16*1], 1 - vinserti128 m6, m6, [cq+16*3], 1 - mova xm8, [cq-16*3] - mova xm0, [cq-16*1] - vinserti128 m8, m8, [cq+16*5], 1 - vinserti128 m0, m0, [cq+16*7], 1 + mova xm4, [cq-16*4] + mova xm5, [cq-16*2] + vinserti128 m4, [cq+16*4], 1 + vinserti128 m5, [cq+16*6], 1 + mova xm7, [cq-16*7] + mova xm6, [cq-16*5] + vinserti128 m7, [cq+16*1], 1 + vinserti128 m6, [cq+16*3], 1 + mova xm8, [cq-16*3] + mova xm0, [cq-16*1] + vinserti128 m8, [cq+16*5], 1 + vinserti128 m0, [cq+16*7], 1 punpcklwd m1, m3, m2 punpckhwd m3, m2 punpcklwd m2, m4, m5 @@ -2197,64 +1901,11 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 vextracti128 [dstq+%6], m%3, 1 %endmacro -%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 16x4 -%if %3 >= 0 -%ifidn %1_%2, dct_identity - vpbroadcastd xm3, [o(pw_2896x8)] - pmulhrsw xm3, [cq] - vpbroadcastd xm0, [o(pw_16384)] - vpbroadcastd xm1, [o(pw_1697x8)] - pmulhrsw xm3, xm0 - psrlw xm0, 3 ; pw_2048 - pmulhrsw xm1, xm3 - paddsw xm3, xm1 - pmulhrsw xm3, xm0 - punpcklwd xm3, xm3 - punpckldq xm1, xm3, xm3 - punpckhdq xm3, xm3 - vpbroadcastq m0, xm1 - vpermq m1, m1, q1111 - vpbroadcastq m2, xm3 - vpermq m3, m3, q1111 - jmp m(iadst_16x4_internal).end2 -%elifidn %1_%2, identity_dct - mova xm0, [cq+16*0] - mova xm2, [cq+16*1] - vinserti128 m0, m0, [cq+16*4], 1 - 
vinserti128 m2, m2, [cq+16*5], 1 - mova xm1, [cq+16*2] - mova xm3, [cq+16*3] - vinserti128 m1, m1, [cq+16*6], 1 - vinserti128 m3, m3, [cq+16*7], 1 - vpbroadcastd m4, [o(pw_1697x16)] - vpbroadcastd m5, [o(pw_16384)] - packusdw m0, m2 - packusdw m1, m3 - packusdw m0, m1 - vpbroadcastd m1, [o(pw_2896x8)] - pmulhrsw m4, m0 - pmulhrsw m4, m5 - paddsw m0, m4 - psrlw m5, 3 ; pw_2048 - pmulhrsw m0, m1 - pmulhrsw m0, m5 - mov r3d, 2 -.end: - pxor m3, m3 -.end_loop: - mova [cq+32*0], m3 - mova [cq+32*1], m3 - add cq, 32*2 - WRITE_16X2 0, 0, 1, 2, strideq*0, strideq*1 - lea dstq, [dstq+strideq*2] - dec r3d - jg .end_loop - RET -%else +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4 +%ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] -%ifidn %2, dct movd xm2, [o(pw_16384)] mov [cq], eobd mov r2d, 2 @@ -2267,7 +1918,7 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 pxor m3, m3 .dconly_loop: mova xm1, [dstq] - vinserti128 m1, m1, [dstq+strideq], 1 + vinserti128 m1, [dstq+strideq], 1 punpckhbw m2, m1, m3 punpcklbw m1, m3 paddw m2, m0 @@ -2279,35 +1930,13 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2 dec r2d jg .dconly_loop RET -%else ; adst / flipadst - movd xm2, [o(pw_16384)] - pmulhrsw xm0, xm2 - vpbroadcastw m0, xm0 - pmulhrsw m0, [o(iadst4_dconly2a)] - vpbroadcastd m3, [o(pw_2048)] - mov [cq], eobd - pmulhrsw m3, m0 -%ifidn %2, adst - vpbroadcastq m0, xm3 - vpermq m1, m3, q1111 - vpermq m2, m3, q2222 - vpermq m3, m3, q3333 -%else ; flipadst - vpermq m0, m3, q3333 - vpermq m1, m3, q2222 - vpermq m2, m3, q1111 - vpbroadcastq m3, xm3 -%endif - jmp m(iadst_16x4_internal).end3 -%endif -%endif %endif %endmacro -INV_TXFM_16X4_FN dct, dct, 0 -INV_TXFM_16X4_FN dct, adst, 0 -INV_TXFM_16X4_FN dct, flipadst, 0 -INV_TXFM_16X4_FN dct, identity, 3 +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity cglobal idct_16x4_internal, 0, 5, 11, 
dst, stride, c, eob, tx2 mova xm0, [cq+16*0] @@ -2481,20 +2110,20 @@ ALIGN function_align WRITE_16X2 1, 0, 4, 5, strideq*0, strideq*1 RET -INV_TXFM_16X4_FN identity, dct, 15 +INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 - mova xm2, [cq+16*0] - mova xm4, [cq+16*1] - vinserti128 m2, m2, [cq+16*4], 1 - vinserti128 m4, m4, [cq+16*5], 1 - mova xm0, [cq+16*2] - mova xm1, [cq+16*3] - vinserti128 m0, m0, [cq+16*6], 1 - vinserti128 m1, m1, [cq+16*7], 1 + mova xm2, [cq+16*0] + mova xm4, [cq+16*1] + vinserti128 m2, [cq+16*4], 1 + vinserti128 m4, [cq+16*5], 1 + mova xm0, [cq+16*2] + mova xm1, [cq+16*3] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 vpbroadcastd m7, [o(pw_1697x16)] vpbroadcastd m8, [o(pw_16384)] punpcklwd m3, m2, m4 @@ -2531,8 +2160,8 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 paddsw m3, m7 jmp m(iadst_16x4_internal).end -%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 16x8 +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] @@ -2541,59 +2170,6 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 pmulhrsw xm0, xm1 mov r2d, 4 jmp m(inv_txfm_add_dct_dct_16x4).dconly -%elifidn %1_%2, dct_identity - WIN64_SPILL_XMM 13 - vbroadcasti128 m7, [cq] - vpbroadcastd m0, [o(pw_2896x8)] - vpbroadcastd m1, [o(pw_16384)] - pxor xm2, xm2 - mova [cq], xm2 - pmulhrsw m7, m0 - pmulhrsw m7, m0 - pmulhrsw m7, m1 - psrlw m1, 2 ; pw_4096 - pmulhrsw m7, m1 - punpcklwd m3, m7, m7 - punpckhwd m7, m7 - pshufd m0, m3, q0000 - pshufd m1, m3, q1111 - pshufd m2, m3, q2222 - pshufd m3, m3, q3333 - pshufd m4, m7, q0000 - pshufd m5, m7, q1111 - pshufd m6, m7, q2222 - pshufd m7, m7, q3333 - lea r3, [strideq*3] - WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 - 
WRITE_16X2 2, 3, 0, 1, strideq*2, r3 - jmp m(idct_16x8_internal).end4 -%elifidn %1_%2, identity_dct - mova m0, [cq+32*0] - packusdw m0, [cq+32*1] - mova m2, [cq+32*2] - packusdw m2, [cq+32*3] - mova m1, [cq+32*4] - packusdw m1, [cq+32*5] - mova m3, [cq+32*6] - packusdw m3, [cq+32*7] - vpbroadcastd m4, [o(pw_2896x8)] - vpbroadcastd m5, [o(pw_1697x16)] - packusdw m0, m2 - packusdw m1, m3 - vpbroadcastd m2, [o(pw_16384)] - packusdw m0, m1 - vpermq m1, m0, q3322 - vpermq m0, m0, q1100 - punpcklwd m0, m1 - pmulhrsw m0, m4 - pmulhrsw m5, m0 - pmulhrsw m5, m2 - paddsw m0, m5 - psrlw m2, 3 ; pw_2048 - pmulhrsw m0, m4 - pmulhrsw m0, m2 - mov r3d, 4 - jmp m(inv_txfm_add_identity_dct_16x4).end %endif %endmacro @@ -2611,10 +2187,10 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2 REPX {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4 %endmacro -INV_TXFM_16X8_FN dct, dct, 0 -INV_TXFM_16X8_FN dct, identity, 7 +INV_TXFM_16X8_FN dct, dct INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, identity cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 ITX_16X8_LOAD_COEFS 3120 @@ -2648,13 +2224,13 @@ cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 punpckldq m8, m9, m5 punpckhdq m9, m5 vperm2i128 m4, m0, m6, 0x31 - vinserti128 m0, m0, xm6, 1 + vinserti128 m0, xm6, 1 vperm2i128 m5, m1, m7, 0x31 - vinserti128 m1, m1, xm7, 1 + vinserti128 m1, xm7, 1 vperm2i128 m6, m2, m8, 0x31 - vinserti128 m2, m2, xm8, 1 + vinserti128 m2, xm8, 1 vperm2i128 m7, m3, m9, 0x31 - vinserti128 m3, m3, xm9, 1 + vinserti128 m3, xm9, 1 jmp tx2q .pass2: call .main @@ -2811,13 +2387,13 @@ cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 punpckldq m5, m8, m2 punpckhdq m8, m2 vinserti128 m2, m6, xm5, 1 - vperm2i128 m6, m6, m5, 0x31 + vperm2i128 m6, m5, 0x31 vperm2i128 m5, m1, m4, 0x31 - vinserti128 m1, m1, xm4, 1 + vinserti128 m1, xm4, 1 vperm2i128 m4, m0, m3, 0x31 - vinserti128 m0, m0, xm3, 1 + vinserti128 m0, xm3, 1 
vinserti128 m3, m7, xm8, 1 - vperm2i128 m7, m7, m8, 0x31 + vperm2i128 m7, m8, 0x31 jmp tx2q .pass2: call m(iadst_16x8_internal).main @@ -2837,30 +2413,30 @@ cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 WRITE_16X2 1, 2, 0, 1, strideq*2, r3 jmp m(idct_16x8_internal).end3 -INV_TXFM_16X8_FN identity, dct, 15 +INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 - mova xm7, [cq+16*0] - mova xm2, [cq+16*1] + mova xm7, [cq+16*0] + mova xm2, [cq+16*1] add cq, 16*8 vpbroadcastd m3, [o(pw_2896x8)] - vinserti128 m7, m7, [cq+16*0], 1 - vinserti128 m2, m2, [cq+16*1], 1 - mova xm6, [cq-16*6] - mova xm4, [cq-16*5] - vinserti128 m6, m6, [cq+16*2], 1 - vinserti128 m4, m4, [cq+16*3], 1 - mova xm8, [cq-16*4] - mova xm5, [cq-16*3] - vinserti128 m8, m8, [cq+16*4], 1 - vinserti128 m5, m5, [cq+16*5], 1 - mova xm0, [cq-16*2] - mova xm1, [cq-16*1] - vinserti128 m0, m0, [cq+16*6], 1 - vinserti128 m1, m1, [cq+16*7], 1 + vinserti128 m7, [cq+16*0], 1 + vinserti128 m2, [cq+16*1], 1 + mova xm6, [cq-16*6] + mova xm4, [cq-16*5] + vinserti128 m6, [cq+16*2], 1 + vinserti128 m4, [cq+16*3], 1 + mova xm8, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m8, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm0, [cq-16*2] + mova xm1, [cq-16*1] + vinserti128 m0, [cq+16*6], 1 + vinserti128 m1, [cq+16*7], 1 vpbroadcastd m10, [o(pw_1697x16)] vpbroadcastd m11, [o(pw_16384)] REPX {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1 @@ -2896,8 +2472,8 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 %define o_base pw_5 + 128 -%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 16x16 +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16 %ifidn %1_%2, dct_dct movd xm1, [o(pw_2896x8)] pmulhrsw xm0, xm1, [cq] @@ -2905,72 +2481,6 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, 
stride, c, eob, tx2 mov [cq], eobd mov r2d, 8 jmp m(inv_txfm_add_dct_dct_16x4).dconly -%elifidn %1_%2, dct_identity - WIN64_SPILL_XMM 7 - vpbroadcastd m3, [o(pw_2896x8)] - pmulhrsw m3, [cq] - vpbroadcastd m0, [o(pw_8192)] - vpbroadcastd m1, [o(pw_1697x16)] - vpbroadcastw m4, [o(deint_shuf)] ; pb_0_1 - pcmpeqb m5, m5 - pxor m6, m6 - mova [cq], m6 - paddb m5, m5 ; pb_m2 - pmulhrsw m3, m0 - psrlw m0, 2 ; pw_2048 - IDTX16 3, 1, 1 - pmulhrsw m3, m0 - mov r3d, 8 -.loop: - mova xm1, [dstq] - vinserti128 m1, m1, [dstq+strideq*8], 1 - pshufb m0, m3, m4 - psubb m4, m5 ; += 2 - punpckhbw m2, m1, m6 - punpcklbw m1, m6 - paddw m2, m0 - paddw m1, m0 - packuswb m1, m2 - mova [dstq], xm1 - vextracti128 [dstq+strideq*8], m1, 1 - add dstq, strideq - dec r3d - jg .loop - RET -%elifidn %1_%2, identity_dct - movd xm0, [cq+32*0 ] - movd xm2, [cq+32*1 ] - movd xm1, [cq+32*2 ] - movd xm3, [cq+32*3 ] - vinserti128 m0, m0, [cq+32*8 ], 1 - vinserti128 m2, m2, [cq+32*9 ], 1 - vinserti128 m1, m1, [cq+32*10], 1 - vinserti128 m3, m3, [cq+32*11], 1 - punpcklwd m0, m2 - punpcklwd m1, m3 - punpckldq m0, m1 - movd xm1, [cq+32*4 ] - movd xm3, [cq+32*5 ] - movd xm2, [cq+32*6 ] - movd xm4, [cq+32*7 ] - vinserti128 m1, m1, [cq+32*12], 1 - vinserti128 m3, m3, [cq+32*13], 1 - vinserti128 m2, m2, [cq+32*14], 1 - vinserti128 m4, m4, [cq+32*15], 1 - punpcklwd m1, m3 - vpbroadcastd m3, [o(pw_1697x16)] - punpcklwd m2, m4 - vpbroadcastd m4, [o(pw_2896x8)] - punpckldq m1, m2 - vpbroadcastd m2, [o(pw_2048)] - punpcklqdq m0, m1 - pmulhrsw m3, m0 - psraw m3, 1 - pavgw m0, m3 - pmulhrsw m0, m4 - pmulhrsw m0, m2 - mov r3d, 8 - jmp m(inv_txfm_add_identity_dct_16x4).end %endif %endmacro @@ -2995,10 +2505,10 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2 mova [rsp], m15 %endmacro -INV_TXFM_16X16_FN dct, dct, 0 -INV_TXFM_16X16_FN dct, identity, 15 +INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, identity cglobal idct_16x16_internal, 0, 
5, 16, 32*3, dst, stride, c, eob, tx2 ITX_16X16_LOAD_COEFS @@ -3014,19 +2524,19 @@ cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 REPX {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15 pmulhrsw m1, [rsp+32*1] vperm2i128 m8, m1, m9, 0x31 - vinserti128 m1, m1, xm9, 1 + vinserti128 m1, xm9, 1 vperm2i128 m9, m2, m10, 0x31 - vinserti128 m2, m2, xm10, 1 + vinserti128 m2, xm10, 1 vperm2i128 m10, m3, m11, 0x31 - vinserti128 m3, m3, xm11, 1 + vinserti128 m3, xm11, 1 vperm2i128 m11, m4, m12, 0x31 - vinserti128 m4, m4, xm12, 1 + vinserti128 m4, xm12, 1 vperm2i128 m12, m5, m13, 0x31 - vinserti128 m5, m5, xm13, 1 + vinserti128 m5, xm13, 1 vperm2i128 m13, m6, m14, 0x31 - vinserti128 m6, m6, xm14, 1 + vinserti128 m6, xm14, 1 vperm2i128 m14, m7, m15, 0x31 - vinserti128 m7, m7, xm15, 1 + vinserti128 m7, xm15, 1 mova m15, [rsp+32*2] .pass1_end3: punpcklwd m0, m9, m10 @@ -3395,7 +2905,7 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 pavgw m%1, m%2 ; signs are guaranteed to be equal %endmacro -INV_TXFM_16X16_FN identity, dct, 15 +INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2 @@ -3456,7 +2966,7 @@ ALIGN function_align paddsw m15, m1 jmp m(idct_16x16_internal).end -%define o_base iadst4_dconly2a + 128 +%define o_base deint_shuf + 128 %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2 %if %3 @@ -3526,13 +3036,13 @@ cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob LOAD_8ROWS cq+32*1, 32*2 call m(idct_16x8_internal).main vperm2i128 m11, m0, m4, 0x31 - vinserti128 m0, m0, xm4, 1 + vinserti128 m0, xm4, 1 vperm2i128 m4, m1, m5, 0x31 - vinserti128 m1, m1, xm5, 1 + vinserti128 m1, xm5, 1 vperm2i128 m5, m2, m6, 0x31 - vinserti128 m2, m2, xm6, 1 + vinserti128 m2, xm6, 1 vperm2i128 m6, m3, m7, 0x31 - vinserti128 m3, m3, xm7, 1 + vinserti128 m3, xm7, 1 pxor m7, m7 REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15 punpckhwd m7, m0, m1 @@ 
-3566,13 +3076,13 @@ cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob LOAD_8ROWS cq+32*0, 32*2 call m(idct_16x8_internal).main vperm2i128 m8, m0, m4, 0x31 - vinserti128 m0, m0, xm4, 1 + vinserti128 m0, xm4, 1 vperm2i128 m4, m1, m5, 0x31 - vinserti128 m1, m1, xm5, 1 + vinserti128 m1, xm5, 1 vperm2i128 m5, m2, m6, 0x31 - vinserti128 m2, m2, xm6, 1 + vinserti128 m2, xm6, 1 vperm2i128 m6, m3, m7, 0x31 - vinserti128 m3, m3, xm7, 1 + vinserti128 m3, xm7, 1 vpbroadcastd m9, [o(pw_8192)] pxor m7, m7 REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14 @@ -3775,7 +3285,7 @@ ALIGN function_align %macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2] vbroadcasti128 m%1, [cq+16*%3] vbroadcasti128 m%2, [cq+16*%4] - shufpd m%1, m%1, m%2, 0x0c + shufpd m%1, m%2, 0x0c %endmacro cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob @@ -3877,13 +3387,13 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob pmulhrsw m12, [rsp+32*0] mova [rsp+32*0], m8 vperm2i128 m4, m0, m6, 0x31 - vinserti128 m0, m0, xm6, 1 + vinserti128 m0, xm6, 1 vperm2i128 m5, m1, m7, 0x31 - vinserti128 m1, m1, xm7, 1 + vinserti128 m1, xm7, 1 vperm2i128 m6, m2, m9, 0x31 - vinserti128 m2, m2, xm9, 1 + vinserti128 m2, xm9, 1 vperm2i128 m7, m3, m10, 0x31 - vinserti128 m3, m3, xm10, 1 + vinserti128 m3, xm10, 1 call m(idct_16x8_internal).main vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 @@ -3922,13 +3432,13 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob punpckldq m9, m12, m5 punpckhdq m12, m5 vperm2i128 m4, m0, m6, 0x31 - vinserti128 m0, m0, xm6, 1 + vinserti128 m0, xm6, 1 vperm2i128 m5, m1, m7, 0x31 - vinserti128 m1, m1, xm7, 1 + vinserti128 m1, xm7, 1 vperm2i128 m6, m2, m9, 0x31 - vinserti128 m2, m2, xm9, 1 + vinserti128 m2, xm9, 1 vperm2i128 m7, m3, m12, 0x31 - vinserti128 m3, m3, xm12, 1 + vinserti128 m3, xm12, 1 call m(idct_16x8_internal).main2 vpbroadcastd m8, [o(pw_2048)] REPX {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7 
@@ -3947,26 +3457,26 @@ cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob lea r4, [strideq*3] sub eobd, 107 ; loop_iterations = 1 + (eobd >= 107) .loop: - mova xm0, [cq+16* 0] - mova xm1, [cq+16* 4] - vinserti128 m0, m0, [cq+16* 1], 1 - vinserti128 m1, m1, [cq+16* 5], 1 + mova xm0, [cq+16* 0] + mova xm1, [cq+16* 4] + vinserti128 m0, [cq+16* 1], 1 + vinserti128 m1, [cq+16* 5], 1 pxor m8, m8 mova [cq+32*0], m8 mova [cq+32*2], m8 add cq, 16*16 - mova xm2, [cq-16* 8] - mova xm3, [cq-16* 4] - vinserti128 m2, m2, [cq-16* 7], 1 - vinserti128 m3, m3, [cq-16* 3], 1 - mova xm4, [cq+16* 0] - mova xm5, [cq+16* 4] - vinserti128 m4, m4, [cq+16* 1], 1 - vinserti128 m5, m5, [cq+16* 5], 1 - mova xm6, [cq+16* 8] - mova xm7, [cq+16*12] - vinserti128 m6, m6, [cq+16* 9], 1 - vinserti128 m7, m7, [cq+16*13], 1 + mova xm2, [cq-16* 8] + mova xm3, [cq-16* 4] + vinserti128 m2, [cq-16* 7], 1 + vinserti128 m3, [cq-16* 3], 1 + mova xm4, [cq+16* 0] + mova xm5, [cq+16* 4] + vinserti128 m4, [cq+16* 1], 1 + vinserti128 m5, [cq+16* 5], 1 + mova xm6, [cq+16* 8] + mova xm7, [cq+16*12] + vinserti128 m6, [cq+16* 9], 1 + vinserti128 m7, [cq+16*13], 1 REPX {mova [cq+32*x], m8}, -4, -2, 0, 2, 4, 6 REPX {paddsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 call .transpose8x8 @@ -4019,22 +3529,22 @@ cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob lea r5, [dstq+strideq*4] sub eobd, 107 .loop: - mova xm0, [cq-16*8] - mova xm1, [cq-16*7] - vinserti128 m0, m0, [cq+16*0], 1 - vinserti128 m1, m1, [cq+16*1], 1 - mova xm2, [cq-16*6] - mova xm3, [cq-16*5] - vinserti128 m2, m2, [cq+16*2], 1 - vinserti128 m3, m3, [cq+16*3], 1 - mova xm4, [cq-16*4] - mova xm5, [cq-16*3] - vinserti128 m4, m4, [cq+16*4], 1 - vinserti128 m5, m5, [cq+16*5], 1 - mova xm6, [cq-16*2] - mova xm7, [cq-16*1] - vinserti128 m6, m6, [cq+16*6], 1 - vinserti128 m7, m7, [cq+16*7], 1 + mova xm0, [cq-16*8] + mova xm1, [cq-16*7] + vinserti128 m0, [cq+16*0], 1 + vinserti128 m1, [cq+16*1], 1 + mova xm2, [cq-16*6] 
+ mova xm3, [cq-16*5] + vinserti128 m2, [cq+16*2], 1 + vinserti128 m3, [cq+16*3], 1 + mova xm4, [cq-16*4] + mova xm5, [cq-16*3] + vinserti128 m4, [cq+16*4], 1 + vinserti128 m5, [cq+16*5], 1 + mova xm6, [cq-16*2] + mova xm7, [cq-16*1] + vinserti128 m6, [cq+16*6], 1 + vinserti128 m7, [cq+16*7], 1 pxor m8, m8 REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 @@ -4206,28 +3716,28 @@ cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob vextracti128 [r2+32*3+16], m14, 1 vinserti128 m8, m1, xm9, 1 vperm2i128 m12, m1, m9, 0x31 - mova xm0, [tmp1q-32*4] - mova xm1, [tmp1q-32*3] - vinserti128 m0, m0, [tmp1q+32*0], 1 - vinserti128 m1, m1, [tmp1q+32*1], 1 + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp1q+32*0], 1 + vinserti128 m1, [tmp1q+32*1], 1 vinserti128 m10, m5, xm13, 1 vperm2i128 m14, m5, m13, 0x31 - mova xm4, [tmp1q-32*4+16] - mova xm5, [tmp1q-32*3+16] - vinserti128 m4, m4, [tmp1q+32*0+16], 1 - vinserti128 m5, m5, [tmp1q+32*1+16], 1 + mova xm4, [tmp1q-32*4+16] + mova xm5, [tmp1q-32*3+16] + vinserti128 m4, [tmp1q+32*0+16], 1 + vinserti128 m5, [tmp1q+32*1+16], 1 vinserti128 m9, m3, xm11, 1 vperm2i128 m13, m3, m11, 0x31 - mova xm2, [tmp1q-32*2] - mova xm3, [tmp1q-32*1] - vinserti128 m2, m2, [tmp1q+32*2], 1 - vinserti128 m3, m3, [tmp1q+32*3], 1 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp1q+32*2], 1 + vinserti128 m3, [tmp1q+32*3], 1 vinserti128 m11, m7, xm15, 1 vperm2i128 m15, m7, m15, 0x31 - mova xm6, [tmp1q-32*2+16] - mova xm7, [tmp1q-32*1+16] - vinserti128 m6, m6, [tmp1q+32*2+16], 1 - vinserti128 m7, m7, [tmp1q+32*3+16], 1 + mova xm6, [tmp1q-32*2+16] + mova xm7, [tmp1q-32*1+16] + vinserti128 m6, [tmp1q+32*2+16], 1 + vinserti128 m7, [tmp1q+32*3+16], 1 call .main_oddhalf LOAD_8ROWS_H r2-32*4, 32 .idct16: @@ -4475,7 +3985,7 @@ ALIGN function_align mova [tmp1q+32*(11-%2)], xm%2 vextracti128 [tmp2q+32*( 3-%1)], m%2, 1 vperm2i128 m%2, m%1, m%4, 0x31 - 
vinserti128 m%1, m%1, xm%4, 1 + vinserti128 m%1, xm%4, 1 %endmacro cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob @@ -4593,22 +4103,22 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob mov rax, cq paddw m11, m12, m12 ; pw_16384 .loop: - mova xm0, [cq+64* 0] - mova xm1, [cq+64* 1] - vinserti128 m0, m0, [cq+64* 8], 1 - vinserti128 m1, m1, [cq+64* 9], 1 - mova xm2, [cq+64* 2] - mova xm3, [cq+64* 3] - vinserti128 m2, m2, [cq+64*10], 1 - vinserti128 m3, m3, [cq+64*11], 1 - mova xm4, [cq+64* 4] - mova xm5, [cq+64* 5] - vinserti128 m4, m4, [cq+64*12], 1 - vinserti128 m5, m5, [cq+64*13], 1 - mova xm6, [cq+64* 6] - mova xm7, [cq+64* 7] - vinserti128 m6, m6, [cq+64*14], 1 - vinserti128 m7, m7, [cq+64*15], 1 + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 @@ -4661,22 +4171,22 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob mov r5, dstq mov rax, cq .loop: - mova xm0, [cq+32* 0] - mova xm1, [cq+32* 1] - vinserti128 m0, m0, [cq+32* 8], 1 - vinserti128 m1, m1, [cq+32* 9], 1 - mova xm2, [cq+32* 2] - mova xm3, [cq+32* 3] - vinserti128 m2, m2, [cq+32*10], 1 - vinserti128 m3, m3, [cq+32*11], 1 - mova xm4, [cq+32* 4] - mova xm5, [cq+32* 5] - vinserti128 m4, m4, [cq+32*12], 1 - vinserti128 m5, m5, [cq+32*13], 1 - mova xm6, [cq+32* 6] - mova xm7, [cq+32* 7] - vinserti128 m6, m6, [cq+32*14], 1 - vinserti128 m7, m7, [cq+32*15], 1 + mova xm0, [cq+32* 0] + mova xm1, 
[cq+32* 1] + vinserti128 m0, [cq+32* 8], 1 + vinserti128 m1, [cq+32* 9], 1 + mova xm2, [cq+32* 2] + mova xm3, [cq+32* 3] + vinserti128 m2, [cq+32*10], 1 + vinserti128 m3, [cq+32*11], 1 + mova xm4, [cq+32* 4] + mova xm5, [cq+32* 5] + vinserti128 m4, [cq+32*12], 1 + vinserti128 m5, [cq+32*13], 1 + mova xm6, [cq+32* 6] + mova xm7, [cq+32* 7] + vinserti128 m6, [cq+32*14], 1 + vinserti128 m7, [cq+32*15], 1 REPX {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7 REPX {paddsw x, x }, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 @@ -4864,22 +4374,22 @@ cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob mov r5, dstq lea rax, [cq+32] .loop: - mova xm0, [cq+64* 0] - mova xm1, [cq+64* 1] - vinserti128 m0, m0, [cq+64* 8], 1 - vinserti128 m1, m1, [cq+64* 9], 1 - mova xm2, [cq+64* 2] - mova xm3, [cq+64* 3] - vinserti128 m2, m2, [cq+64*10], 1 - vinserti128 m3, m3, [cq+64*11], 1 - mova xm4, [cq+64* 4] - mova xm5, [cq+64* 5] - vinserti128 m4, m4, [cq+64*12], 1 - vinserti128 m5, m5, [cq+64*13], 1 - mova xm6, [cq+64* 6] - mova xm7, [cq+64* 7] - vinserti128 m6, m6, [cq+64*14], 1 - vinserti128 m7, m7, [cq+64*15], 1 + mova xm0, [cq+64* 0] + mova xm1, [cq+64* 1] + vinserti128 m0, [cq+64* 8], 1 + vinserti128 m1, [cq+64* 9], 1 + mova xm2, [cq+64* 2] + mova xm3, [cq+64* 3] + vinserti128 m2, [cq+64*10], 1 + vinserti128 m3, [cq+64*11], 1 + mova xm4, [cq+64* 4] + mova xm5, [cq+64* 5] + vinserti128 m4, [cq+64*12], 1 + vinserti128 m5, [cq+64*13], 1 + mova xm6, [cq+64* 6] + mova xm7, [cq+64* 7] + vinserti128 m6, [cq+64*14], 1 + vinserti128 m7, [cq+64*15], 1 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 REPX {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7 WRITE_16X2 0, 1, 8, 0, strideq*0, strideq*1 @@ -5022,27 +4532,27 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob add eobd, 0x80000000 jnc .pass1_loop lea r2, [rsp+32*23] - mova xm0, [r2-32*4+ 0] - mova xm1, [r2-32*2+ 0] - vinserti128 m0, m0, 
[r2+32*0+ 0], 1 - vinserti128 m1, m1, [r2+32*2+ 0], 1 - mova xm2, [r2-32*4+16] - mova xm3, [r2-32*2+16] - vinserti128 m2, m2, [r2+32*0+16], 1 - vinserti128 m3, m3, [r2+32*2+16], 1 + mova xm0, [r2-32*4+ 0] + mova xm1, [r2-32*2+ 0] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m1, [r2+32*2+ 0], 1 + mova xm2, [r2-32*4+16] + mova xm3, [r2-32*2+16] + vinserti128 m2, [r2+32*0+16], 1 + vinserti128 m3, [r2+32*2+16], 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14 test r7d, r7d jl .fast lea r3, [r2+32*8] - mova xm4, [r3-32*4+ 0] - mova xm5, [r3-32*2+ 0] - vinserti128 m4, m4, [r3+32*0+ 0], 1 - vinserti128 m5, m5, [r3+32*2+ 0], 1 - mova xm6, [r3-32*4+16] - mova xm7, [r3-32*2+16] - vinserti128 m6, m6, [r3+32*0+16], 1 - vinserti128 m7, m7, [r3+32*2+16], 1 + mova xm4, [r3-32*4+ 0] + mova xm5, [r3-32*2+ 0] + vinserti128 m4, [r3+32*0+ 0], 1 + vinserti128 m5, [r3+32*2+ 0], 1 + mova xm6, [r3-32*4+16] + mova xm7, [r3-32*2+16] + vinserti128 m6, [r3+32*0+16], 1 + vinserti128 m7, [r3+32*2+16], 1 .fast: mova [rsp], m8 lea tmp1q, [rsp+32*7] @@ -5065,26 +4575,26 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob mova [tmp1q+32*1], m13 mova [tmp1q+32*2], m14 mova [tmp1q+32*3], m15 - mova xm0, [r2-32*3+ 0] - mova xm1, [r2-32*1+ 0] - vinserti128 m0, m0, [r2+32*1+ 0], 1 - vinserti128 m1, m1, [r2+32*3+ 0], 1 - mova xm2, [r2-32*3+16] - mova xm3, [r2-32*1+16] - vinserti128 m2, m2, [r2+32*1+16], 1 - vinserti128 m3, m3, [r2+32*3+16], 1 + mova xm0, [r2-32*3+ 0] + mova xm1, [r2-32*1+ 0] + vinserti128 m0, [r2+32*1+ 0], 1 + vinserti128 m1, [r2+32*3+ 0], 1 + mova xm2, [r2-32*3+16] + mova xm3, [r2-32*1+16] + vinserti128 m2, [r2+32*1+16], 1 + vinserti128 m3, [r2+32*3+16], 1 pxor m4, m4 REPX {mova x, m4}, m5, m6, m7 test r7d, r7d jl .fast2 - mova xm4, [r3-32*3+ 0] - mova xm5, [r3-32*1+ 0] - vinserti128 m4, m4, [r3+32*1+ 0], 1 - vinserti128 m5, m5, [r3+32*3+ 0], 1 - mova xm6, [r3-32*3+16] - mova xm7, [r3-32*1+16] - vinserti128 m6, m6, [r3+32*1+16], 1 - 
vinserti128 m7, m7, [r3+32*3+16], 1 + mova xm4, [r3-32*3+ 0] + mova xm5, [r3-32*1+ 0] + vinserti128 m4, [r3+32*1+ 0], 1 + vinserti128 m5, [r3+32*3+ 0], 1 + mova xm6, [r3-32*3+16] + mova xm7, [r3-32*1+16] + vinserti128 m6, [r3+32*1+16], 1 + vinserti128 m7, [r3+32*3+16], 1 .fast2: add tmp1q, 32*8 lea tmp2q, [tmp1q+32*8] @@ -5093,53 +4603,53 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob vpbroadcastd m15, [o(pd_2048)] add tmp1q, 32*16 add tmp2q, 32*32 - mova xm0, [r2-32*4+ 0] - mova xm3, [r2-32*1+16] - vinserti128 m0, m0, [r2+32*0+ 0], 1 - vinserti128 m3, m3, [r2+32*3+16], 1 - mova xm4, [r2-32*4+16] - mova xm7, [r2-32*1+ 0] - vinserti128 m4, m4, [r2+32*0+16], 1 - vinserti128 m7, m7, [r2+32*3+ 0], 1 + mova xm0, [r2-32*4+ 0] + mova xm3, [r2-32*1+16] + vinserti128 m0, [r2+32*0+ 0], 1 + vinserti128 m3, [r2+32*3+16], 1 + mova xm4, [r2-32*4+16] + mova xm7, [r2-32*1+ 0] + vinserti128 m4, [r2+32*0+16], 1 + vinserti128 m7, [r2+32*3+ 0], 1 pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r7d, r7d jl .fast3 add r3, 32*24 - mova xm1, [r3-32*1+16] - mova xm2, [r3-32*4+ 0] - vinserti128 m1, m1, [r3+32*3+16], 1 - vinserti128 m2, m2, [r3+32*0+ 0], 1 - mova xm5, [r3-32*1+ 0] - mova xm6, [r3-32*4+16] - vinserti128 m5, m5, [r3+32*3+ 0], 1 - vinserti128 m6, m6, [r3+32*0+16], 1 + mova xm1, [r3-32*1+16] + mova xm2, [r3-32*4+ 0] + vinserti128 m1, [r3+32*3+16], 1 + vinserti128 m2, [r3+32*0+ 0], 1 + mova xm5, [r3-32*1+ 0] + mova xm6, [r3-32*4+16] + vinserti128 m5, [r3+32*3+ 0], 1 + vinserti128 m6, [r3+32*0+16], 1 .fast3: add rax, o_idct64_offset call m(inv_txfm_add_dct_dct_16x64).main_part1 add rax, 8 add tmp1q, 32*8 sub tmp2q, 32*8 - mova xm0, [r2-32*2+ 0] - mova xm3, [r2-32*3+16] - vinserti128 m0, m0, [r2+32*2+ 0], 1 - vinserti128 m3, m3, [r2+32*1+16], 1 - mova xm4, [r2-32*2+16] - mova xm7, [r2-32*3+ 0] - vinserti128 m4, m4, [r2+32*2+16], 1 - vinserti128 m7, m7, [r2+32*1+ 0], 1 + mova xm0, [r2-32*2+ 0] + mova xm3, [r2-32*3+16] + vinserti128 m0, [r2+32*2+ 0], 1 + 
vinserti128 m3, [r2+32*1+16], 1 + mova xm4, [r2-32*2+16] + mova xm7, [r2-32*3+ 0] + vinserti128 m4, [r2+32*2+16], 1 + vinserti128 m7, [r2+32*1+ 0], 1 pxor m1, m1 REPX {mova x, m1}, m2, m5, m6 test r7d, r7d jl .fast4 - mova xm1, [r3-32*3+16] - mova xm2, [r3-32*2+ 0] - vinserti128 m1, m1, [r3+32*1+16], 1 - vinserti128 m2, m2, [r3+32*2+ 0], 1 - mova xm5, [r3-32*3+ 0] - mova xm6, [r3-32*2+16] - vinserti128 m5, m5, [r3+32*1+ 0], 1 - vinserti128 m6, m6, [r3+32*2+16], 1 + mova xm1, [r3-32*3+16] + mova xm2, [r3-32*2+ 0] + vinserti128 m1, [r3+32*1+16], 1 + vinserti128 m2, [r3+32*2+ 0], 1 + mova xm5, [r3-32*3+ 0] + mova xm6, [r3-32*2+16] + vinserti128 m5, [r3+32*1+ 0], 1 + vinserti128 m6, [r3+32*2+16], 1 .fast4: call m(inv_txfm_add_dct_dct_16x64).main_part1 call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2 @@ -5423,38 +4933,38 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob mov tmp2d, 4 .pass2_loop: lea r3, [tmp1q-32*8] - mova xm0, [r3 -32*4] - mova xm1, [r3 -32*3] - vinserti128 m0, m0, [tmp1q-32*4], 1 - vinserti128 m1, m1, [tmp1q-32*3], 1 - mova xm2, [r3 -32*2] - mova xm3, [r3 -32*1] - vinserti128 m2, m2, [tmp1q-32*2], 1 - vinserti128 m3, m3, [tmp1q-32*1], 1 - mova xm4, [r3 +32*0] - mova xm5, [r3 +32*1] - vinserti128 m4, m4, [tmp1q+32*0], 1 - vinserti128 m5, m5, [tmp1q+32*1], 1 - mova xm6, [r3 +32*2] - mova xm7, [r3 +32*3] - vinserti128 m6, m6, [tmp1q+32*2], 1 - vinserti128 m7, m7, [tmp1q+32*3], 1 - mova xm8, [r3 -32*4+16] - mova xm9, [r3 -32*3+16] - vinserti128 m8, m8, [tmp1q-32*4+16], 1 - vinserti128 m9, m9, [tmp1q-32*3+16], 1 - mova xm10, [r3 -32*2+16] - mova xm11, [r3 -32*1+16] - vinserti128 m10, m10, [tmp1q-32*2+16], 1 - vinserti128 m11, m11, [tmp1q-32*1+16], 1 - mova xm12, [r3 +32*0+16] - mova xm13, [r3 +32*1+16] - vinserti128 m12, m12, [tmp1q+32*0+16], 1 - vinserti128 m13, m13, [tmp1q+32*1+16], 1 - mova xm14, [r3 +32*2+16] - mova xm15, [r3 +32*3+16] - vinserti128 m14, m14, [tmp1q+32*2+16], 1 - vinserti128 m15, m15, [tmp1q+32*3+16], 1 + mova xm0, 
[r3 -32*4] + mova xm1, [r3 -32*3] + vinserti128 m0, [tmp1q-32*4], 1 + vinserti128 m1, [tmp1q-32*3], 1 + mova xm2, [r3 -32*2] + mova xm3, [r3 -32*1] + vinserti128 m2, [tmp1q-32*2], 1 + vinserti128 m3, [tmp1q-32*1], 1 + mova xm4, [r3 +32*0] + mova xm5, [r3 +32*1] + vinserti128 m4, [tmp1q+32*0], 1 + vinserti128 m5, [tmp1q+32*1], 1 + mova xm6, [r3 +32*2] + mova xm7, [r3 +32*3] + vinserti128 m6, [tmp1q+32*2], 1 + vinserti128 m7, [tmp1q+32*3], 1 + mova xm8, [r3 -32*4+16] + mova xm9, [r3 -32*3+16] + vinserti128 m8, [tmp1q-32*4+16], 1 + vinserti128 m9, [tmp1q-32*3+16], 1 + mova xm10, [r3 -32*2+16] + mova xm11, [r3 -32*1+16] + vinserti128 m10, [tmp1q-32*2+16], 1 + vinserti128 m11, [tmp1q-32*1+16], 1 + mova xm12, [r3 +32*0+16] + mova xm13, [r3 +32*1+16] + vinserti128 m12, [tmp1q+32*0+16], 1 + vinserti128 m13, [tmp1q+32*1+16], 1 + mova xm14, [r3 +32*2+16] + mova xm15, [r3 +32*3+16] + vinserti128 m14, [tmp1q+32*2+16], 1 + vinserti128 m15, [tmp1q+32*3+16], 1 mova [rsp+32*0], m6 mova [rsp+32*1], m7 vpbroadcastd m7, [o(pw_8192)] @@ -5810,48 +5320,48 @@ ALIGN function_align mov tmp3d, 4 .loop: lea tmp2q, [tmp1q+32*8] - mova xm0, [tmp1q-32*4] - mova xm1, [tmp1q-32*3] - vinserti128 m0, m0, [tmp2q-32*4], 1 - vinserti128 m1, m1, [tmp2q-32*3], 1 - mova xm2, [tmp1q-32*2] - mova xm3, [tmp1q-32*1] - vinserti128 m2, m2, [tmp2q-32*2], 1 - vinserti128 m3, m3, [tmp2q-32*1], 1 - mova xm4, [tmp1q+32*0] - mova xm5, [tmp1q+32*1] - vinserti128 m4, m4, [tmp2q+32*0], 1 - vinserti128 m5, m5, [tmp2q+32*1], 1 - mova xm6, [tmp1q+32*2] - mova xm7, [tmp1q+32*3] - vinserti128 m6, m6, [tmp2q+32*2], 1 - vinserti128 m7, m7, [tmp2q+32*3], 1 + mova xm0, [tmp1q-32*4] + mova xm1, [tmp1q-32*3] + vinserti128 m0, [tmp2q-32*4], 1 + vinserti128 m1, [tmp2q-32*3], 1 + mova xm2, [tmp1q-32*2] + mova xm3, [tmp1q-32*1] + vinserti128 m2, [tmp2q-32*2], 1 + vinserti128 m3, [tmp2q-32*1], 1 + mova xm4, [tmp1q+32*0] + mova xm5, [tmp1q+32*1] + vinserti128 m4, [tmp2q+32*0], 1 + vinserti128 m5, [tmp2q+32*1], 1 + mova xm6, 
[tmp1q+32*2] + mova xm7, [tmp1q+32*3] + vinserti128 m6, [tmp2q+32*2], 1 + vinserti128 m7, [tmp2q+32*3], 1 REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 call m(inv_txfm_add_identity_identity_8x32).transpose8x8 - mova xm8, [tmp1q-32*4+16] - mova xm9, [tmp1q-32*3+16] - vinserti128 m8, m8, [tmp2q-32*4+16], 1 - vinserti128 m9, m9, [tmp2q-32*3+16], 1 + mova xm8, [tmp1q-32*4+16] + mova xm9, [tmp1q-32*3+16] + vinserti128 m8, [tmp2q-32*4+16], 1 + vinserti128 m9, [tmp2q-32*3+16], 1 mova [tmp1q-32*4], m0 mova [tmp2q-32*4], m1 mova [tmp1q-32*3], m2 mova [tmp2q-32*3], m3 - mova xm2, [tmp1q-32*2+16] - mova xm3, [tmp1q-32*1+16] - vinserti128 m2, m2, [tmp2q-32*2+16], 1 - vinserti128 m3, m3, [tmp2q-32*1+16], 1 + mova xm2, [tmp1q-32*2+16] + mova xm3, [tmp1q-32*1+16] + vinserti128 m2, [tmp2q-32*2+16], 1 + vinserti128 m3, [tmp2q-32*1+16], 1 mova [tmp1q-32*2], m4 mova [tmp2q-32*2], m5 mova [tmp1q-32*1], m6 mova [tmp2q-32*1], m7 - mova xm4, [tmp1q+32*0+16] - mova xm5, [tmp1q+32*1+16] - vinserti128 m4, m4, [tmp2q+32*0+16], 1 - vinserti128 m5, m5, [tmp2q+32*1+16], 1 - mova xm6, [tmp1q+32*2+16] - mova xm7, [tmp1q+32*3+16] - vinserti128 m6, m6, [tmp2q+32*2+16], 1 - vinserti128 m7, m7, [tmp2q+32*3+16], 1 + mova xm4, [tmp1q+32*0+16] + mova xm5, [tmp1q+32*1+16] + vinserti128 m4, [tmp2q+32*0+16], 1 + vinserti128 m5, [tmp2q+32*1+16], 1 + mova xm6, [tmp1q+32*2+16] + mova xm7, [tmp1q+32*3+16] + vinserti128 m6, [tmp2q+32*2+16], 1 + vinserti128 m7, [tmp2q+32*3+16], 1 pmulhrsw m0, m8, m10 pmulhrsw m1, m9, m10 REPX {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7 diff --git a/ffmpeg/JNI/dav1d/src/x86/itx_ssse3.asm b/ffmpeg/JNI/dav1d/src/x86/itx_ssse3.asm index 9316981a5..91cf666b9 100644 --- a/ffmpeg/JNI/dav1d/src/x86/itx_ssse3.asm +++ b/ffmpeg/JNI/dav1d/src/x86/itx_ssse3.asm @@ -139,11 +139,6 @@ pw_2675x8: times 8 dw 2675*8 pw_4085x8: times 8 dw 4085*8 pw_m301x8: times 8 dw -301*8 -iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424 -iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568 
-iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856 -iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424 - SECTION .text %macro REPX 2-* @@ -243,31 +238,24 @@ SECTION .text paddsw m0, m2 ;high: out1 ;low: out0 %endmacro -%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack -cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2 - %undef cmp - %define %%p1 m(i%1_%4_internal) +%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack +cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2 + %define %%p1 m(i%1_%3_internal) %if ARCH_X86_32 LEA r5, $$ %endif %if has_epilogue -%if %3 > 0 - cmp eobd, %3 - jle %%end -%elif %3 == 0 +%ifidn %1_%2, dct_dct test eobd, eobd jz %%end %endif - lea tx2q, [o(m(i%2_%4_internal).pass2)] + lea tx2q, [o(m(i%2_%3_internal).pass2)] call %%p1 RET %%end: %else - lea tx2q, [o(m(i%2_%4_internal).pass2)] -%if %3 > 0 - cmp eobd, %3 - jg %%p1 -%elif %3 == 0 + lea tx2q, [o(m(i%2_%3_internal).pass2)] +%ifidn %1_%2, dct_dct test eobd, eobd jnz %%p1 %else @@ -278,63 +266,26 @@ ALIGN function_align %endif %endmacro -%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 4x4, 6 -%ifidn %1_%2, dct_identity - mova m0, [o(pw_2896x8)] - pmulhrsw m0, [coeffq] - pmulhrsw m1, m0, [o(pw_1697x8)] - paddsw m0, m1 - punpcklwd m0, m0 - punpckhdq m1, m0, m0 - punpckldq m0, m0 - TAIL_CALL m(iadst_4x4_internal).end -%elifidn %1_%2, identity_dct - mova m1, [coeffq+16*0] - mova m2, [coeffq+16*1] - punpcklwd m0, m1, m2 - punpckhwd m1, m2 - punpcklwd m0, m1 - punpcklqdq m0, m0 - pmulhrsw m1, m0, [o(pw_1697x8)] - paddsw m0, m1 - pmulhrsw m0, [o(pw_2896x8)] - mova m1, m0 - TAIL_CALL m(iadst_4x4_internal).end -%elif %3 >= 0 +%macro INV_TXFM_4X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x4, 6 +%ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklqdq m0, m0 -%ifidn %1, dct mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 -%elifidn %1, adst - pmulhrsw m0, 
[o(iadst4_dconly1a)] -%elifidn %1, flipadst - pmulhrsw m0, [o(iadst4_dconly1b)] -%endif mov [coeffq], eobd ;0 -%ifidn %2, dct -%ifnidn %1, dct - pmulhrsw m0, [o(pw_2896x8)] -%else pmulhrsw m0, m1 -%endif mova m1, m0 TAIL_CALL m(iadst_4x4_internal).end2 -%else ; adst / flipadst - pmulhrsw m1, m0, [o(iadst4_dconly2b)] - pmulhrsw m0, [o(iadst4_dconly2a)] - TAIL_CALL m(i%2_4x4_internal).end2 -%endif %endif %endmacro INIT_XMM ssse3 -INV_TXFM_4X4_FN dct, dct, 0 -INV_TXFM_4X4_FN dct, adst, 0 -INV_TXFM_4X4_FN dct, flipadst, 0 -INV_TXFM_4X4_FN dct, identity, 3 +INV_TXFM_4X4_FN dct, dct +INV_TXFM_4X4_FN dct, adst +INV_TXFM_4X4_FN dct, flipadst +INV_TXFM_4X4_FN dct, identity cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m0, [coeffq+16*0] ;high: in1 ;low: in0 @@ -358,9 +309,9 @@ cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX4_END 0, 1, 3, 2 -INV_TXFM_4X4_FN adst, dct, 0 -INV_TXFM_4X4_FN adst, adst, 0 -INV_TXFM_4X4_FN adst, flipadst, 0 +INV_TXFM_4X4_FN adst, dct +INV_TXFM_4X4_FN adst, adst +INV_TXFM_4X4_FN adst, flipadst INV_TXFM_4X4_FN adst, identity cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 @@ -410,9 +361,9 @@ ALIGN function_align packssdw m1, m2 ;high: out3 ;low: out3 ret -INV_TXFM_4X4_FN flipadst, dct, 0 -INV_TXFM_4X4_FN flipadst, adst, 0 -INV_TXFM_4X4_FN flipadst, flipadst, 0 +INV_TXFM_4X4_FN flipadst, dct +INV_TXFM_4X4_FN flipadst, adst +INV_TXFM_4X4_FN flipadst, flipadst INV_TXFM_4X4_FN flipadst, identity cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 @@ -436,7 +387,7 @@ cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 .end2: ITX4_END 3, 2, 1, 0 -INV_TXFM_4X4_FN identity, dct, 3 +INV_TXFM_4X4_FN identity, dct INV_TXFM_4X4_FN identity, adst INV_TXFM_4X4_FN identity, flipadst INV_TXFM_4X4_FN identity, identity @@ -595,39 +546,9 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff punpckhdq m3, m4 ;low: in6 high: in7 %endmacro -%macro INV_TXFM_4X8_FN 
2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 4x8, 8 -%if %3 >= 0 -%ifidn %1_%2, dct_identity - mova m1, [o(pw_2896x8)] - pmulhrsw m0, m1, [coeffq] - pmulhrsw m0, m1 - pmulhrsw m0, [o(pw_4096)] - punpckhwd m2, m0, m0 - punpcklwd m0, m0 - punpckhdq m1, m0, m0 - punpckldq m0, m0 - punpckhdq m3, m2, m2 - punpckldq m2, m2 - TAIL_CALL m(iadst_4x8_internal).end3 -%elifidn %1_%2, identity_dct - movd m0, [coeffq+16*0] - punpcklwd m0, [coeffq+16*1] - movd m1, [coeffq+16*2] - punpcklwd m1, [coeffq+16*3] - mova m2, [o(pw_2896x8)] - punpckldq m0, m1 - pmulhrsw m0, m2 - pmulhrsw m1, m0, [o(pw_1697x8)] - paddsw m0, m1 - pmulhrsw m0, m2 - pmulhrsw m0, [o(pw_2048)] - punpcklqdq m0, m0 - mova m1, m0 - mova m2, m0 - mova m3, m0 - TAIL_CALL m(iadst_4x8_internal).end3 -%elifidn %1_%2, dct_dct +%macro INV_TXFM_4X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x8, 8 +%ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklqdq m0, m0 mova m1, [o(pw_2896x8)] @@ -639,32 +560,14 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff mova m1, m0 mova m2, m0 mova m3, m0 - TAIL_CALL m(iadst_4x8_internal).end4 -%else ; adst_dct / flipadst_dct - pshuflw m0, [coeffq], q0000 - punpcklqdq m0, m0 - mova m1, [o(pw_2896x8)] - pmulhrsw m0, m1 -%ifidn %1, adst - pmulhrsw m0, [o(iadst4_dconly1a)] -%else ; flipadst - pmulhrsw m0, [o(iadst4_dconly1b)] -%endif - mov [coeffq], eobd - pmulhrsw m0, m1 - pmulhrsw m0, [o(pw_2048)] - mova m1, m0 - mova m2, m0 - mova m3, m0 - TAIL_CALL m(iadst_4x8_internal).end4 -%endif + TAIL_CALL m(iadst_4x8_internal).end3 %endif %endmacro -INV_TXFM_4X8_FN dct, dct, 0 -INV_TXFM_4X8_FN dct, identity, 7 +INV_TXFM_4X8_FN dct, dct INV_TXFM_4X8_FN dct, adst INV_TXFM_4X8_FN dct, flipadst +INV_TXFM_4X8_FN dct, identity cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] @@ -690,7 +593,7 @@ ALIGN function_align ret -INV_TXFM_4X8_FN adst, dct, 0 +INV_TXFM_4X8_FN adst, dct INV_TXFM_4X8_FN adst, adst INV_TXFM_4X8_FN adst, flipadst 
INV_TXFM_4X8_FN adst, identity @@ -725,15 +628,13 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 pmulhrsw m1, m4 pmulhrsw m2, m4 pmulhrsw m3, m4 - -.end3: pxor m5, m5 mova [coeffq+16*0], m5 mova [coeffq+16*1], m5 mova [coeffq+16*2], m5 mova [coeffq+16*3], m5 -.end4: +.end3: WRITE_4X8 0, 1, 2, 3 RET @@ -783,7 +684,7 @@ ALIGN function_align packssdw m2, m4 ;low: out4 high: -out5 ret -INV_TXFM_4X8_FN flipadst, dct, 0 +INV_TXFM_4X8_FN flipadst, dct INV_TXFM_4X8_FN flipadst, adst INV_TXFM_4X8_FN flipadst, flipadst INV_TXFM_4X8_FN flipadst, identity @@ -824,7 +725,7 @@ cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 psubw m4, m5 jmp m(iadst_4x8_internal).end -INV_TXFM_4X8_FN identity, dct, 3 +INV_TXFM_4X8_FN identity, dct INV_TXFM_4X8_FN identity, adst INV_TXFM_4X8_FN identity, flipadst INV_TXFM_4X8_FN identity, identity @@ -881,84 +782,28 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 WRITE_8X2 %3, %4, %5, %6, %7 %endmacro -%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 8x4, 8 -%if %3 >= 0 -%ifidn %1_%2, dct_identity - mova m0, [o(pw_2896x8)] - pmulhrsw m1, m0, [coeffq] - pmulhrsw m1, m0 - pmulhrsw m0, m1, [o(pw_1697x8)] - paddsw m1, m0 - pmulhrsw m1, [o(pw_2048)] - punpcklwd m1, m1 - punpckhdq m2, m1, m1 - punpckldq m1, m1 - punpckhdq m3, m2, m2 - punpckldq m2, m2 - punpckldq m0, m1, m1 - punpckhdq m1, m1 -%elifidn %1_%2, identity_dct - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*1] - mova m2, [coeffq+16*2] - mova m3, [coeffq+16*3] - punpckhwd m4, m0, m1 - punpcklwd m0, m1 - punpckhwd m5, m2, m3 - punpcklwd m2, m3 - punpcklwd m0, m4 - punpcklwd m2, m5 - punpcklqdq m0, m2 - mova m4, [o(pw_2896x8)] - pmulhrsw m0, m4 - paddsw m0, m0 - pmulhrsw m0, m4 - pmulhrsw m0, [o(pw_2048)] - mova m1, m0 - mova m2, m0 - mova m3, m0 -%else +%macro INV_TXFM_8X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x4, 8 +%ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklqdq m0, m0 
mova m1, [o(pw_2896x8)] pmulhrsw m0, m1 pmulhrsw m0, m1 -%ifidn %2, dct mova m2, [o(pw_2048)] pmulhrsw m0, m1 pmulhrsw m0, m2 mova m1, m0 mova m2, m0 mova m3, m0 -%else ; adst / flipadst - pmulhrsw m2, m0, [o(iadst4_dconly2b)] - pmulhrsw m0, [o(iadst4_dconly2a)] - mova m1, [o(pw_2048)] - pmulhrsw m0, m1 - pmulhrsw m2, m1 -%ifidn %2, adst - punpckhqdq m1, m0, m0 - punpcklqdq m0, m0 - punpckhqdq m3, m2, m2 - punpcklqdq m2, m2 -%else ; flipadst - mova m3, m0 - punpckhqdq m0, m2, m2 - punpcklqdq m1, m2, m2 - punpckhqdq m2, m3, m3 - punpcklqdq m3, m3 -%endif -%endif -%endif TAIL_CALL m(iadst_8x4_internal).end2 %endif %endmacro -INV_TXFM_8X4_FN dct, dct, 0 -INV_TXFM_8X4_FN dct, adst, 0 -INV_TXFM_8X4_FN dct, flipadst, 0 -INV_TXFM_8X4_FN dct, identity, 3 +INV_TXFM_8X4_FN dct, dct +INV_TXFM_8X4_FN dct, adst +INV_TXFM_8X4_FN dct, flipadst +INV_TXFM_8X4_FN dct, identity cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, [o(pw_2896x8)] @@ -1157,7 +1002,7 @@ cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova m3, m4 jmp m(iadst_8x4_internal).end -INV_TXFM_8X4_FN identity, dct, 7 +INV_TXFM_8X4_FN identity, dct INV_TXFM_8X4_FN identity, adst INV_TXFM_8X4_FN identity, flipadst INV_TXFM_8X4_FN identity, identity @@ -1199,30 +1044,9 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 paddsw m3, m7 jmp m(iadst_8x4_internal).end -%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 8x8, 8, 16*4 -%ifidn %1_%2, dct_identity - mova m0, [o(pw_2896x8)] - pmulhrsw m0, [coeffq] - mova m1, [o(pw_16384)] - pmulhrsw m0, m1 - psrlw m1, 2 - pmulhrsw m0, m1 - punpckhwd m7, m0, m0 - punpcklwd m0, m0 - pshufd m3, m0, q3333 - pshufd m2, m0, q2222 - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - call m(iadst_8x4_internal).end2 - pshufd m3, m7, q3333 - pshufd m2, m7, q2222 - pshufd m1, m7, q1111 - pshufd m0, m7, q0000 - lea dstq, [dstq+strideq*2] - TAIL_CALL m(iadst_8x4_internal).end3 -%elif %3 >= 0 -%ifidn 
%1, dct +%macro INV_TXFM_8X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x8, 8, 16*4 +%ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklwd m0, m0 mova m1, [o(pw_2896x8)] @@ -1244,24 +1068,6 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q .end3: RET -%else ; identity - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*1] - mova m2, [coeffq+16*2] - mova m3, [coeffq+16*3] - punpcklwd m0, [coeffq+16*4] - punpcklwd m1, [coeffq+16*5] - punpcklwd m2, [coeffq+16*6] - punpcklwd m3, [coeffq+16*7] - punpcklwd m0, m2 - punpcklwd m1, m3 - punpcklwd m0, m1 - pmulhrsw m0, [o(pw_2896x8)] - pmulhrsw m0, [o(pw_2048)] - pxor m4, m4 - REPX {mova [coeffq+16*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7 - jmp m(inv_txfm_add_dct_dct_8x8).end -%endif %endif %endmacro @@ -1298,10 +1104,10 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ITX_MULSUB_2W %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6 %endmacro -INV_TXFM_8X8_FN dct, dct, 0 -INV_TXFM_8X8_FN dct, identity, 7 +INV_TXFM_8X8_FN dct, dct INV_TXFM_8X8_FN dct, adst INV_TXFM_8X8_FN dct, flipadst +INV_TXFM_8X8_FN dct, identity cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq, 16 @@ -1610,7 +1416,7 @@ ALIGN function_align mova [rsp+gprsize+16*0], m7 jmp m(idct_8x8_internal).end3 -INV_TXFM_8X8_FN identity, dct, 7 +INV_TXFM_8X8_FN identity, dct INV_TXFM_8X8_FN identity, adst INV_TXFM_8X8_FN identity, flipadst INV_TXFM_8X8_FN identity, identity @@ -1634,58 +1440,9 @@ ALIGN function_align jmp m(idct_8x8_internal).end3 -%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 4x16, 8 -%if %3 >= 0 -%ifidn %1_%2, dct_identity - mova m0, [o(pw_2896x8)] - mova m1, m0 - pmulhrsw m0, [coeffq+16*0] - pmulhrsw m1, [coeffq+16*1] - mova m2, [o(pw_16384)] - mova m3, [o(pw_1697x16)] - mova m4, [o(pw_2048)] - pmulhrsw m0, m2 - pmulhrsw m1, m2 - pmulhrsw m2, m3, m0 - pmulhrsw m3, m1 - paddsw m0, m0 - paddsw m1, m1 - paddsw m0, m2 - paddsw m1, m3 - pmulhrsw 
m0, m4 - pmulhrsw m4, m1 - punpckhwd m2, m0, m0 - punpcklwd m0, m0 - punpckhwd m6, m4, m4 - punpcklwd m4, m4 - punpckhdq m1, m0, m0 - punpckldq m0, m0 - punpckhdq m3, m2, m2 - punpckldq m2, m2 - punpckhdq m5, m4, m4 - punpckldq m4, m4 - punpckhdq m7, m6, m6 - punpckldq m6, m6 - mova [coeffq+16*4], m4 - TAIL_CALL m(iadst_4x16_internal).end2 -%elifidn %1_%2, identity_dct - movd m0, [coeffq+32*0] - punpcklwd m0, [coeffq+32*1] - movd m1, [coeffq+32*2] - punpcklwd m1, [coeffq+32*3] - punpckldq m0, m1 - pmulhrsw m1, m0, [o(pw_1697x8)] - pcmpeqw m2, m2 - pcmpeqw m2, m0 - pxor m0, m2 - pavgw m0, m1 - pmulhrsw m0, [o(pw_2896x8)] - pmulhrsw m0, [o(pw_2048)] - punpcklqdq m0, m0 - pxor m1, m1 - REPX {mova [coeffq+32*x], m1}, 0, 1, 2, 3 -%elifidn %1_%2, dct_dct +%macro INV_TXFM_4X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 4x16, 8 +%ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklwd m0, m0 mova m1, [o(pw_2896x8)] @@ -1694,21 +1451,6 @@ ALIGN function_align pmulhrsw m0, [o(pw_16384)] pmulhrsw m0, m1 pmulhrsw m0, [o(pw_2048)] -%else ; adst_dct / flipadst_dct - pshuflw m0, [coeffq], q0000 - punpcklwd m0, m0 -%ifidn %1, adst - pmulhrsw m0, [o(iadst4_dconly1a)] -%else ; flipadst - pmulhrsw m0, [o(iadst4_dconly1b)] -%endif - mova m1, [o(pw_16384)] - mov [coeffq], eobd - pmulhrsw m0, m1 - psrlw m1, 3 ; pw_2048 - pmulhrsw m0, [o(pw_2896x8)] - pmulhrsw m0, m1 -%endif .end: WRITE_4X4 0, 0, 1, 2, 3, 0, 1, 2, 3 lea dstq, [dstq+strideq*4] @@ -1721,10 +1463,10 @@ ALIGN function_align %endif %endmacro -INV_TXFM_4X16_FN dct, dct, 0 -INV_TXFM_4X16_FN dct, identity, 15 +INV_TXFM_4X16_FN dct, dct INV_TXFM_4X16_FN dct, adst INV_TXFM_4X16_FN dct, flipadst +INV_TXFM_4X16_FN dct, identity cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(idct_4x8_internal).pass1)] @@ -1790,7 +1532,7 @@ cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 REPX {mova [r3+16*x], m7}, 0, 1, 2, 3, 4, 5, 6, 7 ret -INV_TXFM_4X16_FN adst, dct, 0 +INV_TXFM_4X16_FN adst, dct 
INV_TXFM_4X16_FN adst, adst INV_TXFM_4X16_FN adst, flipadst INV_TXFM_4X16_FN adst, identity @@ -1858,7 +1600,7 @@ cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 ret -INV_TXFM_4X16_FN flipadst, dct, 0 +INV_TXFM_4X16_FN flipadst, dct INV_TXFM_4X16_FN flipadst, adst INV_TXFM_4X16_FN flipadst, flipadst INV_TXFM_4X16_FN flipadst, identity @@ -1888,7 +1630,7 @@ cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iadst_4x16_internal).end1 -INV_TXFM_4X16_FN identity, dct, 3 +INV_TXFM_4X16_FN identity, dct INV_TXFM_4X16_FN identity, adst INV_TXFM_4X16_FN identity, flipadst INV_TXFM_4X16_FN identity, identity @@ -1964,68 +1706,11 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iadst_4x16_internal).end2 -%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 16x4, 8 -%if %3 >= 0 -%ifidn %1_%2, dct_identity - mova m3, [o(pw_2896x8)] - pmulhrsw m3, [coeffq] - mova m0, [o(pw_16384)] - pmulhrsw m3, m0 - psrlw m0, 3 ; pw_2048 - pmulhrsw m1, m3, [o(pw_1697x8)] - paddsw m3, m1 - pmulhrsw m3, m0 - punpcklwd m3, m3 - pshufd m0, m3, q0000 - pshufd m1, m3, q1111 - pshufd m2, m3, q2222 - pshufd m3, m3, q3333 - lea tx2q, [dstq+8] - call m(iadst_8x4_internal).end2 - add coeffq, 16*4 - mov dstq, tx2q - TAIL_CALL m(iadst_8x4_internal).end2 -%elifidn %1_%2, identity_dct - mova m4, [o(pw_1697x16)] - mova m5, [o(pw_16384)] - mova m6, [o(pw_2896x8)] - mov r3d, 2 - psrlw m7, m5, 3 ; pw_2048 -.main_loop: - mova m0, [coeffq+16*0] - mova m1, [coeffq+16*1] - punpckhwd m2, m0, m1 - punpcklwd m0, m1 - punpcklwd m0, m2 - mova m1, [coeffq+16*2] - mova m2, [coeffq+16*3] - punpckhwd m3, m1, m2 - punpcklwd m1, m2 - punpcklwd m1, m3 - punpcklqdq m0, m1 - pmulhrsw m1, m4, m0 - pmulhrsw m1, m5 - paddsw m0, m1 - pmulhrsw m0, m6 - pmulhrsw m0, m7 -.end: - pxor m3, m3 - mova [coeffq+16*0], m3 - mova [coeffq+16*1], m3 - mova [coeffq+16*2], m3 - mova [coeffq+16*3], m3 - add coeffq, 16*4 - lea tx2q, [dstq+8] - 
WRITE_8X4 0, 0, 0, 0, 1, 2, 3 - mov dstq, tx2q - dec r3d - jg .main_loop - RET -%else +%macro INV_TXFM_16X4_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x4, 8 +%ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] -%ifidn %2, dct movd m2, [o(pw_16384)] mov [coeffq], eobd mov r2d, 2 @@ -2059,35 +1744,6 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp tx2q .end: RET -%else ; adst / flipadst - movd m2, [o(pw_16384)] - pmulhrsw m0, m2 - pshuflw m0, m0, q0000 - punpcklwd m0, m0 - mov [coeffq], eobd - pmulhrsw m2, m0, [o(iadst4_dconly2b)] - pmulhrsw m0, [o(iadst4_dconly2a)] - mova m1, [o(pw_2048)] - pmulhrsw m0, m1 - pmulhrsw m2, m1 -%ifidn %2, adst - punpckhqdq m1, m0, m0 - punpcklqdq m0, m0 - punpckhqdq m3, m2, m2 - punpcklqdq m2, m2 -%else ; flipadst - mova m3, m0 - punpckhqdq m0, m2, m2 - punpcklqdq m1, m2, m2 - punpckhqdq m2, m3, m3 - punpcklqdq m3, m3 -%endif - lea tx2q, [dstq+8] - call m(iadst_8x4_internal).end3 - mov dstq, tx2q - TAIL_CALL m(iadst_8x4_internal).end3 -%endif -%endif %endif %endmacro @@ -2144,10 +1800,10 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 punpcklqdq m%1, m%6 ;low: t8a high: t9 %endmacro -INV_TXFM_16X4_FN dct, dct, 0 -INV_TXFM_16X4_FN dct, adst, 0 -INV_TXFM_16X4_FN dct, flipadst, 0 -INV_TXFM_16X4_FN dct, identity, 3 +INV_TXFM_16X4_FN dct, dct +INV_TXFM_16X4_FN dct, adst +INV_TXFM_16X4_FN dct, flipadst +INV_TXFM_16X4_FN dct, identity cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_7ROWS coeffq, 16 @@ -2464,7 +2120,7 @@ cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(idct_16x4_internal).pass2_end -INV_TXFM_16X4_FN identity, dct, 15 +INV_TXFM_16X4_FN identity, dct INV_TXFM_16X4_FN identity, adst INV_TXFM_16X4_FN identity, flipadst INV_TXFM_16X4_FN identity, identity @@ -2537,8 +2193,8 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 mova [%1+%2*7], m7 %endmacro -%macro INV_TXFM_8X16_FN 2-3 -1 
; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 8x16, 8, 16*16 +%macro INV_TXFM_8X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 8x16, 8, 16*16 %ifidn %1_%2, dct_dct pshuflw m0, [coeffq], q0000 punpcklwd m0, m0 @@ -2556,78 +2212,13 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(inv_txfm_add_dct_dct_8x8).loop .end: RET -%elifidn %1_%2, dct_identity - mov r3d, 2 -.loop: - mova m0, [o(pw_2896x8)] - pmulhrsw m7, m0, [coeffq] - mova m1, [o(pw_16384)] - pxor m2, m2 - mova [coeffq], m2 - pmulhrsw m7, m0 - pmulhrsw m7, m1 - psrlw m1, 3 ; pw_2048 - pmulhrsw m0, m7, [o(pw_1697x16)] - paddsw m7, m7 - paddsw m7, m0 - pmulhrsw m7, m1 - punpcklwd m0, m7, m7 - punpckhwd m7, m7 - pshufd m3, m0, q3333 - pshufd m2, m0, q2222 - pshufd m1, m0, q1111 - pshufd m0, m0, q0000 - call m(iadst_8x4_internal).end3 - pshufd m3, m7, q3333 - pshufd m2, m7, q2222 - pshufd m1, m7, q1111 - pshufd m0, m7, q0000 - lea dstq, [dstq+strideq*2] - call m(iadst_8x4_internal).end3 - - add coeffq, 16 - lea dstq, [dstq+strideq*2] - dec r3d - jg .loop - RET -%elifidn %1_%2, identity_dct - movd m0, [coeffq+32*0] - punpcklwd m0, [coeffq+32*1] - movd m2, [coeffq+32*2] - punpcklwd m2, [coeffq+32*3] - add coeffq, 32*4 - movd m1, [coeffq+32*0] - punpcklwd m1, [coeffq+32*1] - movd m3, [coeffq+32*2] - punpcklwd m3, [coeffq+32*3] - mova m4, [o(pw_2896x8)] - xor eobd, eobd - mov [coeffq-32*4], eobd - mov [coeffq-32*3], eobd - mov [coeffq-32*2], eobd - mov [coeffq-32*1], eobd - punpckldq m0, m2 - punpckldq m1, m3 - punpcklqdq m0, m1 - pmulhrsw m0, m4 - pmulhrsw m0, m4 - pmulhrsw m0, [o(pw_2048)] - mov [coeffq+32*0], eobd - mov [coeffq+32*1], eobd - mov [coeffq+32*2], eobd - mov [coeffq+32*3], eobd - mov r3d, 4 - lea tx2q, [o(m(inv_txfm_add_identity_dct_8x16).end)] - jmp m(inv_txfm_add_dct_dct_8x8).loop -.end: - RET %endif %endmacro -INV_TXFM_8X16_FN dct, dct, 0 -INV_TXFM_8X16_FN dct, identity, 15 +INV_TXFM_8X16_FN dct, dct INV_TXFM_8X16_FN dct, adst INV_TXFM_8X16_FN dct, flipadst 
+INV_TXFM_8X16_FN dct, identity cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 lea r3, [o(m(idct_8x8_internal).pass1)] @@ -2790,7 +2381,7 @@ cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iflipadst_8x8_internal).end -INV_TXFM_8X16_FN identity, dct, 7 +INV_TXFM_8X16_FN identity, dct INV_TXFM_8X16_FN identity, adst INV_TXFM_8X16_FN identity, flipadst INV_TXFM_8X16_FN identity, identity @@ -2837,8 +2428,8 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp .end -%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 16x8, 8, 16*16 +%macro INV_TXFM_16X8_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x8, 8, 16*16 %ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] @@ -2850,83 +2441,13 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(inv_txfm_add_dct_dct_16x4).dconly .end: RET -%elifidn %1_%2, dct_identity - mova m7, [coeffq] - mova m0, [o(pw_2896x8)] - mova m1, [o(pw_16384)] - pxor m2, m2 - mova [coeffq], m2 - pmulhrsw m7, m0 - pmulhrsw m7, m0 - pmulhrsw m7, m1 - psrlw m1, 2 ; pw_4096 - pmulhrsw m7, m1 - punpcklwd m3, m7, m7 - punpckhwd m7, m7 - pshufd m0, m3, q0000 - pshufd m1, m3, q1111 - pshufd m2, m3, q2222 - pshufd m3, m3, q3333 - lea r3, [dstq+strideq*4] - lea tx2q, [dstq+8] - call m(iadst_8x4_internal).end2 - add coeffq, 16*4 - mov dstq, tx2q - call m(iadst_8x4_internal).end2 - mov dstq, r3 - add coeffq, 16*4 - pshufd m0, m7, q0000 - pshufd m1, m7, q1111 - pshufd m2, m7, q2222 - pshufd m3, m7, q3333 - lea tx2q, [dstq+8] - call m(iadst_8x4_internal).end2 - add coeffq, 16*4 - mov dstq, tx2q - TAIL_CALL m(iadst_8x4_internal).end2 -%elifidn %1_%2, identity_dct - mova m4, [o(pw_2896x8)] - mova m5, [o(pw_1697x16)] - mova m6, [o(pw_16384)] - psrlw m7, m6, 3 ; pw_2048 - mov r3d, 2 -.main_loop: - mova m0, [coeffq+16*0] - punpcklwd m0, [coeffq+16*1] - mova m1, [coeffq+16*2] - punpcklwd m1, [coeffq+16*3] - punpckldq m0, m1 
- mova m1, [coeffq+16*4] - punpcklwd m1, [coeffq+16*5] - mova m2, [coeffq+16*6] - punpcklwd m2, [coeffq+16*7] - punpckldq m1, m2 - punpcklqdq m0, m1 - pmulhrsw m0, m4 - pmulhrsw m1, m5, m0 - pmulhrsw m1, m6 - paddsw m0, m1 - pmulhrsw m0, m4 - pmulhrsw m0, m7 -.end: - pxor m1, m1 - REPX {mova [coeffq+16*x], m1}, 0, 1, 2, 3, 4, 5, 6, 7 - add coeffq, 16*8 - lea tx2q, [dstq+8] - WRITE_8X4 0, 0, 0, 0, 1, 2, 3 - lea dstq, [dstq+strideq*2] - WRITE_8X4 0, 0, 0, 0, 1, 2, 3 - mov dstq, tx2q - dec r3d - jg .main_loop - RET %endif %endmacro -INV_TXFM_16X8_FN dct, dct, 0 -INV_TXFM_16X8_FN dct, identity, 7 +INV_TXFM_16X8_FN dct, dct INV_TXFM_16X8_FN dct, adst INV_TXFM_16X8_FN dct, flipadst +INV_TXFM_16X8_FN dct, identity cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*0, 32, 1 @@ -3382,7 +2903,7 @@ cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iflipadst_8x8_internal).pass2_main -INV_TXFM_16X8_FN identity, dct, 15 +INV_TXFM_16X8_FN identity, dct INV_TXFM_16X8_FN identity, adst INV_TXFM_16X8_FN identity, flipadst INV_TXFM_16X8_FN identity, identity @@ -3463,8 +2984,8 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(iidentity_8x8_internal).end -%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh - INV_TXFM_FN %1, %2, %3, 16x16, 8, 16*16 +%macro INV_TXFM_16X16_FN 2 ; type1, type2 + INV_TXFM_FN %1, %2, 16x16, 8, 16*16 %ifidn %1_%2, dct_dct movd m1, [o(pw_2896x8)] pmulhrsw m0, m1, [coeffq] @@ -3475,104 +2996,13 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 jmp m(inv_txfm_add_dct_dct_16x4).dconly .end: RET -%elifidn %1_%2, dct_identity - mova m3, [o(pw_2896x8)] - pmulhrsw m2, m3, [coeffq+16*0] - pmulhrsw m3, [coeffq+16*1] - mova m0, [o(pw_8192)] - mova m1, [o(pw_1697x16)] - pshuflw m4, [o(deint_shuf)], q0000 ;pb_0_1 - punpcklwd m4, m4 - pcmpeqb m5, m5 - pxor m6, m6 - mova [coeffq+16*0], m6 - mova [coeffq+16*1], m6 - paddb m5, m5 ;pb_m2 - pmulhrsw m2, 
m0 - pmulhrsw m3, m0 - psrlw m0, 2 ;pw_2048 - pmulhrsw m7, m1, m2 - pmulhrsw m1, m3 - paddsw m2, m2 - paddsw m3, m3 - paddsw m2, m7 - paddsw m3, m1 - pmulhrsw m2, m0 - pmulhrsw m3, m0 - mov r3d, 8 -.loop: - mova m1, [dstq] - pshufb m0, m2, m4 - punpckhbw m7, m1, m6 - punpcklbw m1, m6 - paddw m7, m0 - paddw m1, m0 - packuswb m1, m7 - mova [dstq], m1 - mova m1, [dstq+strideq*8] - pshufb m0, m3, m4 - psubb m4, m5 ; += 2 - punpckhbw m7, m1, m6 - punpcklbw m1, m6 - paddw m7, m0 - paddw m1, m0 - packuswb m1, m7 - mova [dstq+strideq*8], m1 - add dstq, strideq - dec r3d - jg .loop - RET -%elifidn %1_%2, identity_dct - mova m4, [o(pw_1697x16)] - mova m5, [o(pw_2896x8)] - mova m6, [o(pw_2048)] - xor eobd, eobd - lea tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end)] - lea r3, [dstq+8] - mov [rsp+16*0], r3 -.main: - movd m0, [coeffq+32*0] - punpcklwd m0, [coeffq+32*1] - movd m1, [coeffq+32*2] - punpcklwd m1, [coeffq+32*3] - add coeffq, 32*4 - punpckldq m0, m1 - movd m1, [coeffq+32*0] - punpcklwd m1, [coeffq+32*1] - movd m2, [coeffq+32*2] - punpcklwd m2, [coeffq+32*3] - xor eobd, eobd - mov [coeffq-32*4], eobd - mov [coeffq-32*3], eobd - mov [coeffq-32*2], eobd - mov [coeffq-32*1], eobd - punpckldq m1, m2 - punpcklqdq m0, m1 - pmulhrsw m1, m4, m0 - psraw m1, 1 - pavgw m0, m1 - pmulhrsw m0, m5 - pmulhrsw m0, m6 - mov [coeffq+32*0], eobd - mov [coeffq+32*1], eobd - mov [coeffq+32*2], eobd - mov [coeffq+32*3], eobd - mov r3d, 4 - jmp m(inv_txfm_add_dct_dct_8x8).loop -.end: - lea tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end1)] - add coeffq, 32*4 - mov dstq, [rsp+16*0] - jmp .main -.end1: - RET %endif %endmacro -INV_TXFM_16X16_FN dct, dct, 0 -INV_TXFM_16X16_FN dct, identity, 15 +INV_TXFM_16X16_FN dct, dct INV_TXFM_16X16_FN dct, adst INV_TXFM_16X16_FN dct, flipadst +INV_TXFM_16X16_FN dct, identity cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 LOAD_8ROWS coeffq+16*1, 64 @@ -3865,7 +3295,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, 
tx2 pavgw m%1, m%2 %endmacro -INV_TXFM_16X16_FN identity, dct, 15 +INV_TXFM_16X16_FN identity, dct INV_TXFM_16X16_FN identity, identity cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2 diff --git a/ffmpeg/JNI/dav1d/src/x86/looprestoration.asm b/ffmpeg/JNI/dav1d/src/x86/looprestoration.asm index e2acae7b2..3e3c35c34 100644 --- a/ffmpeg/JNI/dav1d/src/x86/looprestoration.asm +++ b/ffmpeg/JNI/dav1d/src/x86/looprestoration.asm @@ -51,9 +51,12 @@ cextern sgr_x_by_x SECTION .text INIT_YMM avx2 -cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge +cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge + mov edged, edgem vpbroadcastb m15, [fhq+0] + movifnidn wd, wm vpbroadcastb m14, [fhq+2] + mov hd, hm vpbroadcastb m13, [fhq+4] vpbroadcastw m12, [fhq+6] vpbroadcastd m11, [pw_2048] @@ -64,7 +67,7 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge ; if (edge & has_right) align_w_to_32 ; else w -= 32, and use that as limit in x loop - test edged, 2 ; has_right + test edgeb, 2 ; has_right jnz .align mov xlimq, -3 jmp .loop @@ -80,7 +83,7 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge lea xq, [wq+xlimq] ; load left edge pixels - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .emu_left test leftq, leftq ; left == NULL for the edge-extended bottom/top jz .load_left_combined @@ -169,14 +172,21 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge paddw m2, m4 paddw m0, m6 paddw m2, m5 - paddsw m0, m8 + ; for a signed overflow to happen we need filter and pixels as follow: + ; filter => -5,-23,-17,90,-17,-23,-5 + ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0 + ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6] + ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84] + ; 32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A] + ; => signed 16-bit overflow occurs + paddsw m0, m8 ; paddsw clips this range to 
[-8000;+7FFF] paddsw m2, m3 - psraw m0, 3 + psraw m0, 3 ; shift changes the range to [-1000;+FFF] psraw m2, 3 - paddw m0, m11 - paddw m2, m11 - mova [dstptrq], xm0 - mova [dstptrq+16], xm2 + paddw m0, m11 ; adding back 800 (removed in m8) changes the + paddw m2, m11 ; range to [-800;+17FF] as defined in the spec + mova [dstptrq], xm0 ; (note that adding another 800 would give us + mova [dstptrq+16], xm2; the same range as in the C code => [0;1FFF]) vextracti128 [dstptrq+32], m0, 1 vextracti128 [dstptrq+48], m2, 1 vextracti128 xm0, m1, 1 @@ -196,17 +206,19 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge jg .loop RET -cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge - vpbroadcastd m14, [fvq+4] - vpbroadcastd m15, [fvq] - vpbroadcastd m13, [pw_0_128] - paddw m14, m13 +cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge + movifnidn fvq, fvmp + mov edged, edgem + movifnidn hd, hm + vpbroadcastd m10, [fvq] + vpbroadcastd m11, [fvq+4] + vpbroadcastd m0, [pw_0_128] vpbroadcastd m12, [pd_1024] DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr - mov ylimd, edged - and ylimd, 8 ; have_bottom - shr ylimd, 2 + rorx ylimd, edged, 2 + paddw m11, m0 + and ylimd, 2 ; have_bottom sub ylimd, 3 ; main x loop for vertical filter, does one column of 16 pixels @@ -214,7 +226,7 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge mova m3, [midq] ; middle line ; load top pixels - test edged, 4 ; have_top + test edgeb, 4 ; have_top jz .emu_top mova m0, [midq-384*4] mova m2, [midq-384*2] @@ -269,27 +281,28 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge ; try to structure the loop so that the common case is evaluated fastest mova m6, [mptrq+384*6] .loop: - paddw m7, m0, m6 - paddw m8, m1, m5 - paddw m9, m2, m4 - punpcklwd m10, m7, m8 - punpckhwd m7, m8 - punpcklwd m11, m9, m3 - punpckhwd m9, m3 - pmaddwd m10, m15 - pmaddwd m7, m15 - pmaddwd m11, m14 - pmaddwd m9, m14 - 
paddd m10, m11 + paddw m0, m6 + paddw m7, m1, m5 + paddw m8, m2, m4 + punpcklwd m9, m0, m7 + punpckhwd m0, m7 + punpcklwd m7, m8, m3 + punpckhwd m8, m3 + pmaddwd m9, m10 + pmaddwd m0, m10 + pmaddwd m7, m11 + pmaddwd m8, m11 + add mptrq, 384*2 paddd m7, m9 - paddd m10, m12 + paddd m0, m8 paddd m7, m12 - psrad m10, 11 + paddd m0, m12 psrad m7, 11 - packssdw m10, m7 - packuswb m10, m10 - vpermq m10, m10, q3120 - mova [dstptrq], xm10 + psrad m0, 11 + packssdw m7, m0 + vextracti128 xm0, m7, 1 + packuswb xm7, xm0 + mova [dstptrq], xm7 ; shift pixels one position mova m0, m1 mova m1, m2 @@ -298,51 +311,51 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge mova m4, m5 mova m5, m6 add dstptrq, strideq - add mptrq, 384*2 dec yd jg .loop_load ; for the bottom pixels, continue using m6 (as extended edge) cmp yd, ylimd jg .loop - - add dstq, 16 add midq, 32 + add dstq, 16 sub wd, 16 jg .loop_x RET INIT_YMM avx2 -cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim - mov xlimd, edged +cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov xlimd, edgem + movifnidn wd, wm + mov hd, hm + mov edged, xlimd and xlimd, 2 ; have_right - add wd, xlimd - xor xlimd, 2 ; 2*!have_right - jnz .no_right - add wd, 15 + jz .no_right + add wd, 2+15 and wd, ~15 .no_right: + lea r10, [pb_right_ext_mask+32] + xor xlimd, 2 ; 2*!have_right pxor m1, m1 - lea srcq, [srcq+wq] + add srcq, wq lea sumq, [sumq+wq*2-2] lea sumsqq, [sumsqq+wq*4-4] neg wq - lea r10, [pb_right_ext_mask+32] .loop_y: mov xq, wq ; load left - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .no_left test leftq, leftq jz .load_left_from_main - pinsrw xm0, [leftq+2], 7 + vpbroadcastw xm0, [leftq+2] add leftq, 4 jmp .expand_x .no_left: vpbroadcastb xm0, [srcq+xq] jmp .expand_x .load_left_from_main: - pinsrw xm0, [srcq+xq-2], 7 + vpbroadcastw xm0, [srcq+xq-2] .expand_x: punpckhbw xm0, xm1 @@ -352,8 +365,8 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, 
sum, left, src, stride, w, h, edge, x, xlim .partial_load_and_extend: vpbroadcastb m3, [srcq-1] pmovzxbw m2, [srcq+xq] - punpcklbw m3, m1 movu m4, [r10+xq*2] + punpcklbw m3, m1 pand m2, m4 pandn m4, m3 por m2, m4 @@ -373,22 +386,21 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim punpcklwd m5, m3, m2 punpckhwd m6, m3, m2 paddw m3, m4 - punpcklwd m7, m4, m1 + punpcklwd m0, m4, m1 punpckhwd m4, m1 pmaddwd m5, m5 pmaddwd m6, m6 - pmaddwd m7, m7 + pmaddwd m0, m0 pmaddwd m4, m4 - paddd m5, m7 - paddd m6, m4 paddw m3, m2 + paddd m5, m0 + vextracti128 xm0, m2, 1 + paddd m6, m4 movu [sumq+xq*2], m3 - movu [sumsqq+xq*4+ 0], xm5 - movu [sumsqq+xq*4+16], xm6 + movu [sumsqq+xq*4+ 0], xm5 + movu [sumsqq+xq*4+16], xm6 vextracti128 [sumsqq+xq*4+32], m5, 1 vextracti128 [sumsqq+xq*4+48], m6, 1 - - vextracti128 xm0, m2, 1 add xq, 16 ; if x <= -16 we can reload more pixels @@ -411,25 +423,25 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim RET INIT_YMM avx2 -cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim +cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem mov xq, -2 - mov ylimd, edged - and ylimd, 8 ; have_bottom - shr ylimd, 2 + rorx ylimd, edged, 2 + and ylimd, 2 ; have_bottom sub ylimd, 2 ; -2 if have_bottom=0, else 0 .loop_x: lea yd, [hq+ylimq+2] lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test edged, 4 ; have_top + test edgeb, 4 ; have_top jnz .load_top movu m0, [sumsq_ptrq+(384+16)*4*1] movu m1, [sumsq_ptrq+(384+16)*4*1+32] + movu m6, [sum_ptrq+(384+16)*2*1] mova m2, m0 mova m3, m1 mova m4, m0 mova m5, m1 - movu m6, [sum_ptrq+(384+16)*2*1] mova m7, m6 mova m8, m6 jmp .loop_y_noload @@ -543,8 +555,10 @@ cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s RET INIT_YMM avx2 -cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \ +cglobal sgr_finish_filter1, 5, 13, 16, 
t, src, stride, a, b, w, h, \ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y + movifnidn wd, wm + mov hd, hm vpbroadcastd m15, [pw_16] xor xd, xd .loop_x: @@ -647,75 +661,83 @@ cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \ RET INIT_YMM avx2 -cglobal sgr_weighted1, 6, 6, 7, dst, stride, t, w, h, wt - movd xm0, wtd - vpbroadcastw m0, xm0 - psllw m0, 4 +cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt +%ifidn wtd, wtm + shl wtd, 4 + movd xm5, wtd + vpbroadcastw m5, xm5 +%else + vpbroadcastw m5, wtm + mov hd, hm + psllw m5, 4 +%endif DEFINE_ARGS dst, stride, t, w, h, idx .loop_y: xor idxd, idxd .loop_x: - mova m1, [tq+idxq*2+ 0] - mova m4, [tq+idxq*2+32] + mova m0, [tq+idxq*2+ 0] + mova m1, [tq+idxq*2+32] pmovzxbw m2, [dstq+idxq+ 0] - pmovzxbw m5, [dstq+idxq+16] - psllw m3, m2, 4 - psllw m6, m5, 4 - psubw m1, m3 - psubw m4, m6 - pmulhrsw m1, m0 - pmulhrsw m4, m0 - paddw m1, m2 - paddw m4, m5 - packuswb m1, m4 - vpermq m1, m1, q3120 - mova [dstq+idxq], m1 + pmovzxbw m3, [dstq+idxq+16] + psllw m4, m2, 4 + psubw m0, m4 + psllw m4, m3, 4 + psubw m1, m4 + pmulhrsw m0, m5 + pmulhrsw m1, m5 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, m0, q3120 + mova [dstq+idxq], m0 add idxd, 32 cmp idxd, wd jl .loop_x + add tq, 384*2 add dstq, strideq - add tq, 384 * 2 dec hd jg .loop_y RET INIT_YMM avx2 -cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim - test edged, 2 ; have_right +cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim + mov edged, edgem + movifnidn wd, wm + mov hd, hm + test edgeb, 2 ; have_right jz .no_right xor xlimd, xlimd - add wd, 2 - add wd, 15 + add wd, 2+15 and wd, ~15 jmp .right_done .no_right: mov xlimd, 3 sub wd, 1 .right_done: + lea r10, [pb_right_ext_mask+32] pxor m1, m1 lea srcq, [srcq+wq+1] lea sumq, [sumq+wq*2-2] lea sumsqq, [sumsqq+wq*4-4] neg wq - lea r10, [pb_right_ext_mask+32] .loop_y: mov xq, wq ; load left - test edged, 1 ; have_left + test edgeb, 1 ; 
have_left jz .no_left test leftq, leftq jz .load_left_from_main - movd xm0, [leftq] - pinsrd xm0, [srcq+xq-1], 1 - pslldq xm0, 11 + vpbroadcastd xm2, [leftq] + movd xm0, [srcq+xq-1] add leftq, 4 + palignr xm0, xm2, 1 jmp .expand_x .no_left: vpbroadcastb xm0, [srcq+xq-1] jmp .expand_x .load_left_from_main: - pinsrd xm0, [srcq+xq-4], 3 + vpbroadcastd xm0, [srcq+xq-4] .expand_x: punpckhbw xm0, xm1 @@ -727,8 +749,8 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli .partial_load_and_extend: vpbroadcastb m3, [srcq-1] pmovzxbw m2, [srcq+xq] - punpcklbw m3, m1 movu m4, [r10+xq*2] + punpcklbw m3, m1 pand m2, m4 pandn m4, m3 por m2, m4 @@ -768,8 +790,8 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli paddd m7, m9 paddd m3, m5 movu [sumq+xq*2], m0 - movu [sumsqq+xq*4+ 0], xm7 - movu [sumsqq+xq*4+16], xm3 + movu [sumsqq+xq*4+ 0], xm7 + movu [sumsqq+xq*4+16], xm3 vextracti128 [sumsqq+xq*4+32], m7, 1 vextracti128 [sumsqq+xq*4+48], m3, 1 @@ -788,35 +810,35 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli cmp xd, xlimd jl .right_extend + add srcq, strideq add sumsqq, (384+16)*4 add sumq, (384+16)*2 - add srcq, strideq dec hd jg .loop_y RET INIT_YMM avx2 -cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim +cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim + movifnidn edged, edgem mov xq, -2 - mov ylimd, edged - and ylimd, 8 ; have_bottom - shr ylimd, 2 + rorx ylimd, edged, 2 + and ylimd, 2 ; have_bottom sub ylimd, 3 ; -3 if have_bottom=0, else -1 .loop_x: lea yd, [hq+ylimq+2] lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test edged, 4 ; have_top + test edgeb, 4 ; have_top jnz .load_top movu m0, [sumsq_ptrq+(384+16)*4*1] movu m1, [sumsq_ptrq+(384+16)*4*1+32] + movu m10, [sum_ptrq+(384+16)*2*1] mova m2, m0 mova m3, m1 mova m4, m0 mova m5, m1 mova m6, m0 mova m7, m1 - movu 
m10, [sum_ptrq+(384+16)*2*1] mova m11, m10 mova m12, m10 mova m13, m10 @@ -826,10 +848,10 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, movu m1, [sumsq_ptrq-(384+16)*4*1+32] ; l3/4sq [right] movu m4, [sumsq_ptrq-(384+16)*4*0] ; l2sq [left] movu m5, [sumsq_ptrq-(384+16)*4*0+32] ; l2sq [right] - mova m2, m0 - mova m3, m1 movu m10, [sum_ptrq-(384+16)*2*1] ; l3/4 movu m12, [sum_ptrq-(384+16)*2*0] ; l2 + mova m2, m0 + mova m3, m1 mova m11, m10 .loop_y: movu m6, [sumsq_ptrq+(384+16)*4*1] ; l1sq [left] @@ -960,8 +982,10 @@ cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s RET INIT_YMM avx2 -cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \ +cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \ tmp_ptr, src_ptr, a_ptr, b_ptr, x, y + movifnidn wd, wm + mov hd, hm vpbroadcastd m9, [pw_5_6] vpbroadcastd m12, [pw_256] psrlw m11, m12, 1 ; pw_128 @@ -1077,8 +1101,10 @@ cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \ RET INIT_YMM avx2 -cglobal sgr_weighted2, 7, 7, 11, dst, stride, t1, t2, w, h, wt - vpbroadcastd m0, [wtq] +cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt + movifnidn wd, wm + movifnidn hd, hm + vpbroadcastd m0, wtm vpbroadcastd m10, [pd_1024] DEFINE_ARGS dst, stride, t1, t2, w, h, idx .loop_y: diff --git a/ffmpeg/JNI/dav1d/src/x86/looprestoration_init_tmpl.c b/ffmpeg/JNI/dav1d/src/x86/looprestoration_init_tmpl.c index a1b25a90c..b0201ce3d 100644 --- a/ffmpeg/JNI/dav1d/src/x86/looprestoration_init_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/x86/looprestoration_init_tmpl.c @@ -169,7 +169,7 @@ void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \ void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \ const coef *t1, const coef *t2, \ const int w, const int h, \ - const int16_t wt[2]); \ + const uint32_t wt); \ \ static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ const pixel (*const left)[4], \ @@ -194,7 +194,7 @@ static 
void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \ w, h, dav1d_sgr_params[sgr_idx][2], edges); \ dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \ w, h, dav1d_sgr_params[sgr_idx][3], edges); \ - const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] }; \ + const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \ dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \ } \ } diff --git a/ffmpeg/JNI/dav1d/src/x86/looprestoration_ssse3.asm b/ffmpeg/JNI/dav1d/src/x86/looprestoration_ssse3.asm index df72d26ff..aaaea7835 100644 --- a/ffmpeg/JNI/dav1d/src/x86/looprestoration_ssse3.asm +++ b/ffmpeg/JNI/dav1d/src/x86/looprestoration_ssse3.asm @@ -188,13 +188,13 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge %define srcptrq srcq %define dstptrq dstq %define hd dword [esp+ 0] - %define edged dword [esp+12] + %define edgeb byte [esp+12] %define xlimd dword [esp+16] %endif ; if (edge & has_right) align_w_to_16 ; else w -= 3, and use that as limit in x loop - test edged, 2 ; has_right + test edgeb, 2 ; has_right jnz .align mov xlimd, -3 jmp .loop @@ -221,7 +221,7 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge %endif ; load left edge pixels - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .emu_left test leftq, leftq ; left == NULL for the edge-extended bottom/top jz .load_left_combined @@ -359,8 +359,8 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge paddw m2, m4 paddw m0, m3 paddw m2, m5 - paddsw m0, m8 - paddsw m2, m6 + paddsw m0, m8 ; see the avx2 for an explanation + paddsw m2, m6 ; of how the clipping works here psraw m0, 3 psraw m2, 3 paddw m0, m11 @@ -477,7 +477,7 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge DEFINE_ARGS dst, stride, mid, w, h, y, edge %define mptrq midq %define dstptrq dstq - %define edged dword [esp] + %define edgeb byte [esp] 
%endif ; main x loop for vertical filter, does one column of 16 pixels @@ -485,7 +485,7 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge mova m3, [midq] ; middle line ; load top pixels - test edged, 4 ; have_top + test edgeb, 4 ; have_top jz .emu_top mova m0, [midq-384*4] mova m2, [midq-384*2] @@ -604,8 +604,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge mova m3, m4 mova m4, m5 mova m5, m6 - add dstptrq, strideq add mptrq, 384*2 + add dstptrq, strideq dec yd jg .loop_load ; for the bottom pixels, continue using m6 (as extended edge) @@ -616,8 +616,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge mov midq, [esp+8] mov dstq, [esp+4] %endif - add dstq, 8 add midq, 16 + add dstq, 8 sub wd, 8 jg .loop_x RET @@ -679,7 +679,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim %define wq r0m %define xlimd r1m %define hd hmp - %define edged edgemp + %define edgeb byte edgem mov r6, edgem and r6, 2 ; have_right @@ -706,7 +706,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim mov xq, wq ; load left - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .no_left test leftq, leftq jz .load_left_from_main @@ -795,11 +795,13 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim movifnidn edged, edgem %else -cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y +cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y %define sumsq_baseq dword [esp+0] %define sum_baseq dword [esp+4] %define ylimd dword [esp+8] %define m8 [esp+12] + mov edged, r4m + mov hd, r3m %endif mov xq, -2 %if ARCH_X86_64 @@ -812,7 +814,7 @@ cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y .loop_x: mov sumsqq, sumsq_baseq mov sumq, sum_baseq - lea yd, [hd+ylimd+2] + lea yd, [hq+ylimq+2] %else mov yd, edged and yd, 
8 ; have_bottom @@ -824,12 +826,12 @@ cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y .loop_x: mov sumsqd, sumsq_baseq mov sumd, sum_baseq - lea yd, [hd+2] + lea yd, [hq+2] add yd, ylimd %endif lea sumsqq, [sumsqq+xq*4+4-(384+16)*4] lea sumq, [sumq+xq*2+2-(384+16)*2] - test edged, 4 ; have_top + test edgeb, 4 ; have_top jnz .load_top movu m0, [sumsqq+(384+16)*4*1] movu m1, [sumsqq+(384+16)*4*1+16] @@ -1180,10 +1182,10 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y psubd m3, [aq-(384+16)*4*2+16] ; a:ctr+bottom [second half] %endif + add srcq, strideq add aq, (384+16)*4 add bq, (384+16)*2 add tq, 384*2 - add srcq, strideq dec yd jg .loop_y add xd, 8 @@ -1237,7 +1239,7 @@ cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xli mova m11, [pb_0_1] %else cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge - %define edged edgemp + %define edgeb byte edgem %define wd xd %define wq wd %define wm r5m @@ -1249,7 +1251,7 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge %define m11 [PIC_sym(pb_0_1)] %endif - test edged, 2 ; have_right + test edgeb, 2 ; have_right jz .no_right xor xlimd, xlimd add wd, 2 @@ -1275,7 +1277,7 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge .loop_y: mov xq, wq ; load left - test edged, 1 ; have_left + test edgeb, 1 ; have_left jz .no_left test leftq, leftq jz .load_left_from_main @@ -1401,9 +1403,9 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge cmp xd, xlimd jl .right_extend + add srcq, strideq add sumsqq, (384+16)*4 add sumq, (384+16)*2 - add srcq, strideq dec hd jg .loop_y %if ARCH_X86_32 @@ -1434,7 +1436,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr lea yd, [hd+ylimd+2] lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] lea sum_ptrq, [ sumq+xq*2+2-(384+16)*2] - test edged, 4 ; have_top + test edgeb, 4 ; have_top jnz .load_top movu m0, [sumsq_ptrq+(384+16)*4*1] movu 
m1, [sumsq_ptrq+(384+16)*4*1+16] @@ -1520,7 +1522,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr lea yd, [ylimd+2] add yd, hm lea sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4] - test dword edgem, 4 ; have_top + test byte edgem, 4 ; have_top jnz .sumsq_load_top movu m0, [sumsq_ptrq+(384+16)*4*1] movu m1, [sumsq_ptrq+(384+16)*4*1+16] @@ -1582,7 +1584,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr lea yd, [ylimd+2] add yd, hm lea sum_ptrq, [sumq+xq*2+2-(384+16)*2] - test dword edgem, 4 ; have_top + test byte edgem, 4 ; have_top jnz .sum_load_top movu m0, [sum_ptrq+(384+16)*2*1] mova m1, m0 @@ -1882,7 +1884,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt movifnidn wd, wm - mov wtq, wtmp + movd m0, wtm %if ARCH_X86_64 movifnidn hd, hm mova m10, [pd_1024] @@ -1892,7 +1894,6 @@ cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt %define m10 [PIC_sym(pd_1024)] %define m11 m7 %endif - movd m0, [wtq] pshufd m0, m0, 0 DEFINE_ARGS dst, stride, t1, t2, w, h, idx %if ARCH_X86_32 diff --git a/ffmpeg/JNI/dav1d/src/x86/mc.asm b/ffmpeg/JNI/dav1d/src/x86/mc.asm index 773546957..5d769df8d 100644 --- a/ffmpeg/JNI/dav1d/src/x86/mc.asm +++ b/ffmpeg/JNI/dav1d/src/x86/mc.asm @@ -133,18 +133,39 @@ subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 +subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11 +subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7 deint_shuf4: db 0, 4, 
1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11 blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3 +wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +pb_8x0_8x8: times 8 db 0 + times 8 db 8 +bdct_lb_dw: times 4 db 0 + times 4 db 4 + times 4 db 8 + times 4 db 12 +ALIGN 32 +rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7 +resize_shuf: times 5 db 0 + db 1, 2, 3, 4, 5, 6 + times 5+8 db 7 + +ALIGN 8 wm_420_perm64: dq 0xfedcba9876543210 wm_420_sign: dd 0x01020102, 0x01010101 wm_422_sign: dd 0x80808080, 0x7f7f7f7f wm_sign_avx512: dd 0x40804080, 0xc0c0c0c0, 0x40404040 +ALIGN 4 +pb_0123: db 0, 1, 2, 3 +pb_4567: db 4, 5, 6, 7 pw_m128 times 2 dw -128 +pw_m256: times 2 dw -256 +pw_32: times 2 dw 32 pw_34: times 2 dw 34 pw_258: times 2 dw 258 pw_512: times 2 dw 512 @@ -152,10 +173,14 @@ pw_1024: times 2 dw 1024 pw_2048: times 2 dw 2048 pw_6903: times 2 dw 6903 pw_8192: times 2 dw 8192 -pd_2: dd 2 -pd_32: dd 32 -pd_512: dd 512 -pd_32768: dd 32768 +pd_2: dd 2 +pd_32: dd 32 +pd_63: dd 63 +pd_512: dd 512 +pd_32768: dd 32768 +pd_0x3ff: dd 0x3ff +pd_0x4000: dd 0x4000 +pq_0x40000000: dq 0x40000000 %define pb_m64 (wm_sign_avx512+4) %define pb_64 (wm_sign_avx512+8) @@ -218,28 +243,55 @@ cextern mc_warp_filter %endrep %endmacro +%macro SCALED_JMP_TABLE 1-* + %xdefine %1_table (%%table - %2) + %xdefine %%base mangle(private_prefix %+ _%1) +%%table: + %rep %0 - 1 + dw %%base %+ .w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_1024: + %xdefine %1_dy1_table (%%dy_1024 - %2) + %rep %0 - 1 + dw %%base %+ .dy1_w%2 - %%base + %rotate 1 + %endrep + %rotate 1 +%%dy_2048: + %xdefine %1_dy2_table (%%dy_2048 - %2) + %rep %0 - 1 + dw %%base %+ .dy2_w%2 - %%base + %rotate 1 + %endrep +%endmacro + %xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put) %xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep) %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep) %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX 
-BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 -BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 -HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 -BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 -BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 -BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 +BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128 +BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128 +SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128 +BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32 +BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32 +%if HAVE_AVX512ICL BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128 @@ -249,6 +301,7 @@ BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128 
BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_422_avx512icl, 4, 8, 16, 32, 64, 128 BIDIR_JMP_TABLE w_mask_444_avx512icl, 4, 8, 16, 32, 64, 128 +%endif ; HAVE_AVX512ICL SECTION .text @@ -1929,19 +1982,22 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 %assign FILTER_SMOOTH (1*15 << 16) | 4*15 %assign FILTER_SHARP (2*15 << 16) | 3*15 +%macro FN 4 ; fn, type, type_h, type_v +cglobal %1_%2 + mov t0d, FILTER_%3 + mov t1d, FILTER_%4 +%ifnidn %1, sharp_smooth ; skip the jump in the last filter + jmp mangle(private_prefix %+ _%1 %+ SUFFIX) +%endif +%endmacro + %if WIN64 DECLARE_REG_TMP 4, 5 %else DECLARE_REG_TMP 7, 8 %endif -%macro PUT_8TAP_FN 3 ; type, type_h, type_v -cglobal put_8tap_%1 - mov t0d, FILTER_%2 - mov t1d, FILTER_%3 -%ifnidn %1, sharp_smooth ; skip the jump in the last filter - jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX) -%endif -%endmacro + +%define PUT_8TAP_FN FN put_8tap, PUT_8TAP_FN regular, REGULAR, REGULAR PUT_8TAP_FN regular_sharp, REGULAR, SHARP @@ -3859,6 +3915,1853 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3 RET %endmacro +%macro movifprep 2 + %if isprep + mov %1, %2 + %endif +%endmacro + +%macro REMAP_REG 2 + %xdefine r%1 r%2 + %xdefine r%1q r%2q + %xdefine r%1d r%2d +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 + %if isprep + %xdefine r14_save r14 + %assign %%i 14 + %rep 14 + %assign %%j %%i-1 + REMAP_REG %%i, %%j + %assign %%i %%i-1 + %endrep + %endif +%endmacro + +%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 + %if isprep + %assign %%i 1 + %rep 13 + %assign %%j %%i+1 + REMAP_REG %%i, %%j + %assign %%i %%i+1 + %endrep + %xdefine r14 r14_save + %undef r14_save + %endif +%endmacro + +%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged + MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT + RET + %if %1 + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %endif +%endmacro + +%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] + movq xm%1, [srcq+ r4] + movq xm%2, [srcq+ r6] + 
movhps xm%1, [srcq+ r7] + movhps xm%2, [srcq+ r9] + vinserti128 m%1, [srcq+r10], 1 + vinserti128 m%2, [srcq+r11], 1 + vpbroadcastq m%5, [srcq+r13] + vpbroadcastq m%6, [srcq+ rX] + add srcq, ssq + movq xm%3, [srcq+ r4] + movq xm%4, [srcq+ r6] + movhps xm%3, [srcq+ r7] + movhps xm%4, [srcq+ r9] + vinserti128 m%3, [srcq+r10], 1 + vinserti128 m%4, [srcq+r11], 1 + vpbroadcastq m%7, [srcq+r13] + vpbroadcastq m%8, [srcq+ rX] + add srcq, ssq + vpblendd m%1, m%5, 0xc0 + vpblendd m%2, m%6, 0xc0 + vpblendd m%3, m%7, 0xc0 + vpblendd m%4, m%8, 0xc0 + pmaddubsw m%1, m15 + pmaddubsw m%2, m10 + pmaddubsw m%3, m15 + pmaddubsw m%4, m10 + phaddw m%1, m%2 + phaddw m%3, m%4 + phaddw m%1, m%3 + pmulhrsw m%1, m12 +%endmacro + +%macro MC_8TAP_SCALED 1 +%ifidn %1, put + %assign isprep 0 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy + %else +cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy + %endif + %xdefine base_reg r12 + %define rndshift 10 +%else + %assign isprep 1 + %if required_stack_alignment <= STACK_ALIGNMENT +cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy + %xdefine tmp_stridem r14q + %else +cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy + %define tmp_stridem qword [rsp+104] + %endif + %xdefine base_reg r11 + %define rndshift 6 +%endif + lea base_reg, [%1_8tap_scaled_avx2] +%define base base_reg-%1_8tap_scaled_avx2 + tzcnt wd, wm + vpbroadcastd m8, dxm +%if isprep && UNIX64 + movd xm14, mxd + vpbroadcastd m14, xm14 + mov r5d, t0d + DECLARE_REG_TMP 5, 7 +%else + vpbroadcastd m14, mxm +%endif + mov dyd, dym +%ifidn %1, put + %if WIN64 + mov r8d, hm + DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 + %define hm r5m + %define dxm r8m + %else + DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 + %define hm r6m + %endif + %if required_stack_alignment > STACK_ALIGNMENT + %define dsm [rsp+96] + %define rX 
r1 + %define rXd r1d + %else + %define dsm dsq + %define rX r14 + %define rXd r14d + %endif +%else ; prep + %if WIN64 + mov r7d, hm + DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 + %define hm r4m + %define dxm r7m + %else + DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 + %define hm [rsp+96] + %endif + MCT_8TAP_SCALED_REMAP_REGS_TO_PREV + %define rX r14 + %define rXd r14d +%endif + vpbroadcastd m10, [base+pd_0x3ff] + vpbroadcastd m12, [base+pw_8192] +%ifidn %1, put + vpbroadcastd m13, [base+pd_512] +%else + vpbroadcastd m13, [base+pd_32] +%endif + pxor m9, m9 + lea ss3q, [ssq*3] + movzx r7d, t1b + shr t1d, 16 + cmp hd, 6 + cmovs t1d, r7d + sub srcq, ss3q + cmp dyd, 1024 + je .dy1 + cmp dyd, 2048 + je .dy2 + movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*2] + movhps xm0, [srcq+ssq*1] + movhps xm1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*2], 1 + vpbroadcastq m2, [srcq+ssq*1] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + vpblendd m15, m7, 0xaa + vpblendd m0, m2, 0xc0 ; 0 1 4 5 + vpblendd m1, m3, 0xc0 ; 2 3 6 7 + pblendvb m15, m11, m8 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7 + vextracti128 xm1, m0, 1 ; 4 5 6 7 + palignr xm2, xm1, xm0, 4 ; 1 2 3 4 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 
; 23 34 + pshufd xm4, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm4 ; 45 56 + punpckhwd xm4, xm1, xm4 ; 67 __ +.w2_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + pmaddwd xm8, xm4, xm11 + paddd xm5, xm6 + paddd xm7, xm8 + paddd xm5, xm13 + paddd xm5, xm7 + psrad xm5, 10 + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq], xm5, 0 + add dstq, dsq + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w2_loop + movq xm5, [srcq] + test myd, 0x400 + jz .w2_skip_line + add srcq, ssq + shufps xm3, xm0, q1032 ; 01 12 + shufps xm0, xm2, q1032 ; 23 34 + shufps xm2, xm4, q1032 ; 45 56 + pshufb xm5, xm14 + pmaddubsw xm5, xm15 + phaddw xm5, xm5 + pmulhrsw xm5, xm12 + palignr xm1, xm5, xm1, 12 + punpcklqdq xm1, xm1 ; 6 7 6 7 + punpcklwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +.w2_skip_line: + movhps xm5, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + mova xm3, xm0 ; 01 12 + mova xm0, xm2 ; 23 34 + pshufb xm5, xm14 + pmaddubsw xm5, xm15 + phaddw xm5, xm5 + pmulhrsw xm5, xm12 ; 6 7 6 7 + palignr xm1, xm5, xm1, 8 ; 4 5 6 7 + pshufd xm5, xm1, q0321 ; 5 6 7 _ + punpcklwd xm2, xm1, xm5 ; 45 56 + punpckhwd xm4, xm1, xm5 ; 67 __ + jmp .w2_loop +%endif +.w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m0, m14, m10 + psrld m0, 6 + paddd xm15, xm0 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd 
m0, m9 + psrld m14, 10 + movu xm7, [srcq+ssq*0] + movu xm9, [srcq+ssq*1] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm8, [srcq+ssq*2] + movu xm10, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m7, [srcq+ssq*0], 1 + vinserti128 m9, [srcq+ssq*1], 1 + vinserti128 m15, xm15, 1 + vinserti128 m8, [srcq+ssq*2], 1 + vinserti128 m10, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pblendvb m15, m11, m0 + pshufb m7, m14 + pshufb m9, m14 + pshufb m8, m14 + pshufb m10, m14 + pmaddubsw m7, m15 + pmaddubsw m9, m15 + pmaddubsw m8, m15 + pmaddubsw m10, m15 + phaddw m7, m9 + phaddw m8, m10 + pmulhrsw m7, m12 ; 0 1 4 5 + pmulhrsw m8, m12 ; 2 3 6 7 + vextracti128 xm9, m7, 1 ; 4 5 + vextracti128 xm3, m8, 1 ; 6 7 + shufps xm4, xm7, xm8, q1032 ; 1 2 + shufps xm5, xm8, xm9, q1032 ; 3 4 + shufps xm6, xm9, xm3, q1032 ; 5 6 + psrldq xm11, xm3, 8 ; 7 _ + punpcklwd xm0, xm7, xm4 ; 01 + punpckhwd xm7, xm4 ; 12 + punpcklwd xm1, xm8, xm5 ; 23 + punpckhwd xm8, xm5 ; 34 + punpcklwd xm2, xm9, xm6 ; 45 + punpckhwd xm9, xm6 ; 56 + punpcklwd xm3, xm11 ; 67 + mova [rsp+0x00], xm7 + mova [rsp+0x10], xm8 + mova [rsp+0x20], xm9 +.w4_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm10, r6q + punpcklbw xm10, xm10 + psraw xm10, 8 + pshufd xm7, xm10, q0000 + pshufd xm8, xm10, q1111 + pshufd xm9, xm10, q2222 + pshufd xm10, xm10, q3333 + pmaddwd xm4, xm0, xm7 + pmaddwd xm5, xm1, xm8 + pmaddwd xm6, xm2, xm9 + pmaddwd xm7, xm3, xm10 + paddd xm4, xm5 + paddd xm6, xm7 + paddd xm4, xm13 + paddd xm4, xm6 + psrad xm4, rndshift + packssdw xm4, xm4 +%ifidn %1, put + packuswb xm4, xm4 + movd [dstq], xm4 + add dstq, dsq +%else + movq [tmpq], xm4 + add tmpq, 8 +%endif + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w4_loop + movu xm4, [srcq] + test myd, 0x400 + jz .w4_skip_line + mova xm0, [rsp+0x00] + mova [rsp+0x00], xm1 + mova 
xm1, [rsp+0x10] + mova [rsp+0x10], xm2 + mova xm2, [rsp+0x20] + mova [rsp+0x20], xm3 + pshufb xm4, xm14 + pmaddubsw xm4, xm15 + phaddw xm4, xm4 + pmulhrsw xm4, xm12 + punpcklwd xm3, xm11, xm4 + mova xm11, xm4 + add srcq, ssq + jmp .w4_loop +.w4_skip_line: + movu xm5, [srcq+ssq*1] + movu m6, [rsp+0x10] + pshufb xm4, xm14 + pshufb xm5, xm14 + pmaddubsw xm4, xm15 + pmaddubsw xm5, xm15 + movu [rsp+0x00], m6 + phaddw xm4, xm5 + pmulhrsw xm4, xm12 + punpcklwd xm9, xm11, xm4 + mova [rsp+0x20], xm9 + psrldq xm11, xm4, 8 + mova xm0, xm1 + mova xm1, xm2 + mova xm2, xm3 + punpcklwd xm3, xm4, xm11 + lea srcq, [srcq+ssq*2] + jmp .w4_loop +.w8: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + movd xm15, t0d + pmaddwd m8, [base+rescale_mul] + vpbroadcastq m11, [base+pq_0x40000000] + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movq xm15, [base+subpel_filters+r4*8] + movq xm10, [base+subpel_filters+r6*8] + movhps xm15, [base+subpel_filters+r7*8] + movhps xm10, [base+subpel_filters+r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+rX*8] + psrld m14, 10 + mova [rsp], xm14 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 
3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + mov dyd, dym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.w8_loop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufd m8, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m6, m2, m8 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + dec hd + jz .ret + add myd, dyd + test myd, ~0x3ff + jz .w8_loop + test myd, 0x400 + mov [rsp+16], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + jz .w8_skip_line + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + add srcq, ssq + mov myd, [rsp+16] + mov dyd, dym + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .w8_loop +.w8_skip_line: + mova m0, m1 + mova m1, m2 + mova m2, m3 + vpbroadcastq m7, [srcq+r13] + vpbroadcastq m8, [srcq+ rX] + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + 
vinserti128 m4, [srcq+r11], 1 + add srcq, ssq + movq xm5, [srcq+ r4] + movq xm6, [srcq+ r6] + movhps xm5, [srcq+ r7] + movhps xm6, [srcq+ r9] + vinserti128 m5, [srcq+r10], 1 + vinserti128 m6, [srcq+r11], 1 + vpbroadcastq m9, [srcq+r13] + vpbroadcastq m11, [srcq+ rX] + add srcq, ssq + mov myd, [rsp+16] + mov dyd, dym + vpblendd m3, m7, 0xc0 + vpblendd m4, m8, 0xc0 + vpblendd m5, m9, 0xc0 + vpblendd m6, m11, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + pmaddubsw m5, m15 + pmaddubsw m6, m10 + phaddw m3, m4 + phaddw m5, m6 + psrld m4, m3, 16 + pslld m6, m5, 16 + paddw m3, m4 + paddw m5, m6 + pblendw m3, m5, 0xaa + pmulhrsw m3, m12 + jmp .w8_loop +.w16: + mov dword [rsp+48], 2 + movifprep tmp_stridem, 32 + jmp .w_start +.w32: + mov dword [rsp+48], 4 + movifprep tmp_stridem, 64 + jmp .w_start +.w64: + mov dword [rsp+48], 8 + movifprep tmp_stridem, 128 + jmp .w_start +.w128: + mov dword [rsp+48], 16 + movifprep tmp_stridem, 256 +.w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+72], t0d + mov [rsp+56], srcq + mov [rsp+64], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .hloop +.hloop_prep: + dec dword [rsp+48] + jz .ret + add qword [rsp+64], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+16] + vpbroadcastd m15, [rsp+72] + pxor m9, m9 + mov srcq, [rsp+56] + mov r0q, [rsp+64] ; dstq / tmpq +.hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+16], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ 
r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + mova [rsp], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + mov dyd, dym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + vbroadcasti128 m14, [base+wswap] +.vloop: + and myd, 0x3ff + mov r6d, 64 << 24 + mov r4d, myd + shr r4d, 6 + lea r4d, [t1+r4] + cmovnz r6q, [base+subpel_filters+r4*8] + movq xm11, r6q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufd m8, m11, q2222 + pshufd m11, m11, q3333 + pmaddwd m6, m2, m8 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .hloop_prep + add myd, dyd + test myd, ~0x3ff + jz .vloop + test myd, 0x400 + mov [rsp+52], myd + mov r4d, [rsp+ 0] + mov r6d, [rsp+ 8] + mov r7d, [rsp+ 4] + mov r9d, [rsp+12] + jz .skip_line + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + movq xm4, [srcq+ 
r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + add srcq, ssq + mov myd, [rsp+52] + mov dyd, dym + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .vloop +.skip_line: + mova m0, m1 + mova m1, m2 + mova m2, m3 + vpbroadcastq m7, [srcq+r13] + vpbroadcastq m8, [srcq+ rX] + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + add srcq, ssq + movq xm5, [srcq+ r4] + movq xm6, [srcq+ r6] + movhps xm5, [srcq+ r7] + movhps xm6, [srcq+ r9] + vinserti128 m5, [srcq+r10], 1 + vinserti128 m6, [srcq+r11], 1 + vpbroadcastq m9, [srcq+r13] + vpbroadcastq m11, [srcq+ rX] + add srcq, ssq + mov myd, [rsp+52] + mov dyd, dym + vpblendd m3, m7, 0xc0 + vpblendd m4, m8, 0xc0 + vpblendd m5, m9, 0xc0 + vpblendd m6, m11, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + pmaddubsw m5, m15 + pmaddubsw m6, m10 + phaddw m3, m4 + phaddw m5, m6 + psrld m4, m3, 16 + pslld m6, m5, 16 + paddw m3, m4 + paddw m5, m6 + pblendw m3, m5, 0xaa + pmulhrsw m3, m12 + jmp .vloop +.dy1: + movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy1_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 
+ psrld m14, 10 + movq xm0, [srcq+ssq*0] + movq xm1, [srcq+ssq*2] + movhps xm0, [srcq+ssq*1] + movhps xm1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + vinserti128 m0, [srcq+ssq*0], 1 + vinserti128 m1, [srcq+ssq*2], 1 + vpbroadcastq m2, [srcq+ssq*1] + add srcq, ss3q + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + pshufd xm8, xm10, q0000 + pshufd xm9, xm10, q1111 + pshufd xm11, xm10, q3333 + pshufd xm10, xm10, q2222 + vpblendd m0, m2, 0xc0 + pshufb m1, m14 + pshufb m0, m14 + pmaddubsw m1, m15 + pmaddubsw m0, m15 + phaddw m0, m1 + pmulhrsw m0, m12 + vextracti128 xm1, m0, 1 + palignr xm2, xm1, xm0, 4 + pshufd xm4, xm1, q2121 + punpcklwd xm3, xm0, xm2 ; 01 12 + punpckhwd xm0, xm2 ; 23 34 + punpcklwd xm2, xm1, xm4 ; 45 56 +.dy1_w2_loop: + movq xm1, [srcq+ssq*0] + movhps xm1, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + pmaddwd xm5, xm3, xm8 + pmaddwd xm6, xm0, xm9 + pmaddwd xm7, xm2, xm10 + mova xm3, xm0 + mova xm0, xm2 + paddd xm5, xm13 + paddd xm6, xm7 + pshufb xm1, xm14 + pmaddubsw xm1, xm15 + phaddw xm1, xm1 + pmulhrsw xm1, xm12 + palignr xm7, xm1, xm4, 12 + punpcklwd xm2, xm7, xm1 ; 67 78 + pmaddwd xm7, xm2, xm11 + mova xm4, xm1 + paddd xm5, xm6 + paddd xm5, xm7 + psrad xm5, rndshift + packssdw xm5, xm5 + packuswb xm5, xm5 + pextrw [dstq+dsq*0], xm5, 0 + pextrw [dstq+dsq*1], xm5, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy1_w2_loop + RET +%endif +.dy1_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + vpermq m8, m8, q3120 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r11d, xm15, 1 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, 
[base+subpel_filters+r6*8+2] + movu xm2, [srcq+ssq*0] + movu xm3, [srcq+ssq*2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pcmpeqd m8, m9 + psrld m14, 10 + pinsrd xm15, [base+subpel_filters+r11*8+2], 1 + vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20 + vinserti128 m2, [srcq+ssq*1], 1 + vinserti128 m3, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + pshufb m14, m5 + paddb m14, m6 + movu xm4, [srcq+ssq*0] + movu xm5, [srcq+ssq*2] + vinserti128 m4, [srcq+ssq*1], 1 + add srcq, ss3q + vpblendd m15, m7, 0x30 + punpcklqdq m15, m15 + pblendvb m15, m11, m8 + movq xm10, r4q + punpcklbw xm10, xm10 + psraw xm10, 8 + vinserti128 m10, xm10, 1 + pshufb m2, m14 + pshufb m3, m14 + pshufb m4, m14 + pshufb xm5, xm14 + vpermq m2, m2, q3120 + vpermq m3, m3, q3120 + vpermq m4, m4, q3120 + vpermq m5, m5, q3120 + pshufd m7, m10, q0000 + pshufd m8, m10, q1111 + pshufd m9, m10, q2222 + pshufd m10, m10, q3333 + pmaddubsw m2, m15 + pmaddubsw m3, m15 + pmaddubsw m4, m15 + pmaddubsw m5, m15 + phaddw m2, m3 + phaddw m4, m5 + pmulhrsw m2, m12 + pmulhrsw m4, m12 + palignr m5, m4, m2, 4 + pshufd m3, m4, q2121 + punpcklwd m0, m2, m5 ; 01 12 + punpckhwd m1, m2, m5 ; 23 34 + punpcklwd m2, m4, m3 ; 45 56 +.dy1_w4_loop: + movu xm11, [srcq+ssq*0] + vinserti128 m11, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pmaddwd m4, m0, m7 + pmaddwd m5, m1, m8 + pmaddwd m6, m2, m9 + mova m0, m1 + mova m1, m2 + paddd m4, m13 + paddd m5, m6 + pshufb m11, m14 + vpermq m11, m11, q3120 + pmaddubsw m11, m15 + phaddw m11, m11 + pmulhrsw m11, m12 + palignr m6, m11, m3, 12 + punpcklwd m2, m6, m11 ; 67 78 + mova m3, m11 + pmaddwd m6, m2, m10 + paddd m4, m5 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + pshuflw xm4, xm4, q3120 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] +%else + pshufd xm4, 
xm4, q3120 + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy1_w4_loop + MC_8TAP_SCALED_RET +.dy1_w8: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + movd xm15, t0d + pmaddwd m8, [base+rescale_mul] + vpbroadcastq m11, [base+pq_0x40000000] + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + mov [rsp+32], r7d + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + movu [rsp], m10 + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + lea myd, [t1+myq] + mov t1d, 64 << 24 + cmovnz t1q, [base+subpel_filters+myq*8] + vbroadcasti128 m14, [base+wswap] + movq xm11, t1q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + mov r7d, [rsp+32] + pshufd m8, m11, q0000 + 
pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 +.dy1_w8_loop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + dec hd + jz .ret + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, [rsp] + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_w8_loop +.dy1_w16: + mov dword [rsp+72], 2 + movifprep tmp_stridem, 32 + jmp .dy1_w_start +.dy1_w32: + mov dword [rsp+72], 4 + movifprep tmp_stridem, 64 + jmp .dy1_w_start +.dy1_w64: + mov dword [rsp+72], 8 + movifprep tmp_stridem, 128 + jmp .dy1_w_start +.dy1_w128: + mov dword [rsp+72], 16 + movifprep tmp_stridem, 256 +.dy1_w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+76], t0d + mov [rsp+80], srcq + mov [rsp+88], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .dy1_hloop +.dy1_hloop_prep: + dec dword [rsp+72] + jz .ret + add qword [rsp+88], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp+32] + vpbroadcastd m15, [rsp+76] + pxor m9, m9 + mov srcq, [rsp+80] + mov r0q, [rsp+88] ; dstq / tmpq +.dy1_hloop: + vpbroadcastq m11, 
[base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp+32], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movq [rsp+64], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + movu [rsp], m10 + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vbroadcasti128 m14, [base+wswap] + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + mov r4d, [rsp+64] + mov r7d, [rsp+68] + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 +.dy1_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m10 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, 
m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy1_hloop_prep + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + pshufb m0, m14 + pshufb m1, m14 + pshufb m2, m14 + pshufb m3, m14 + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, [rsp] + phaddw m4, m5 + pslld m5, m4, 16 + paddw m4, m5 + pmulhrsw m4, m12 + pblendw m0, m1, 0xaa + pblendw m1, m2, 0xaa + pblendw m2, m3, 0xaa + pblendw m3, m4, 0xaa + jmp .dy1_vloop +.dy2: + movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] + add wq, base_reg + jmp wq +%ifidn %1, put +.dy2_w2: + mov myd, mym + movzx t0d, t0b + dec srcq + movd xm15, t0d + punpckldq m8, m9, m8 + paddd m14, m8 ; mx+dx*[0-1] + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + vbroadcasti128 m5, [base+bdct_lb_dw] + vbroadcasti128 m6, [base+subpel_s_shuf2] + vpbroadcastd m15, [base+subpel_filters+r4*8+2] + vpbroadcastd m7, [base+subpel_filters+r6*8+2] + pcmpeqd m8, m9 + psrld m14, 10 + movq xm0, [srcq+ssq*0] + vpbroadcastq m2, [srcq+ssq*1] + movhps xm0, [srcq+ssq*2] + vpbroadcastq m3, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pshufb m14, m5 + paddb m14, m6 + vpblendd m15, m7, 0xaa + pblendvb m15, m11, m8 + movhps xm1, [srcq+ssq*0] + vpbroadcastq m4, [srcq+ssq*1] + lea srcq, [srcq+ssq*2] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vpblendd m0, m2, 0x30 + vpblendd m1, m4, 0xc0 + vpblendd m0, m3, 0xc0 + pshufb m0, m14 + pshufb m1, m14 + pmaddubsw m0, m15 + pmaddubsw m1, m15 + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + phaddw m0, m1 + pmulhrsw m0, m12 ; 0 
2 _ 4 1 3 _ 5 + pshufd xm8, xm11, q0000 + pshufd xm9, xm11, q1111 + pshufd xm10, xm11, q2222 + pshufd xm11, xm11, q3333 + pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 01 23 + punpckhwd xm2, xm1 ; 23 45 +.dy2_w2_loop: + movq xm6, [srcq+ssq*0] + vpbroadcastq m7, [srcq+ssq*1] + movhps xm6, [srcq+ssq*2] + vpbroadcastq m1, [srcq+ss3q ] + lea srcq, [srcq+ssq*4] + pmaddwd xm4, xm3, xm8 + pmaddwd xm5, xm2, xm9 + vpblendd m6, m7, 0x30 + vpblendd m6, m1, 0xc0 + pshufb m6, m14 + pmaddubsw m6, m15 + phaddw m6, m6 + pmulhrsw m6, m12 + palignr m0, m6, m0, 8 + pshufd m2, m0, q3221 + vextracti128 xm1, m2, 1 + punpcklwd xm3, xm2, xm1 ; 45 67 + punpckhwd xm2, xm1 ; 67 89 + pmaddwd xm6, xm3, xm10 + pmaddwd xm7, xm2, xm11 + paddd xm4, xm5 + paddd xm4, xm13 + paddd xm6, xm7 + paddd xm4, xm6 + psrad xm4, rndshift + packssdw xm4, xm4 + packuswb xm4, xm4 + pextrw [dstq+dsq*0], xm4, 0 + pextrw [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] + sub hd, 2 + jg .dy2_w2_loop + RET +%endif +.dy2_w4: + mov myd, mym + vbroadcasti128 m7, [base+rescale_mul] + movzx t0d, t0b + dec srcq + movd xm15, t0d + pmaddwd m8, m7 + vpbroadcastd m11, [base+pd_0x4000] + vpbroadcastd xm15, xm15 + paddd m14, m8 ; mx+dx*[0-3] + pand m8, m14, m10 + psrld m8, 6 + paddd xm15, xm8 + movd r4d, xm15 + pextrd r6d, xm15, 1 + pextrd r11d, xm15, 2 + pextrd r13d, xm15, 3 + movd xm15, [base+subpel_filters+r4*8+2] + vbroadcasti128 m5, [base+bdct_lb_dw] + vpbroadcastq m6, [base+subpel_s_shuf2] + pinsrd xm15, [base+subpel_filters+r6*8+2], 1 + pcmpeqd m8, m9 + psrld m14, 10 + movu xm0, [srcq+ssq*0] + movu xm2, [srcq+ssq*2] + pinsrd xm15, [base+subpel_filters+r11*8+2], 2 + movu xm1, [srcq+ssq*1] + movu xm3, [srcq+ss3q ] + pinsrd xm15, [base+subpel_filters+r13*8+2], 3 + lea srcq, [srcq+ssq*4] + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + vinserti128 m15, xm15, 1 + pshufb m14, m5 + paddb m14, m6 + vinserti128 m2, [srcq+ssq*0], 1 + 
vinserti128 m3, [srcq+ssq*1], 1 + lea srcq, [srcq+ssq*2] + pblendvb m15, m11, m8 + pshufb xm0, xm14 + pshufb m2, m14 + pshufb xm1, xm14 + pshufb m3, m14 + pmaddubsw xm0, xm15 + pmaddubsw m2, m15 + pmaddubsw xm1, xm15 + pmaddubsw m3, m15 + movq xm11, r4q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + phaddw m0, m2 + phaddw m1, m3 + pmulhrsw m0, m12 ; 0 2 _ 4 + pmulhrsw m1, m12 ; 1 3 _ 5 + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m10, m11, q2222 + pshufd m11, m11, q3333 + punpcklwd xm2, xm0, xm1 + punpckhwd m1, m0, m1 ; 23 45 + vinserti128 m0, m2, xm1, 1 ; 01 23 +.dy2_w4_loop: + movu xm6, [srcq+ssq*0] + movu xm7, [srcq+ssq*1] + vinserti128 m6, [srcq+ssq*2], 1 + vinserti128 m7, [srcq+ss3q ], 1 + lea srcq, [srcq+ssq*4] + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pshufb m6, m14 + pshufb m7, m14 + pmaddubsw m6, m15 + pmaddubsw m7, m15 + psrld m2, m6, 16 + pslld m3, m7, 16 + paddw m6, m2 + paddw m7, m3 + pblendw m6, m7, 0xaa ; 67 89 + pmulhrsw m6, m12 + paddd m4, m5 + vpblendd m0, m1, m6, 0x0f + mova m1, m6 + vpermq m0, m0, q1032 ; 45 67 + pmaddwd m6, m0, m10 + pmaddwd m7, m1, m11 + paddd m4, m13 + paddd m6, m7 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movd [dstq+dsq*0], xm4 + pextrd [dstq+dsq*1], xm4, 1 + lea dstq, [dstq+dsq*2] +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + sub hd, 2 + jg .dy2_w4_loop + MC_8TAP_SCALED_RET +.dy2_w8: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + movd xm15, t0d + pmaddwd m8, [base+rescale_mul] + vpbroadcastq m11, [base+pq_0x40000000] + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, 
[base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, [base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + mov [rsp], r7d + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + lea myd, [t1+myq] + mov t1d, 64 << 24 + cmovnz t1q, [base+subpel_filters+myq*8] + movq xm11, t1q + punpcklbw xm11, xm11 + psraw xm11, 8 + vinserti128 m11, xm11, 1 + mov r7d, [rsp] + pshufd m8, m11, q0000 + pshufd m9, m11, q1111 + pshufd m14, m11, q2222 + pshufd m11, m11, q3333 +.dy2_w8_loop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m14 + pmaddwd m7, m3, m11 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, 16 +%endif + dec hd + jz .ret + mova m0, m1 + mova m1, m2 + mova m2, m3 + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + vpbroadcastq m5, [srcq+r13] + vpbroadcastq m6, [srcq+ rX] + add 
srcq, ssq + vpblendd m3, m5, 0xc0 + vpblendd m4, m6, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + phaddw m3, m4 + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + psrld m5, m3, 16 + pslld m6, m4, 16 + paddw m3, m5 + paddw m4, m6 + pblendw m3, m4, 0xaa + pmulhrsw m3, m12 + jmp .dy2_w8_loop +.dy2_w16: + mov dword [rsp+40], 2 + movifprep tmp_stridem, 32 + jmp .dy2_w_start +.dy2_w32: + mov dword [rsp+40], 4 + movifprep tmp_stridem, 64 + jmp .dy2_w_start +.dy2_w64: + mov dword [rsp+40], 8 + movifprep tmp_stridem, 128 + jmp .dy2_w_start +.dy2_w128: + mov dword [rsp+40], 16 + movifprep tmp_stridem, 256 +.dy2_w_start: +%ifidn %1, put + movifnidn dsm, dsq +%endif + shr t0d, 16 + sub srcq, 3 + pmaddwd m8, [base+rescale_mul] + movd xm15, t0d + mov [rsp+64], t0d + mov [rsp+48], srcq + mov [rsp+56], r0q ; dstq / tmpq +%if UNIX64 + mov hm, hd +%endif + shl dword dxm, 3 ; dx*8 + vpbroadcastd m15, xm15 + paddd m14, m8 ; mx+dx*[0-7] + jmp .dy2_hloop +.dy2_hloop_prep: + dec dword [rsp+40] + jz .ret + add qword [rsp+56], 8*(isprep+1) + mov hd, hm + vpbroadcastd m8, dxm + vpbroadcastd m10, [base+pd_0x3ff] + paddd m14, m8, [rsp] + vpbroadcastd m15, [rsp+64] + pxor m9, m9 + mov srcq, [rsp+48] + mov r0q, [rsp+56] ; dstq / tmpq +.dy2_hloop: + vpbroadcastq m11, [base+pq_0x40000000] + pand m6, m14, m10 + psrld m6, 6 + paddd m15, m6 + pcmpeqd m6, m9 + vextracti128 xm7, m15, 1 + movd r4d, xm15 + pextrd r6d, xm15, 2 + pextrd r7d, xm15, 1 + pextrd r9d, xm15, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + movu [rsp], m14 + movq xm15, [base+subpel_filters+ r4*8] + movq xm10, [base+subpel_filters+ r6*8] + movhps xm15, [base+subpel_filters+ r7*8] + movhps xm10, 
[base+subpel_filters+ r9*8] + vinserti128 m15, [base+subpel_filters+r10*8], 1 + vinserti128 m10, [base+subpel_filters+r11*8], 1 + vpbroadcastq m9, [base+subpel_filters+r13*8] + vpbroadcastq m8, [base+subpel_filters+ rX*8] + psrld m14, 10 + vextracti128 xm7, m14, 1 + movq [rsp+32], xm14 + movd r4d, xm14 + pextrd r6d, xm14, 2 + pextrd r7d, xm14, 1 + pextrd r9d, xm14, 3 + movd r10d, xm7 + pextrd r11d, xm7, 2 + pextrd r13d, xm7, 1 + pextrd rXd, xm7, 3 + pshufd m5, m6, q1100 + pshufd m6, m6, q3322 + vpblendd m15, m9, 0xc0 + vpblendd m10, m8, 0xc0 + pblendvb m15, m11, m5 + pblendvb m10, m11, m6 + vbroadcasti128 m14, [base+subpel_s_shuf8] + MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b + MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b + MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b + MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b + mov myd, mym + pshufb m0, m14 ; 01a 01b + pshufb m1, m14 ; 23a 23b + pshufb m2, m14 ; 45a 45b + pshufb m3, m14 ; 67a 67b + shr myd, 6 + mov r4d, 64 << 24 + lea myd, [t1+myq] + cmovnz r4q, [base+subpel_filters+myq*8] + movq xm14, r4q + punpcklbw xm14, xm14 + psraw xm14, 8 + vinserti128 m14, xm14, 1 + mov r4d, [rsp+32] + mov r7d, [rsp+36] + pshufd m8, m14, q0000 + pshufd m9, m14, q1111 + pshufd m11, m14, q2222 + pshufd m14, m14, q3333 +.dy2_vloop: + pmaddwd m4, m0, m8 + pmaddwd m5, m1, m9 + pmaddwd m6, m2, m11 + pmaddwd m7, m3, m14 + paddd m4, m5 + paddd m6, m7 + paddd m4, m13 + paddd m4, m6 + psrad m4, rndshift + vextracti128 xm5, m4, 1 + packssdw xm4, xm5 +%ifidn %1, put + packuswb xm4, xm4 + movq [dstq], xm4 + add dstq, dsm +%else + mova [tmpq], xm4 + add tmpq, tmp_stridem +%endif + dec hd + jz .dy2_hloop_prep + mova m0, m1 + mova m1, m2 + mova m2, m3 + movq xm3, [srcq+ r4] + movq xm4, [srcq+ r6] + movhps xm3, [srcq+ r7] + movhps xm4, [srcq+ r9] + vinserti128 m3, [srcq+r10], 1 + vinserti128 m4, [srcq+r11], 1 + vpbroadcastq m5, [srcq+r13] + vpbroadcastq m6, [srcq+ rX] + add srcq, ssq + vpblendd m3, m5, 0xc0 + 
vpblendd m4, m6, 0xc0 + pmaddubsw m3, m15 + pmaddubsw m4, m10 + phaddw m3, m4 + movq xm4, [srcq+ r4] + movq xm5, [srcq+ r6] + movhps xm4, [srcq+ r7] + movhps xm5, [srcq+ r9] + vinserti128 m4, [srcq+r10], 1 + vinserti128 m5, [srcq+r11], 1 + vpbroadcastq m6, [srcq+r13] + vpbroadcastq m7, [srcq+ rX] + add srcq, ssq + vpblendd m4, m6, 0xc0 + vpblendd m5, m7, 0xc0 + pmaddubsw m4, m15 + pmaddubsw m5, m10 + phaddw m4, m5 + psrld m5, m3, 16 + pslld m6, m4, 16 + paddw m3, m5 + paddw m4, m6 + pblendw m3, m4, 0xaa + pmulhrsw m3, m12 + jmp .dy2_vloop +.ret: + MC_8TAP_SCALED_RET 0 +%undef isprep +%endmacro + +%macro BILIN_SCALED_FN 1 +cglobal %1_bilin_scaled + mov t0d, (5*15 << 16) | 5*15 + mov t1d, (5*15 << 16) | 5*15 + jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX) +%endmacro +%define PUT_8TAP_SCALED_FN FN put_8tap_scaled, +%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled, + +%if WIN64 +DECLARE_REG_TMP 6, 5 +%else +DECLARE_REG_TMP 6, 8 +%endif +BILIN_SCALED_FN put +PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR +PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PUT_8TAP_SCALED_FN sharp, SHARP, SHARP +PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +MC_8TAP_SCALED put + +%if WIN64 +DECLARE_REG_TMP 5, 4 +%else +DECLARE_REG_TMP 6, 7 +%endif +BILIN_SCALED_FN prep +PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR +PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_SCALED_FN sharp, SHARP, SHARP +PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH +MC_8TAP_SCALED prep + %macro WARP_V 5 ; 
dst, 02, 46, 13, 57 ; Can be done using gathers, but that's terribly slow on many CPU:s lea tmp1d, [myq+deltaq*4] @@ -4869,9 +6772,6 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ %macro v_loop 3 ; need_left_ext, need_right_ext, suffix .v_loop_%3: %if %1 - test leftextq, leftextq - jz .body_%3 - ; left extension xor r3, r3 vpbroadcastb m0, [srcq] @@ -4882,7 +6782,6 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ jl .left_loop_%3 ; body -.body_%3: lea r12, [dstq+leftextq] %endif xor r3, r3 @@ -4899,8 +6798,6 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ %if %2 ; right extension - test rightextq, rightextq - jz .body_loop_end_%3 %if %1 add r12, centerwq %else @@ -4914,7 +6811,6 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ cmp r3, rightextq jl .right_loop_%3 -.body_loop_end_%3: %endif add dstq, dstrideq add srcq, sstrideq @@ -4985,6 +6881,147 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \ .end: RET +cextern resize_filter + +INIT_YMM avx2 +cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + vpbroadcastd m5, dxm + vpbroadcastd m8, mx0m + vpbroadcastd m6, src_wm + + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr + LEA r7, $$ +%define base r7-$$ + + vpbroadcastd m3, [base+pw_m256] + vpbroadcastd m7, [base+pd_63] + vbroadcasti128 m15, [base+pb_8x0_8x8] + pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7] + pslld m5, 3 ; dx*8 + pslld m6, 14 + paddd m8, m2 ; mx+[0..7]*dx + pxor m2, m2 + + ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7 + ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8 + +.loop_y: + xor xd, xd + mova m4, m8 ; per-line working version of mx + +.loop_x: + pmaxsd m0, m4, m2 + psrad m9, m4, 8 ; filter offset (unmasked) + pminsd m0, m6 ; iclip(mx, 0, 
src_w-8) + psubd m1, m4, m0 ; pshufb offset + psrad m0, 14 ; clipped src_x offset + psrad m1, 14 ; pshufb edge_emu offset + pand m9, m7 ; filter offset (masked) + + ; load source pixels - this ugly code is vpgatherdq emulation since + ; directly using vpgatherdq on Haswell is quite a bit slower :( + movd r8d, xm0 + pextrd r9d, xm0, 1 + pextrd r10d, xm0, 2 + pextrd r11d, xm0, 3 + vextracti128 xm0, m0, 1 + movq xm12, [srcq+r8] + movq xm13, [srcq+r10] + movhps xm12, [srcq+r9] + movhps xm13, [srcq+r11] + movd r8d, xm0 + pextrd r9d, xm0, 1 + pextrd r10d, xm0, 2 + pextrd r11d, xm0, 3 + vinserti128 m12, [srcq+r8], 1 + vinserti128 m13, [srcq+r10], 1 + vpbroadcastq m10, [srcq+r9] + vpbroadcastq m11, [srcq+r11] + vpblendd m12, m12, m10, 11000000b + vpblendd m13, m13, m11, 11000000b + + ; if no emulation is required, we don't need to shuffle or emulate edges + ; this also saves 2 quasi-vpgatherdqs + vptest m1, m1 + jz .filter + + movd r8d, xm1 + pextrd r9d, xm1, 1 + pextrd r10d, xm1, 2 + pextrd r11d, xm1, 3 + movsxd r8, r8d + movsxd r9, r9d + movsxd r10, r10d + movsxd r11, r11d + vextracti128 xm1, m1, 1 + movq xm14, [base+resize_shuf+4+r8] + movq xm0, [base+resize_shuf+4+r10] + movhps xm14, [base+resize_shuf+4+r9] + movhps xm0, [base+resize_shuf+4+r11] + movd r8d, xm1 + pextrd r9d, xm1, 1 + pextrd r10d, xm1, 2 + pextrd r11d, xm1, 3 + movsxd r8, r8d + movsxd r9, r9d + movsxd r10, r10d + movsxd r11, r11d + vinserti128 m14, [base+resize_shuf+4+r8], 1 + vinserti128 m0, [base+resize_shuf+4+r10], 1 + vpbroadcastq m10, [base+resize_shuf+4+r9] + vpbroadcastq m11, [base+resize_shuf+4+r11] + vpblendd m14, m14, m10, 11000000b + vpblendd m0, m0, m11, 11000000b + + paddb m14, m15 + paddb m0, m15 + pshufb m12, m14 + pshufb m13, m0 + +.filter: + movd r8d, xm9 + pextrd r9d, xm9, 1 + pextrd r10d, xm9, 2 + pextrd r11d, xm9, 3 + vextracti128 xm9, m9, 1 + movq xm10, [base+resize_filter+r8*8] + movq xm11, [base+resize_filter+r10*8] + movhps xm10, [base+resize_filter+r9*8] + movhps xm11, 
[base+resize_filter+r11*8] + movd r8d, xm9 + pextrd r9d, xm9, 1 + pextrd r10d, xm9, 2 + pextrd r11d, xm9, 3 + vinserti128 m10, [base+resize_filter+r8*8], 1 + vinserti128 m11, [base+resize_filter+r10*8], 1 + vpbroadcastq m14, [base+resize_filter+r9*8] + vpbroadcastq m1, [base+resize_filter+r11*8] + vpblendd m10, m10, m14, 11000000b + vpblendd m11, m11, m1, 11000000b + + pmaddubsw m12, m10 + pmaddubsw m13, m11 + phaddw m12, m13 + vextracti128 xm13, m12, 1 + phaddsw xm12, xm13 + pmulhrsw xm12, xm3 ; x=(x+64)>>7 + packuswb xm12, xm12 + movq [dstq+xq], xm12 + + paddd m4, m5 + add xd, 8 + cmp xd, dst_wd + jl .loop_x + + add dstq, dst_strideq + add srcq, src_strideq + dec hd + jg .loop_y + RET + INIT_YMM avx2 PREP_BILIN PREP_8TAP @@ -5501,6 +7538,7 @@ cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3 jg .w128_loop RET +%if HAVE_AVX512ICL INIT_ZMM avx512icl PREP_BILIN PREP_8TAP @@ -6023,4 +8061,6 @@ cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3 jg .w128_loop RET +%endif ; HAVE_AVX512ICL + %endif ; ARCH_X86_64 diff --git a/ffmpeg/JNI/dav1d/src/x86/mc_init_tmpl.c b/ffmpeg/JNI/dav1d/src/x86/mc_init_tmpl.c index 8e2f6a0ba..a01ac14ab 100644 --- a/ffmpeg/JNI/dav1d/src/x86/mc_init_tmpl.c +++ b/ffmpeg/JNI/dav1d/src/x86/mc_init_tmpl.c @@ -52,33 +52,65 @@ decl_mc_fn(dav1d_put_bilin_ssse3); decl_mct_fn(dav1d_prep_8tap_regular_avx512icl); decl_mct_fn(dav1d_prep_8tap_regular_avx2); decl_mct_fn(dav1d_prep_8tap_regular_ssse3); +decl_mct_fn(dav1d_prep_8tap_regular_sse2); decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl); decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2); decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3); +decl_mct_fn(dav1d_prep_8tap_regular_smooth_sse2); decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl); decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2); decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3); +decl_mct_fn(dav1d_prep_8tap_regular_sharp_sse2); decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl); 
decl_mct_fn(dav1d_prep_8tap_smooth_avx2); decl_mct_fn(dav1d_prep_8tap_smooth_ssse3); +decl_mct_fn(dav1d_prep_8tap_smooth_sse2); decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl); decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2); decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3); +decl_mct_fn(dav1d_prep_8tap_smooth_regular_sse2); decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl); decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2); decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3); +decl_mct_fn(dav1d_prep_8tap_smooth_sharp_sse2); decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl); decl_mct_fn(dav1d_prep_8tap_sharp_avx2); decl_mct_fn(dav1d_prep_8tap_sharp_ssse3); +decl_mct_fn(dav1d_prep_8tap_sharp_sse2); decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl); decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2); decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3); +decl_mct_fn(dav1d_prep_8tap_sharp_regular_sse2); decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl); decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2); decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3); +decl_mct_fn(dav1d_prep_8tap_sharp_smooth_sse2); decl_mct_fn(dav1d_prep_bilin_avx512icl); decl_mct_fn(dav1d_prep_bilin_avx2); decl_mct_fn(dav1d_prep_bilin_ssse3); +decl_mct_fn(dav1d_prep_bilin_sse2); + +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2); +decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2); +decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2); + +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2); 
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2); +decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2); +decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2); decl_avg_fn(dav1d_avg_avx512icl); decl_avg_fn(dav1d_avg_avx2); @@ -115,17 +147,36 @@ decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2); decl_emu_edge_fn(dav1d_emu_edge_avx2); decl_emu_edge_fn(dav1d_emu_edge_ssse3); +decl_resize_fn(dav1d_resize_avx2); +decl_resize_fn(dav1d_resize_ssse3); + COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { #define init_mc_fn(type, name, suffix) \ c->mc[type] = dav1d_put_##name##_##suffix #define init_mct_fn(type, name, suffix) \ c->mct[type] = dav1d_prep_##name##_##suffix +#define init_mc_scaled_fn(type, name, suffix) \ + c->mc_scaled[type] = dav1d_put_##name##_##suffix +#define init_mct_scaled_fn(type, name, suffix) \ + c->mct_scaled[type] = dav1d_prep_##name##_##suffix + const unsigned flags = dav1d_get_cpu_flags(); if(!(flags & DAV1D_X86_CPU_FLAG_SSE2)) return; #if BITDEPTH == 8 + init_mct_fn(FILTER_2D_BILINEAR, bilin, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, sse2); + init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, sse2); + c->warp8x8 = 
dav1d_warp_affine_8x8_sse2; c->warp8x8t = dav1d_warp_affine_8x8t_sse2; #endif @@ -134,16 +185,16 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { return; #if BITDEPTH == 8 - init_mc_fn (FILTER_2D_BILINEAR, bilin, ssse3); - init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); - init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); - init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); - init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); - init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); + init_mc_fn(FILTER_2D_BILINEAR, bilin, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3); init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3); init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3); @@ -168,6 +219,7 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { c->warp8x8t = dav1d_warp_affine_8x8t_ssse3; c->emu_edge = dav1d_emu_edge_ssse3; + c->resize = dav1d_resize_ssse3; #endif if(!(flags & DAV1D_X86_CPU_FLAG_SSE41)) @@ -183,16 +235,16 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { return; #if BITDEPTH == 8 - init_mc_fn 
(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); - init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); - init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); - init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); - init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); - init_mc_fn (FILTER_2D_BILINEAR, bilin, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2); + init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); + init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2); init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2); init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2); @@ -205,6 +257,28 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2); init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + 
init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mc_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2); + init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2); + init_mct_scaled_fn(FILTER_2D_BILINEAR, bilin_scaled, avx2); + c->avg = dav1d_avg_avx2; c->w_avg = dav1d_w_avg_avx2; c->mask = dav1d_mask_avx2; @@ -219,12 +293,13 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) { c->warp8x8t = dav1d_warp_affine_8x8t_avx2; c->emu_edge = dav1d_emu_edge_avx2; + c->resize = dav1d_resize_avx2; #endif if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return; -#if BITDEPTH == 8 +#if HAVE_AVX512ICL && BITDEPTH == 8 init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx512icl); init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl); init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx512icl); diff --git a/ffmpeg/JNI/dav1d/src/x86/mc_ssse3.asm b/ffmpeg/JNI/dav1d/src/x86/mc_sse.asm similarity index 80% rename from ffmpeg/JNI/dav1d/src/x86/mc_ssse3.asm rename to ffmpeg/JNI/dav1d/src/x86/mc_sse.asm index 
d0f0dd30b..d98ac621e 100644 --- a/ffmpeg/JNI/dav1d/src/x86/mc_ssse3.asm +++ b/ffmpeg/JNI/dav1d/src/x86/mc_sse.asm @@ -57,7 +57,17 @@ subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11 bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 +pb_8x0_8x8: times 8 db 0 + times 8 db 8 +resize_mul: dd 0, 1, 2, 3 +resize_shuf: times 5 db 0 + db 1, 2, 3, 4, 5, 6 + times 5+16 db 7 + pb_64: times 16 db 64 +pw_m256: times 8 dw -256 +pw_1: times 8 dw 1 +pw_2: times 8 dw 2 pw_8: times 8 dw 8 pw_26: times 8 dw 26 pw_34: times 8 dw 34 @@ -67,6 +77,7 @@ pw_2048: times 8 dw 2048 pw_6903: times 8 dw 6903 pw_8192: times 8 dw 8192 pd_32: times 4 dd 32 +pd_63: times 4 dd 63 pd_512: times 4 dd 512 pd_16384: times 4 dd 16484 pd_32768: times 4 dd 32768 @@ -108,6 +119,7 @@ BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16 %endrep %endmacro +%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_sse2.prep) %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put) %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep) @@ -146,6 +158,8 @@ BASE_JMP_TABLE prep, ssse3, 4, 8, 16, 32, 64, 128 %endif %endmacro +HV_JMP_TABLE prep, 8tap, sse2, 1, 4, 8, 16, 32, 64, 128 +HV_JMP_TABLE prep, bilin, sse2, 7, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128 HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128 @@ -729,15 +743,79 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak lea t0d, [hq+(7<<16)] jmp .hv_w16gt +%macro PSHUFB_0X1X 1-2 ; dst[, src] + %if cpuflag(ssse3) + pshufb %1, %2 + %else + punpcklbw %1, %1 + psraw %1, 8 + pshufd %1, %1, q0000 + %endif +%endmacro + +%macro PSHUFB_BILIN_H8 2 ; dst, src + %if cpuflag(ssse3) + pshufb %1, %2 + %else + mova %2, %1 + psrldq %1, 1 + punpcklbw %1, %2 + %endif +%endmacro + +%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp + %if 
cpuflag(ssse3) + pshufb %1, %2 + %else + mova %2, %1 + psrldq %1, 1 + punpckhbw %3, %1, %2 + punpcklbw %1, %2 + punpcklqdq %1, %3 + %endif +%endmacro + +%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero + %if cpuflag(ssse3) + pmaddubsw %1, %2 + %else + %if %5 == 1 + pxor %3, %3 + %endif + punpckhbw %4, %1, %3 + punpcklbw %1, %1, %3 + pmaddwd %4, %2 + pmaddwd %1, %2 + packssdw %1, %4 + %endif +%endmacro + +%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift + %if cpuflag(ssse3) + pmulhrsw %1, %2 + %else + punpckhwd %3, %1, %4 + punpcklwd %1, %4 + pmaddwd %3, %2 + pmaddwd %1, %2 + psrad %3, %5 + psrad %1, %5 + packssdw %1, %3 + %endif +%endmacro + +%macro PREP_BILIN 0 + DECLARE_REG_TMP 3, 5, 6 %if ARCH_X86_32 - %define base t2-prep_ssse3 + %define base t2-prep%+SUFFIX %else %define base 0 %endif + cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movifnidn mxyd, r5m ; mx - LEA t2, prep_ssse3 + LEA t2, prep%+SUFFIX tzcnt wd, wm movifnidn hd, hm test mxyd, mxyd @@ -746,6 +824,10 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 test mxyd, mxyd jnz .v .prep: +%if notcpuflag(ssse3) + add t2, prep_ssse3 - prep_sse2 + jmp prep_ssse3 +%else movzx wd, word [t2+wq*2+table_offset(prep,)] add wq, t2 lea stride3q, [strideq*3] @@ -815,10 +897,18 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 sub hd, 2 jg .prep_w16 RET -.prep_w16gt: +.prep_w32: + mov t2d, 1 + jmp .prep_w32_vloop +.prep_w64: + mov t2d, 2 + jmp .prep_w32_vloop +.prep_w128: + mov t2d, 4 +.prep_w32_vloop: mov t1q, srcq - mov r3q, t2q -.prep_w16gt_hloop: + mov r3d, t2d +.prep_w32_hloop: movq m0, [t1q+8*0] movq m1, [t1q+8*1] movq m2, [t1q+8*2] @@ -838,45 +928,49 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mova [tmpq+16*3], m3 add tmpq, 16*4 add t1q, 32 - sub r3q, 1 - jg .prep_w16gt_hloop + dec r3d + jg .prep_w32_hloop lea srcq, [srcq+strideq] - sub hd, 1 - jg .prep_w16gt + dec hd + jg .prep_w32_vloop RET -.prep_w32: - mov t2q, 1 - 
jmp .prep_w16gt -.prep_w64: - mov t2q, 2 - jmp .prep_w16gt -.prep_w128: - mov t2q, 4 - jmp .prep_w16gt +%endif .h: ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] imul mxyd, 0xff01 +%if cpuflag(ssse3) mova m4, [base+bilin_h_shuf8] +%endif add mxyd, 16 << 8 - movd xm5, mxyd + movd m5, mxyd mov mxyd, r6m ; my +%if cpuflag(ssse3) pshuflw m5, m5, q0000 punpcklqdq m5, m5 +%else + PSHUFB_0X1X m5 +%endif test mxyd, mxyd jnz .hv %if ARCH_X86_32 mov t1, t2 ; save base reg for w4 %endif movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)] +%if notcpuflag(ssse3) + WIN64_SPILL_XMM 8 + pxor m6, m6 +%endif add wq, t2 lea stride3q, [strideq*3] jmp wq .h_w4: -%if ARCH_X86_32 +%if cpuflag(ssse3) + %if ARCH_X86_32 mova m4, [t1-prep_ssse3+bilin_h_shuf4] -%else + %else mova m4, [bilin_h_shuf4] + %endif %endif .h_w4_loop: movq m0, [srcq+strideq*0] @@ -884,10 +978,10 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movq m1, [srcq+strideq*2] movhps m1, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - pshufb m0, m4 - pmaddubsw m0, m5 - pshufb m1, m4 - pmaddubsw m1, m5 + PSHUFB_BILIN_H4 m0, m4, m2 + PMADDUBSW m0, m5, m6, m2, 0 + PSHUFB_BILIN_H4 m1, m4, m2 + PMADDUBSW m1, m5, m6, m2, 0 mova [tmpq+0 ], m0 mova [tmpq+16], m1 add tmpq, 32 @@ -900,14 +994,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movu m2, [srcq+strideq*2] movu m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 @@ -922,14 +1016,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movu m2, [srcq+strideq*1+8*0] movu 
m3, [srcq+strideq*1+8*1] lea srcq, [srcq+strideq*2] - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 @@ -938,52 +1032,60 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 sub hd, 2 jg .h_w16 RET -.h_w16gt: +.h_w32: + mov t2d, 1 << 0 + jmp .h_w32_vloop +.h_w64: + mov t2d, 1 << 1 + jmp .h_w32_vloop +.h_w128: + mov t2d, 1 << 3 +.h_w32_vloop: mov t1q, srcq - mov r3q, t2q -.h_w16gt_hloop: + mov r3d, t2d +.h_w32_hloop: movu m0, [t1q+8*0] movu m1, [t1q+8*1] movu m2, [t1q+8*2] movu m3, [t1q+8*3] - pshufb m0, m4 - pshufb m1, m4 - pshufb m2, m4 - pshufb m3, m4 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - pmaddubsw m2, m5 - pmaddubsw m3, m5 + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m0, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 + PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 mova [tmpq+16*2], m2 mova [tmpq+16*3], m3 add tmpq, 16*4 add t1q, 32 - sub r3q, 1 - jg .h_w16gt_hloop + shr r3d, 1 + jnz .h_w32_hloop lea srcq, [srcq+strideq] sub hd, 1 - jg .h_w16gt + jg .h_w32_vloop RET -.h_w32: - mov t2q, 1 - jmp .h_w16gt -.h_w64: - mov t2q, 2 - jmp .h_w16gt -.h_w128: - mov t2q, 4 - jmp .h_w16gt .v: +%if notcpuflag(ssse3) + %assign stack_offset stack_offset - stack_size_padded + WIN64_SPILL_XMM 8 +%endif movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)] imul mxyd, 0xff01 add mxyd, 16 << 8 add wq, t2 lea stride3q, [strideq*3] movd m5, mxyd +%if cpuflag(ssse3) pshuflw m5, m5, q0000 punpcklqdq m5, m5 +%else + PSHUFB_0X1X m5 + pxor m6, m6 +%endif jmp wq .v_w4: movd m0, 
[srcq+strideq*0] @@ -995,14 +1097,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 punpcklwd m0, m1 ; 0 1 _ _ punpcklwd m1, m2 ; 1 2 _ _ punpcklbw m1, m0 - pmaddubsw m1, m5 + PMADDUBSW m1, m5, m6, m7, 0 pshufd m1, m1, q3120 mova [tmpq+16*0], m1 movd m0, [srcq+strideq*0] punpcklwd m2, m3 ; 2 3 _ _ punpcklwd m3, m0 ; 3 4 _ _ punpcklbw m3, m2 - pmaddubsw m3, m5 + PMADDUBSW m3, m5, m6, m7, 0 pshufd m3, m3, q3120 mova [tmpq+16*1], m3 add tmpq, 32 @@ -1016,20 +1118,20 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movq m2, [srcq+strideq*1] movq m3, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - shufpd m4, m0, m1, 0x0c ; 0 2 + shufpd m4, m0, m1, 0x0c ; 0 2 movq m0, [srcq+strideq*0] - shufpd m2, m3, 0x0c ; 1 3 - shufpd m1, m0, 0x0c ; 2 4 + shufpd m2, m3, 0x0c ; 1 3 + shufpd m1, m0, 0x0c ; 2 4 punpcklbw m3, m2, m4 - pmaddubsw m3, m5 + PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*0], m3 punpckhbw m3, m2, m4 - pmaddubsw m3, m5 + PMADDUBSW m3, m5, m6, m7, 0 mova [tmpq+16*2], m3 punpcklbw m3, m1, m2 punpckhbw m1, m2 - pmaddubsw m3, m5 - pmaddubsw m1, m5 + PMADDUBSW m3, m5, m6, m7, 0 + PMADDUBSW m1, m5, m6, m7, 0 mova [tmpq+16*1], m3 mova [tmpq+16*3], m1 add tmpq, 16*4 @@ -1043,14 +1145,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 movu m2, [srcq+strideq*2] punpcklbw m3, m1, m0 punpckhbw m4, m1, m0 - pmaddubsw m3, m5 - pmaddubsw m4, m5 + PMADDUBSW m3, m5, m6, m7, 0 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*0], m3 mova [tmpq+16*1], m4 punpcklbw m3, m2, m1 punpckhbw m4, m2, m1 - pmaddubsw m3, m5 - pmaddubsw m4, m5 + PMADDUBSW m3, m5, m6, m7, 0 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*2], m3 mova [tmpq+16*3], m4 movu m3, [srcq+stride3q ] @@ -1059,14 +1161,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 add tmpq, 16*8 punpcklbw m1, m3, m2 punpckhbw m4, m3, m2 - pmaddubsw m1, m5 - pmaddubsw m4, m5 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq-16*4], m1 mova 
[tmpq-16*3], m4 punpcklbw m1, m0, m3 punpckhbw m2, m0, m3 - pmaddubsw m1, m5 - pmaddubsw m2, m5 + PMADDUBSW m1, m5, m6, m7, 0 + PMADDUBSW m2, m5, m6, m7, 0 mova [tmpq-16*2], m1 mova [tmpq-16*1], m2 sub hd, 4 @@ -1075,6 +1177,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 .v_w32: lea t2d, [hq+(0<<16)] mov t0d, 64 + jmp .v_w32_start +.v_w64: + lea t2d, [hq+(1<<16)] + mov t0d, 128 + jmp .v_w32_start +.v_w128: + lea t2d, [hq+(3<<16)] + mov t0d, 256 .v_w32_start: %if ARCH_X86_64 %if WIN64 @@ -1083,43 +1193,43 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mov r7, tmpq %endif mov t1, srcq -.v_w32_loop_h: - movu m0, [srcq+strideq*0+16*0] ; 0L - movu m1, [srcq+strideq*0+16*1] ; 0U -.v_w32_loop_v: - movu m2, [srcq+strideq*1+16*0] ; 1L - movu m3, [srcq+strideq*1+16*1] ; 1U +.v_w32_hloop: + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] +.v_w32_vloop: + movu m2, [srcq+strideq*1+16*0] + movu m3, [srcq+strideq*1+16*1] lea srcq, [srcq+strideq*2] punpcklbw m4, m2, m0 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*0], m4 punpckhbw m4, m2, m0 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*1], m4 punpcklbw m4, m3, m1 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*2], m4 punpckhbw m4, m3, m1 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*3], m4 add tmpq, t0q - movu m0, [srcq+strideq*0+16*0] ; 2L - movu m1, [srcq+strideq*0+16*1] ; 2U + movu m0, [srcq+strideq*0+16*0] + movu m1, [srcq+strideq*0+16*1] punpcklbw m4, m0, m2 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*0], m4 punpckhbw m4, m0, m2 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*1], m4 punpcklbw m4, m1, m3 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*2], m4 punpckhbw m4, m1, m3 - pmaddubsw m4, m5 + PMADDUBSW m4, m5, m6, m7, 0 mova [tmpq+16*3], m4 add tmpq, t0q sub hd, 2 - jg .v_w32_loop_v + jg .v_w32_vloop movzx hd, t2w add t1, 32 mov srcq, t1 @@ 
-1132,62 +1242,78 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mov tmpmp, tmpq %endif sub t2d, 1<<16 - jg .v_w32_loop_h + jg .v_w32_hloop %if WIN64 POP r7 %endif RET -.v_w64: - lea t2d, [hq+(1<<16)] - mov t0d, 128 - jmp .v_w32_start -.v_w128: - lea t2d, [hq+(3<<16)] - mov t0d, 256 - jmp .v_w32_start .hv: ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) - %assign stack_offset stack_offset - stack_size_padded - WIN64_SPILL_XMM 8 +%assign stack_offset stack_offset - stack_size_padded +%if cpuflag(ssse3) + WIN64_SPILL_XMM 8 +%else + WIN64_SPILL_XMM 10 +%endif movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)] +%if cpuflag(ssse3) shl mxyd, 11 - movd xm6, mxyd +%else + %if ARCH_X86_64 + mova m8, [pw_8] + %else + %define m8 [pw_8] + %endif + pxor m7, m7 +%endif + movd m6, mxyd add wq, t2 pshuflw m6, m6, q0000 +%if cpuflag(ssse3) punpcklqdq m6, m6 +%else + %if ARCH_X86_64 + psrlw m0, m8, 3 + punpcklwd m6, m0 + %else + punpcklwd m6, [base+pw_1] + %endif +%endif %if ARCH_X86_32 mov t1, t2 ; save base reg for w4 %endif lea stride3q, [strideq*3] jmp wq .hv_w4: -%if ARCH_X86_32 +%if cpuflag(ssse3) + %if ARCH_X86_32 mova m4, [t1-prep_ssse3+bilin_h_shuf4] -%else + %else mova m4, [bilin_h_shuf4] + %endif %endif - movq m0, [srcq+strideq*0] ; 0 _ - punpcklqdq m0, m0 - pshufb m0, m4 - pmaddubsw m0, m5 + movhps m0, [srcq+strideq*0] + PSHUFB_BILIN_H4 m0, m4, m3 + PMADDUBSW m0, m5, m7, m4, 0 ; _ 0 .hv_w4_loop: movq m1, [srcq+strideq*1] - movhps m1, [srcq+strideq*2] ; 1 _ 2 _ + movhps m1, [srcq+strideq*2] movq m2, [srcq+stride3q ] lea srcq, [srcq+strideq*4] - movhps m2, [srcq+strideq*0] ; 3 _ 4 _ - pshufb m1, m4 - pshufb m2, m4 - pmaddubsw m1, m5 ; 1 + 2 + - shufpd m3, m0, m1, 0x01 ; 0 + 1 + - pmaddubsw m0, m2, m5 ; 3 + 4 + - shufpd m2, m1, m0, 0x01 ; 2 + 3 + + movhps m2, [srcq+strideq*0] + PSHUFB_BILIN_H4 m1, m4, m3 + PSHUFB_BILIN_H4 m2, m4, m3 + PMADDUBSW m1, m5, m7, m4, 0 ; 1 2 
+ shufpd m3, m0, m1, 0x01 ; 0 1 + mova m0, m2 + PMADDUBSW m0, m5, m7, m4, 0 ; 3 4 + shufpd m2, m1, m0, 0x01 ; 2 3 psubw m1, m3 - pmulhrsw m1, m6 + PMULHRSW m1, m6, m4, m8, 4 paddw m1, m3 psubw m3, m0, m2 - pmulhrsw m3, m6 + PMULHRSW m3, m6, m4, m8, 4 paddw m3, m2 mova [tmpq+16*0], m1 mova [tmpq+16*1], m3 @@ -1196,46 +1322,74 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 jg .hv_w4_loop RET .hv_w8: - movu m0, [srcq+strideq*0] - pshufb m0, m4 - pmaddubsw m0, m5 ; 0 + + movu m0, [srcq+strideq*0] + PSHUFB_BILIN_H8 m0, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 0 .hv_w8_loop: - movu m1, [srcq+strideq*1] ; 1 - movu m2, [srcq+strideq*2] ; 2 - pshufb m1, m4 - pshufb m2, m4 - pmaddubsw m1, m5 ; 1 + - pmaddubsw m2, m5 ; 2 + - psubw m3, m1, m0 ; 1-0 - pmulhrsw m3, m6 + movu m1, [srcq+strideq*1] + movu m2, [srcq+strideq*2] + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m2, m4 + PMADDUBSW m1, m5, m7, m4, 0 ; 1 + PMADDUBSW m2, m5, m7, m4, 0 ; 2 + psubw m3, m1, m0 + PMULHRSW m3, m6, m4, m8, 4 paddw m3, m0 - psubw m7, m2, m1 ; 2-1 - pmulhrsw m7, m6 +%if notcpuflag(ssse3) && ARCH_X86_64 + SWAP m9, m7 +%endif + psubw m7, m2, m1 + PMULHRSW m7, m6, m4, m8, 4 paddw m7, m1 mova [tmpq+16*0], m3 mova [tmpq+16*1], m7 - movu m1, [srcq+stride3q ] ; 3 - lea srcq, [srcq+strideq*4] - movu m0, [srcq+strideq*0] ; 4 - pshufb m1, m4 - pshufb m0, m4 - pmaddubsw m1, m5 ; 3 + - pmaddubsw m0, m5 ; 4 + - psubw m3, m1, m2 ; 3-2 - pmulhrsw m3, m6 +%if notcpuflag(ssse3) && ARCH_X86_64 + SWAP m7, m9 +%endif + movu m1, [srcq+stride3q ] + lea srcq, [srcq+strideq*4] + movu m0, [srcq+strideq*0] + PSHUFB_BILIN_H8 m1, m4 + PSHUFB_BILIN_H8 m0, m4 + PMADDUBSW m1, m5, m7, m4, ARCH_X86_32 ; 3 + PMADDUBSW m0, m5, m7, m4, 0 ; 4 + psubw m3, m1, m2 + PMULHRSW m3, m6, m4, m8, 4 paddw m3, m2 - psubw m7, m0, m1 ; 4-3 - pmulhrsw m7, m6 +%if notcpuflag(ssse3) && ARCH_X86_64 + SWAP m9, m7 +%endif + psubw m7, m0, m1 + PMULHRSW m7, m6, m4, m8, 4 paddw m7, m1 mova [tmpq+16*2], m3 mova [tmpq+16*3], m7 +%if 
notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m7, m9 + %else + pxor m7, m7 + %endif +%endif add tmpq, 16*4 sub hd, 4 jg .hv_w8_loop RET .hv_w16: - lea t2d, [hq+(0<<16)] + mov t2d, hd mov t0d, 32 + jmp .hv_w16_start +.hv_w32: + lea t2d, [hq+(1<<16)] + mov t0d, 64 + jmp .hv_w16_start +.hv_w64: + lea t2d, [hq+(3<<16)] + mov t0d, 128 + jmp .hv_w16_start +.hv_w128: + lea t2d, [hq+(7<<16)] + mov t0d, 256 .hv_w16_start: %if ARCH_X86_64 %if WIN64 @@ -1244,47 +1398,47 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mov r7, tmpq %endif mov t1, srcq -.hv_w16_loop_h: - movu m0, [srcq+strideq*0+8*0] ; 0L - movu m1, [srcq+strideq*0+8*1] ; 0U - pshufb m0, m4 - pshufb m1, m4 - pmaddubsw m0, m5 ; 0L + - pmaddubsw m1, m5 ; 0U + -.hv_w16_loop_v: - movu m2, [srcq+strideq*1+8*0] ; 1L - pshufb m2, m4 - pmaddubsw m2, m5 ; 1L + - psubw m3, m2, m0 ; 1L-0L - pmulhrsw m3, m6 +.hv_w16_hloop: + movu m0, [srcq+strideq*0+8*0] + movu m1, [srcq+strideq*0+8*1] + PSHUFB_BILIN_H8 m0, m4 + PSHUFB_BILIN_H8 m1, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 0a + PMADDUBSW m1, m5, m7, m4, 0 ; 0b +.hv_w16_vloop: + movu m2, [srcq+strideq*1+8*0] + PSHUFB_BILIN_H8 m2, m4 + PMADDUBSW m2, m5, m7, m4, 0 ; 1a + psubw m3, m2, m0 + PMULHRSW m3, m6, m4, m8, 4 paddw m3, m0 mova [tmpq+16*0], m3 - movu m3, [srcq+strideq*1+8*1] ; 1U - lea srcq, [srcq+strideq*2] - pshufb m3, m4 - pmaddubsw m3, m5 ; 1U + - psubw m0, m3, m1 ; 1U-0U - pmulhrsw m0, m6 + movu m3, [srcq+strideq*1+8*1] + lea srcq, [srcq+strideq*2] + PSHUFB_BILIN_H8 m3, m4 + PMADDUBSW m3, m5, m7, m4, 0 ; 1b + psubw m0, m3, m1 + PMULHRSW m0, m6, m4, m8, 4 paddw m0, m1 mova [tmpq+16*1], m0 add tmpq, t0q - movu m0, [srcq+strideq*0+8*0] ; 2L - pshufb m0, m4 - pmaddubsw m0, m5 ; 2L + - psubw m1, m0, m2 ; 2L-1L - pmulhrsw m1, m6 + movu m0, [srcq+strideq*0+8*0] + PSHUFB_BILIN_H8 m0, m4 + PMADDUBSW m0, m5, m7, m4, 0 ; 2a + psubw m1, m0, m2 + PMULHRSW m1, m6, m4, m8, 4 paddw m1, m2 mova [tmpq+16*0], m1 - movu m1, [srcq+strideq*0+8*1] ; 2U - pshufb m1, m4 - 
pmaddubsw m1, m5 ; 2U + - psubw m2, m1, m3 ; 2U-1U - pmulhrsw m2, m6 + movu m1, [srcq+strideq*0+8*1] + PSHUFB_BILIN_H8 m1, m4 + PMADDUBSW m1, m5, m7, m4, 0 ; 2b + psubw m2, m1, m3 + PMULHRSW m2, m6, m4, m8, 4 paddw m2, m3 mova [tmpq+16*1], m2 add tmpq, t0q sub hd, 2 - jg .hv_w16_loop_v + jg .hv_w16_vloop movzx hd, t2w add t1, 16 mov srcq, t1 @@ -1297,23 +1451,12 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3 mov tmpmp, tmpq %endif sub t2d, 1<<16 - jg .hv_w16_loop_h + jg .hv_w16_hloop %if WIN64 POP r7 %endif RET -.hv_w32: - lea t2d, [hq+(1<<16)] - mov t0d, 64 - jmp .hv_w16_start -.hv_w64: - lea t2d, [hq+(3<<16)] - mov t0d, 128 - jmp .hv_w16_start -.hv_w128: - lea t2d, [hq+(7<<16)] - mov t0d, 256 - jmp .hv_w16_start +%endmacro ; int8_t subpel_filters[5][15][8] %assign FILTER_REGULAR (0*15 << 16) | 3*15 @@ -2430,58 +2573,250 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3 jg .hv_w8_loop0 RET -%if ARCH_X86_32 -DECLARE_REG_TMP 1, 2 -%elif WIN64 -DECLARE_REG_TMP 6, 4 -%else -DECLARE_REG_TMP 6, 7 -%endif -%macro PREP_8TAP_FN 3 ; type, type_h, type_v -cglobal prep_8tap_%1 - mov t0d, FILTER_%2 - mov t1d, FILTER_%3 -%ifnidn %1, sharp_smooth ; skip the jump in the last filter - jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX) -%endif +%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask + %if cpuflag(ssse3) + pshufb %1, %2 + %else + %if %5 == 1 + pcmpeqd %2, %2 + psrlq %2, 32 + %endif + psrldq %3, %1, 1 + pshufd %3, %3, q2301 + pand %1, %2 + pandn %4, %2, %3 + por %1, %4 + %endif %endmacro -PREP_8TAP_FN regular, REGULAR, REGULAR -PREP_8TAP_FN regular_sharp, REGULAR, SHARP -PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH -PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR -PREP_8TAP_FN smooth, SMOOTH, SMOOTH -PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP -PREP_8TAP_FN sharp_regular, SHARP, REGULAR -PREP_8TAP_FN sharp, SHARP, SHARP -PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH +%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, 
tmp2, reset_mask + %ifnidn %1, %2 + mova %1, %2 + %endif + PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 +%endmacro -%if ARCH_X86_32 - %define base_reg r2 - %define base base_reg-prep_ssse3 - %define W32_RESTORE_SSQ mov strideq, stridem -%else - %define base_reg r7 - %define base 0 - %define W32_RESTORE_SSQ -%endif +%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask + %if notcpuflag(ssse3) + psrlq %1, %2, 16 + %elifnidn %1, %2 + mova %1, %2 + %endif + PSHUFB_SUBPEL_H_4 %1, %3, %4, %5, %6 +%endmacro -cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 -%assign org_stack_offset stack_offset - imul mxd, mxm, 0x010101 +%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp] + %if cpuflag(ssse3) + palignr %1, %2, %3, %4 + %else + %if %0 == 4 + %assign %%i regnumof%+%1 + 1 + %define %%tmp m %+ %%i + %else + %define %%tmp %5 + %endif + psrldq %1, %3, %4 + pslldq %%tmp, %2, 16-%4 + por %1, %%tmp + %endif +%endmacro + +%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1 + %if cpuflag(ssse3) + phaddw %1, %2 + %else + %ifnidn %1, %2 + %if %4 == 1 + mova %3, [pw_1] + %endif + pmaddwd %1, %3 + pmaddwd %2, %3 + packssdw %1, %2 + %else + %if %4 == 1 + pmaddwd %1, [pw_1] + %else + pmaddwd %1, %3 + %endif + packssdw %1, %1 + %endif + %endif +%endmacro + +%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift + %if cpuflag(ssse3) + pmulhrsw %1, %2, %3 + %else + paddw %1, %2, %3 + psraw %1, %4 + %endif +%endmacro + +%macro PMULHRSW_8192 3 ; dst, src1, src2 + PMULHRSW_POW2 %1, %2, %3, 2 +%endmacro + +%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-2] + movd %1, [%2+0] + movd %3, [%2+1] + movd %4, [%2+2] + movd %5, [%2+3] + punpckldq %1, %3 + punpckldq %4, %5 + punpcklqdq %1, %4 +%endmacro + +%macro PREP_8TAP_H_LOAD 2 ; dst0, src_memloc + %if cpuflag(ssse3) + movu m%1, [%2] + pshufb m2, m%1, m11 ; subpel_h_shufB + pshufb m3, m%1, m9 ; subpel_h_shufC + pshufb m%1, m10 ; subpel_h_shufA + %else + %if ARCH_X86_64 + SWAP m12, m5 + SWAP m13, m6 + SWAP m14, m7 + %define 
%%mx0 m%+%%i + %define %%mx1 m%+%%j + %assign %%i 0 + %rep 12 + movd %%mx0, [%2+%%i] + %assign %%i %%i+1 + %endrep + %assign %%i 0 + %rep 6 + %assign %%j %%i+1 + punpckldq %%mx0, %%mx1 + %assign %%i %%i+2 + %endrep + %assign %%i 0 + %rep 3 + %assign %%j %%i+2 + punpcklqdq %%mx0, %%mx1 + %assign %%i %%i+4 + %endrep + SWAP m%1, m0 + SWAP m2, m4 + SWAP m3, m8 + SWAP m5, m12 + SWAP m6, m13 + SWAP m7, m14 + %else + PREP_8TAP_H_LOAD4 m0, %2+0, m1, m4, m7 + PREP_8TAP_H_LOAD4 m2, %2+4, m1, m4, m7 + PREP_8TAP_H_LOAD4 m3, %2+8, m1, m4, m7 + SWAP m%1, m0 + %endif + %endif +%endmacro + +%macro PREP_8TAP_H 2 ; dst, src_memloc + PREP_8TAP_H_LOAD %1, %2 + %if ARCH_X86_64 && notcpuflag(ssse3) + SWAP m8, m1 + SWAP m9, m7 + %endif + %xdefine mX m%+%1 + %assign %%i regnumof%+mX + %define mX m%+%%i + mova m4, m2 + PMADDUBSW m4, m5, m1, m7, 1 ; subpel +0 B0 + PMADDUBSW m2, m6, m1, m7, 0 ; subpel +4 B4 + PMADDUBSW m3, m6, m1, m7, 0 ; subpel +4 C4 + PMADDUBSW mX, m5, m1, m7, 0 ; subpel +0 A0 + %undef mX + %if ARCH_X86_64 && notcpuflag(ssse3) + SWAP m1, m8 + SWAP m7, m9 + %endif + paddw m3, m4 + paddw m%1, m2 + PHADDW m%1, m3, m15, ARCH_X86_32 + %if ARCH_X86_64 || cpuflag(ssse3) + PMULHRSW_8192 m%1, m%1, m7 + %else + PMULHRSW_8192 m%1, m%1, [base+pw_2] + %endif +%endmacro + +%macro PREP_8TAP_HV_LOAD 4 ; dst0, src_memloc, tmp[1-2] + %if cpuflag(ssse3) + movu %1, [%2] + pshufb m2, %1, shufB + pshufb m3, %1, shufC + pshufb %1, shufA + %else + PREP_8TAP_H_LOAD4 %1, %2+0, m1, %3, %4 + PREP_8TAP_H_LOAD4 m2, %2+4, m1, %3, %4 + PREP_8TAP_H_LOAD4 m3, %2+8, m1, %3, %4 + %endif +%endmacro + +%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2] + PREP_8TAP_HV_LOAD %{1:4} + mova m1, m2 + PMADDUBSW m1, subpelh0, %3, %4, 1 ; subpel +0 C0 + PMADDUBSW m3, subpelh1, %3, %4, 0 ; subpel +4 B4 + PMADDUBSW m2, subpelh1, %3, %4, 0 ; C4 + PMADDUBSW %1, subpelh0, %3, %4, 0 ; A0 + paddw m1, m3 ; C0+B4 + paddw %1, m2 ; A0+C4 + PHADDW %1, m1, %3, 1 +%endmacro + +%macro PREP_8TAP_FN 3 ; type, type_h, type_v +cglobal 
prep_8tap_%1 + mov t0d, FILTER_%2 + mov t1d, FILTER_%3 +%ifnidn %1, sharp_smooth ; skip the jump in the last filter + jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX) +%endif +%endmacro + +%macro PREP_8TAP 0 +%if ARCH_X86_32 + DECLARE_REG_TMP 1, 2 +%elif WIN64 + DECLARE_REG_TMP 6, 4 +%else + DECLARE_REG_TMP 6, 7 +%endif +PREP_8TAP_FN regular, REGULAR, REGULAR +PREP_8TAP_FN regular_sharp, REGULAR, SHARP +PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH +PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR +PREP_8TAP_FN smooth, SMOOTH, SMOOTH +PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP +PREP_8TAP_FN sharp_regular, SHARP, REGULAR +PREP_8TAP_FN sharp, SHARP, SHARP +PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH + +%if ARCH_X86_32 + %define base_reg r2 + %define base base_reg-prep%+SUFFIX + %define W32_RESTORE_SSQ mov strideq, stridem +%else + %define base_reg r7 + %define base 0 + %define W32_RESTORE_SSQ +%endif +cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 +%assign org_stack_offset stack_offset + imul mxd, mxm, 0x010101 add mxd, t0d ; 8tap_h, mx, 4tap_h imul myd, mym, 0x010101 add myd, t1d ; 8tap_v, my, 4tap_v movsxd wq, wm movifnidn srcd, srcm movifnidn hd, hm - LEA base_reg, prep_ssse3 test mxd, 0xf00 jnz .h test myd, 0xf00 jnz .v + LEA base_reg, prep_ssse3 tzcnt wd, wd - movzx wd, word [base_reg+wq*2+table_offset(prep,)] + movzx wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2] add wq, base_reg movifnidn strided, stridem lea r6, [strideq*3] @@ -2492,25 +2827,49 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif jmp wq .h: + LEA base_reg, prep%+SUFFIX test myd, 0xf00 jnz .hv +%if cpuflag(ssse3) WIN64_SPILL_XMM 12 +%else + WIN64_SPILL_XMM 16 +%endif cmp wd, 4 je .h_w4 tzcnt wd, wd -%if ARCH_X86_64 +%if cpuflag(ssse3) + %if ARCH_X86_64 mova m10, [base+subpel_h_shufA] mova m11, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] + %else + %define m10 [base+subpel_h_shufA] + %define m11 [base+subpel_h_shufB] + %define m9 
[base+subpel_h_shufC] + %endif %endif shr mxd, 16 sub srcq, 3 movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)] - movd m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0] + movd m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0] pshufd m5, m5, q0000 - movd m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4] + movd m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4] pshufd m6, m6, q0000 +%if cpuflag(ssse3) mova m7, [base+pw_8192] +%else + punpcklbw m5, m5 + punpcklbw m6, m6 + psraw m5, 8 + psraw m6, 8 + %if ARCH_X86_64 + mova m7, [pw_2] + mova m15, [pw_1] + %else + %define m15 m4 + %endif +%endif add wq, base_reg jmp wq .h_w4: @@ -2520,39 +2879,115 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 movzx mxd, mxb %endif dec srcq - movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] + movd m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] pshufd m4, m4, q0000 +%if cpuflag(ssse3) mova m6, [base+pw_8192] mova m5, [base+subpel_h_shufA] +%else + mova m6, [base+pw_2] + %if ARCH_X86_64 + mova m14, [pw_1] + %else + %define m14 m7 + %endif + punpcklbw m4, m4 + psraw m4, 8 +%endif W32_RESTORE_SSQ %if ARCH_X86_64 lea stride3q, [strideq*3] %endif .h_w4_loop: +%if cpuflag(ssse3) movq m0, [srcq+strideq*0] ; 0 movq m1, [srcq+strideq*1] ; 1 -%if ARCH_X86_32 + %if ARCH_X86_32 lea srcq, [srcq+strideq*2] movq m2, [srcq+strideq*0] ; 2 movq m3, [srcq+strideq*1] ; 3 lea srcq, [srcq+strideq*2] -%else + %else movq m2, [srcq+strideq*2] ; 2 movq m3, [srcq+stride3q ] ; 3 lea srcq, [srcq+strideq*4] -%endif - pshufb m0, m5 ; subpel_h_shufA + %endif + pshufb m0, m5 pshufb m1, m5 pshufb m2, m5 pshufb m3, m5 - pmaddubsw m0, m4 ; subpel_filters + 2 - pmaddubsw m1, m4 - pmaddubsw m2, m4 - pmaddubsw m3, m4 - phaddw m0, m1 - phaddw m2, m3 - pmulhrsw m0, m6 ; pw_8192 - pmulhrsw m2, m6 ; pw_8192 +%else + %if ARCH_X86_64 + movd m0, [srcq+strideq*0+0] + movd m12, [srcq+strideq*0+1] + movd m1, [srcq+strideq*1+0] + movd m5, [srcq+strideq*1+1] + movd m2, [srcq+strideq*2+0] + movd 
m13, [srcq+strideq*2+1] + movd m3, [srcq+stride3q +0] + movd m7, [srcq+stride3q +1] + punpckldq m0, m12 + punpckldq m1, m5 + punpckldq m2, m13 + punpckldq m3, m7 + movd m12, [srcq+strideq*0+2] + movd m8, [srcq+strideq*0+3] + movd m5, [srcq+strideq*1+2] + movd m9, [srcq+strideq*1+3] + movd m13, [srcq+strideq*2+2] + movd m10, [srcq+strideq*2+3] + movd m7, [srcq+stride3q +2] + movd m11, [srcq+stride3q +3] + lea srcq, [srcq+strideq*4] + punpckldq m12, m8 + punpckldq m5, m9 + punpckldq m13, m10 + punpckldq m7, m11 + punpcklqdq m0, m12 ; 0 + punpcklqdq m1, m5 ; 1 + punpcklqdq m2, m13 ; 2 + punpcklqdq m3, m7 ; 3 + %else + movd m0, [srcq+strideq*0+0] + movd m1, [srcq+strideq*0+1] + movd m2, [srcq+strideq*0+2] + movd m3, [srcq+strideq*0+3] + punpckldq m0, m1 + punpckldq m2, m3 + punpcklqdq m0, m2 ; 0 + movd m1, [srcq+strideq*1+0] + movd m2, [srcq+strideq*1+1] + movd m3, [srcq+strideq*1+2] + movd m7, [srcq+strideq*1+3] + lea srcq, [srcq+strideq*2] + punpckldq m1, m2 + punpckldq m3, m7 + punpcklqdq m1, m3 ; 1 + movd m2, [srcq+strideq*0+0] + movd m3, [srcq+strideq*0+1] + movd m7, [srcq+strideq*0+2] + movd m5, [srcq+strideq*0+3] + punpckldq m2, m3 + punpckldq m7, m5 + punpcklqdq m2, m7 ; 2 + movd m3, [srcq+strideq*1+0] + movd m7, [srcq+strideq*1+1] + punpckldq m3, m7 + movd m7, [srcq+strideq*1+2] + movd m5, [srcq+strideq*1+3] + lea srcq, [srcq+strideq*2] + punpckldq m7, m5 + punpcklqdq m3, m7 ; 3 + %endif +%endif + PMADDUBSW m0, m4, m5, m7, 1 ; subpel_filters + 2 + PMADDUBSW m1, m4, m5, m7, 0 + PMADDUBSW m2, m4, m5, m7, 0 + PMADDUBSW m3, m4, m5, m7, 0 + PHADDW m0, m1, m14, ARCH_X86_32 + PHADDW m2, m3, m14, 0 + PMULHRSW_8192 m0, m0, m6 + PMULHRSW_8192 m2, m2, m6 mova [tmpq+16*0], m0 mova [tmpq+16*1], m2 add tmpq, 32 @@ -2560,55 +2995,41 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 jg .h_w4_loop RET ; -%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3] -%if ARCH_X86_32 - pshufb %2, %1, [base+subpel_h_shufB] - pshufb %3, %1, [base+subpel_h_shufC] - pshufb %1, 
[base+subpel_h_shufA] -%else - pshufb %2, %1, m11; subpel_h_shufB - pshufb %3, %1, m9 ; subpel_h_shufC - pshufb %1, m10 ; subpel_h_shufA -%endif - pmaddubsw %4, %2, m5 ; subpel +0 B0 - pmaddubsw %2, m6 ; subpel +4 B4 - pmaddubsw %3, m6 ; subpel +4 C4 - pmaddubsw %1, m5 ; subpel +0 A0 - paddw %3, %4 - paddw %1, %2 - phaddw %1, %3 - pmulhrsw %1, m7 ; 8192 -%endmacro - ; .h_w8: %if ARCH_X86_32 mov r3, r2 - %define base_reg r3 + %define base_reg r3 W32_RESTORE_SSQ %endif .h_w8_loop: - movu m0, [srcq+strideq*0] - movu m1, [srcq+strideq*1] - lea srcq, [srcq+strideq*2] - PREP_8TAP_H m0, m2, m3, m4 - PREP_8TAP_H m1, m2, m3, m4 +%if cpuflag(ssse3) + PREP_8TAP_H 0, srcq+strideq*0 + PREP_8TAP_H 1, srcq+strideq*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 + lea srcq, [srcq+strideq*2] add tmpq, 32 sub hd, 2 +%else + PREP_8TAP_H 0, srcq + mova [tmpq], m0 + add srcq, strideq + add tmpq, 16 + dec hd +%endif jg .h_w8_loop RET .h_w16: - xor r6d, r6d + mov r6, -16*1 jmp .h_start .h_w32: - mov r6, -16*1 + mov r6, -16*2 jmp .h_start .h_w64: - mov r6, -16*3 + mov r6, -16*4 jmp .h_start .h_w128: - mov r6, -16*7 + mov r6, -16*8 .h_start: %if ARCH_X86_32 mov r3, r2 @@ -2618,15 +3039,20 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mov r5, r6 W32_RESTORE_SSQ .h_loop: - movu m0, [srcq+r6+8*0] - movu m1, [srcq+r6+8*1] - PREP_8TAP_H m0, m2, m3, m4 - PREP_8TAP_H m1, m2, m3, m4 +%if cpuflag(ssse3) + PREP_8TAP_H 0, srcq+r6+8*0 + PREP_8TAP_H 1, srcq+r6+8*1 mova [tmpq+16*0], m0 mova [tmpq+16*1], m1 add tmpq, 32 add r6, 16 - jle .h_loop +%else + PREP_8TAP_H 0, srcq+r6 + mova [tmpq], m0 + add tmpq, 16 + add r6, 8 +%endif + jl .h_loop add srcq, strideq mov r6, r5 dec hd @@ -2635,8 +3061,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %if ARCH_X86_32 %define base_reg r2 %endif - + ; .v: + LEA base_reg, prep%+SUFFIX %if ARCH_X86_32 mov mxd, myd and mxd, 0x7f @@ -2648,30 +3075,40 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shr 
myd, 16 cmp hd, 6 cmovs myd, mxd - lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3] + lea myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] +%if cpuflag(ssse3) mova m2, [base+pw_512] psrlw m2, m2, 1 ; 0x0100 mova m7, [base+pw_8192] +%endif %if ARCH_X86_32 %define subpel0 [rsp+mmsize*0] %define subpel1 [rsp+mmsize*1] %define subpel2 [rsp+mmsize*2] %define subpel3 [rsp+mmsize*3] %assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed + %if cpuflag(ssse3) ALLOC_STACK -mmsize*4 + %else + ALLOC_STACK -mmsize*5 + %endif %assign regs_used 7 movd m0, [myq+0] - pshufb m0, m2 + PSHUFB_0X1X m0, m2 mova subpel0, m0 movd m0, [myq+2] - pshufb m0, m2 + PSHUFB_0X1X m0, m2 mova subpel1, m0 movd m0, [myq+4] - pshufb m0, m2 + PSHUFB_0X1X m0, m2 mova subpel2, m0 movd m0, [myq+6] - pshufb m0, m2 + PSHUFB_0X1X m0, m2 mova subpel3, m0 + %if notcpuflag(ssse3) + mov r6, base_reg + %define base_reg r6 + %endif mov strideq, [rstk+stack_offset+gprsize*3] lea strideq, [strideq*3] sub [rstk+stack_offset+gprsize*2], strideq @@ -2683,25 +3120,30 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpel2 m10 %define subpel3 m11 movd subpel0, [myq+0] - pshufb subpel0, m2 + PSHUFB_0X1X subpel0, m2 movd subpel1, [myq+2] - pshufb subpel1, m2 + PSHUFB_0X1X subpel1, m2 movd subpel2, [myq+4] - pshufb subpel2, m2 + PSHUFB_0X1X subpel2, m2 movd subpel3, [myq+6] - pshufb subpel3, m2 + PSHUFB_0X1X subpel3, m2 lea stride3q, [strideq*3] sub srcq, stride3q cmp wd, 8 - jg .v_w16 - je .v_w8 + jns .v_w8 %endif .v_w4: -%if ARCH_X86_32 -%if STACK_ALIGNMENT < mmsize - %define srcm [rsp+mmsize*4+gprsize*1] - %define tmpm [rsp+mmsize*4+gprsize*2] +%if notcpuflag(ssse3) + pxor m6, m6 + %if ARCH_X86_64 + mova m7, [base+pw_2] + %endif %endif +%if ARCH_X86_32 + %if STACK_ALIGNMENT < mmsize + %define srcm [esp+stack_size+gprsize*1] + %define tmpm [esp+stack_size+gprsize*2] + %endif mov tmpm, tmpq mov srcm, srcq lea r5d, [wq - 4] ; horizontal loop @@ -2734,17 +3176,30 @@ cglobal 
prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endif punpckldq m3, m1 ; 4 5 _ _ punpckldq m1, m0 ; 5 6 _ _ - palignr m4, m3, m2, 4 ; 1 2 3 4 + PALIGNR m4, m3, m2, 4 ; 1 2 3 4 punpcklbw m3, m1 ; 45 56 punpcklbw m1, m2, m4 ; 01 12 punpckhbw m2, m4 ; 23 34 .v_w4_loop: - pmaddubsw m5, m1, subpel0 ; a0 b0 +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel0 + %define subpel0 m7 +%endif + mova m5, m1 + PMADDUBSW m5, subpel0, m6, m4, 0 ; a0 b0 +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel1 + %define subpel1 m7 +%endif mova m1, m2 - pmaddubsw m2, subpel1 ; a1 b1 + PMADDUBSW m2, subpel1, m6, m4, 0 ; a1 b1 paddw m5, m2 +%if ARCH_X86_32 && notcpuflag(ssse3) + mova m7, subpel2 + %define subpel2 m7 +%endif mova m2, m3 - pmaddubsw m3, subpel2 ; a2 b2 + PMADDUBSW m3, subpel2, m6, m4, 0 ; a2 b2 paddw m5, m3 movd m4, [srcq+strideq*0] punpckldq m3, m0, m4 ; 6 7 _ _ @@ -2752,9 +3207,27 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 lea srcq, [srcq+strideq*2] punpckldq m4, m0 ; 7 8 _ _ punpcklbw m3, m4 ; 67 78 - pmaddubsw m4, m3, subpel3 ; a3 b3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m12, m0 + %else + mova [esp+mmsize*4], m0 + mova m7, subpel3 + %define subpel3 m7 + %endif +%endif + mova m4, m3 + PMADDUBSW m4, subpel3, m6, m0, 0 ; a3 b3 paddw m5, m4 - pmulhrsw m5, m7 +%if ARCH_X86_64 || cpuflag(ssse3) + %if notcpuflag(ssse3) + SWAP m0, m12 + %endif + PMULHRSW_8192 m5, m5, m7 +%else + mova m0, [esp+mmsize*4] + PMULHRSW_8192 m5, m5, [base+pw_2] +%endif movq [tmpq+wq*0], m5 movhps [tmpq+wq*2], m5 lea tmpq, [tmpq+wq*4] @@ -2772,26 +3245,28 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 jg .v_w4_loop0 %endif RET - +%if ARCH_X86_32 && notcpuflag(ssse3) + %define base_reg r2 +%endif + ; %if ARCH_X86_64 .v_w8: -.v_w16: lea r5d, [wq - 8] ; horizontal loop mov r8, tmpq mov r6, srcq shl r5d, 8 - 3; (wq / 8) << 8 mov r5b, hb .v_w8_loop0: - movq m4, [srcq+strideq*0] ; 0 - movq m5, [srcq+strideq*1] ; 1 + movq m4, 
[srcq+strideq*0] + movq m5, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - movq m6, [srcq+strideq*0] ; 2 - movq m0, [srcq+strideq*1] ; 3 + movq m6, [srcq+strideq*0] + movq m0, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movq m1, [srcq+strideq*0] + movq m2, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - movq m1, [srcq+strideq*0] ; 4 - movq m2, [srcq+strideq*1] ; 5 - lea srcq, [srcq+strideq*2] ; - movq m3, [srcq+strideq*0] ; 6 + movq m3, [srcq+strideq*0] shufpd m4, m0, 0x0c shufpd m5, m1, 0x0c punpcklbw m1, m4, m5 ; 01 @@ -2803,9 +3278,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 punpcklbw m3, m6, m0 ; 23 punpckhbw m6, m0 ; 56 .v_w8_loop: - movq m12, [srcq+strideq*1] ; 8 +%if cpuflag(ssse3) + movq m12, [srcq+strideq*1] lea srcq, [srcq+strideq*2] - movq m13, [srcq+strideq*0] ; 9 + movq m13, [srcq+strideq*0] pmaddubsw m14, m1, subpel0 ; a0 pmaddubsw m15, m2, subpel0 ; b0 mova m1, m3 @@ -2830,8 +3306,43 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 paddw m15, m13 pmulhrsw m14, m7 pmulhrsw m15, m7 - movu [tmpq+wq*0], xm14 - movu [tmpq+wq*2], xm15 + movu [tmpq+wq*0], m14 + movu [tmpq+wq*2], m15 +%else + mova m14, m1 + PMADDUBSW m14, subpel0, m7, m12, 1 ; a0 + mova m1, m3 + PMADDUBSW m3, subpel1, m7, m12, 0 ; a1 + paddw m14, m3 + mova m3, m5 + PMADDUBSW m5, subpel2, m7, m12, 0 ; a2 + paddw m14, m5 + movq m12, [srcq+strideq*1] + lea srcq, [srcq+strideq*2] + movq m13, [srcq+strideq*0] + shufpd m15, m0, m12, 0x0d + shufpd m0, m12, m13, 0x0c + punpcklbw m5, m15, m0 ; 67 + punpckhbw m15, m0 ; 78 + mova m13, m5 + PMADDUBSW m13, subpel3, m7, m12, 0 ; a3 + paddw m14, m13 + PMULHRSW_8192 m14, m14, [base+pw_2] + movu [tmpq+wq*0], m14 + mova m14, m2 + PMADDUBSW m14, subpel0, m7, m12, 0 ; b0 + mova m2, m4 + PMADDUBSW m4, subpel1, m7, m12, 0 ; b1 + paddw m14, m4 + mova m4, m6 + PMADDUBSW m6, subpel2, m7, m12, 0 ; b2 + paddw m14, m6 + mova m6, m15 + PMADDUBSW m15, subpel3, m7, m12, 0 ; b3 + paddw m14, m15 + PMULHRSW_8192 m14, m14, 
[base+pw_2] + movu [tmpq+wq*2], m14 +%endif lea tmpq, [tmpq+wq*4] sub hd, 2 jg .v_w8_loop @@ -2848,20 +3359,20 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpel1 %undef subpel2 %undef subpel3 - + ; .hv: %assign stack_offset org_stack_offset cmp wd, 4 jg .hv_w8 and mxd, 0x7f - movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] + movd m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2] %if ARCH_X86_32 mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd - movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] mov r5, r2; use as new base %define base_reg r5 %assign regs_used 2 @@ -2877,7 +3388,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv2 [rsp+mmsize*2] %define subpelv3 [rsp+mmsize*3] punpcklbw m0, m0 - psraw m0, 8 ; sign-extend + psraw m0, 8 pshufd m6, m0, q0000 mova subpelv0, m6 pshufd m6, m0, q1111 @@ -2891,8 +3402,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shr myd, 16 cmp hd, 6 cmovs myd, mxd - movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3] + movq m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] + %if cpuflag(ssse3) ALLOC_STACK mmsize*14, 14 + %else + ALLOC_STACK mmsize*14, 16 + %endif lea stride3q, [strideq*3] sub srcq, stride3q dec srcq @@ -2901,8 +3416,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv2 m12 %define subpelv3 m13 punpcklbw m0, m0 - psraw m0, 8 ; sign-extend + psraw m0, 8 + %if cpuflag(ssse3) mova m8, [base+pw_8192] + %else + mova m8, [base+pw_2] + %endif mova m9, [base+pd_32] pshufd m10, m0, q0000 pshufd m11, m0, q1111 @@ -2910,7 +3429,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 pshufd m13, m0, q3333 %endif pshufd m7, m1, q0000 -.hv_w4: +%if notcpuflag(ssse3) + punpcklbw m7, m7 + psraw m7, 8 +%endif %define hv4_line_0_0 4 %define hv4_line_0_1 5 %define hv4_line_0_2 6 @@ -2921,17 +3443,27 @@ cglobal 
prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define hv4_line_1_1 11 %define hv4_line_1_2 12 %define hv4_line_1_3 13 - ; - ; %if ARCH_X86_32 - %define w8192reg [base+pw_8192] + %if cpuflag(ssse3) + %define w8192reg [base+pw_8192] + %else + %define w8192reg [base+pw_2] + %endif %define d32reg [base+pd_32] %else %define w8192reg m8 %define d32reg m9 %endif ; lower shuffle 0 1 2 3 4 +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] +%else + %if ARCH_X86_64 + mova m15, [pw_1] + %else + %define m15 m1 + %endif +%endif movq m5, [srcq+strideq*0] ; 0 _ _ _ movhps m5, [srcq+strideq*1] ; 0 _ 1 _ movq m4, [srcq+strideq*2] ; 2 _ _ _ @@ -2944,43 +3476,61 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 movhps m4, [srcq+stride3q ] ; 2 _ 3 _ lea srcq, [srcq+strideq*4] %endif - pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ - pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ - pmaddubsw m2, m7 ;H subpel_filters - pmaddubsw m0, m7 ;H subpel_filters - phaddw m2, m0 ;H 0 1 2 3 - pmulhrsw m2, w8192reg ;H pw_8192 + PSHUFB_SUBPEL_H_4a m2, m5, m6, m1, m3, 1 ;H subpel_h_shuf4 0~1~ + PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ + PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters + PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 + PMULHRSW_8192 m2, m2, w8192reg SAVELINE_W4 m2, 2, 0 ; upper shuffle 2 3 4 5 6 +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] - pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~ - pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~ - pmaddubsw m2, m7 ;H subpel_filters - pmaddubsw m0, m7 ;H subpel_filters - phaddw m2, m0 ;H 0 1 2 3 - pmulhrsw m2, w8192reg ;H pw_8192 - ; +%endif + PSHUFB_SUBPEL_H_4b m2, m5, m6, m1, m3, 0 ;H subpel_h_shuf4 0~1~ + PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m3, 0 ;H subpel_h_shuf4 2~3~ + PMADDUBSW m2, m7, m1, m3, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m3, 0 ;H subpel_filters + PHADDW m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3 + PMULHRSW_8192 m2, m2, w8192reg +%if 
notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m2 + %else + mova [esp+mmsize*4], m2 + %endif +%endif ; lower shuffle +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] +%endif movq m5, [srcq+strideq*0] ; 4 _ _ _ movhps m5, [srcq+strideq*1] ; 4 _ 5 _ movq m4, [srcq+strideq*2] ; 6 _ _ _ - pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ - pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ - pmaddubsw m3, m7 ;H subpel_filters - pmaddubsw m0, m7 ;H subpel_filters - phaddw m3, m0 ;H 4 5 6 7 - pmulhrsw m3, w8192reg ;H pw_8192 + PSHUFB_SUBPEL_H_4a m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ + PSHUFB_SUBPEL_H_4a m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ + PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters + PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 + PMULHRSW_8192 m3, m3, w8192reg SAVELINE_W4 m3, 3, 0 ; upper shuffle +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] - pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~ - pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~ - pmaddubsw m3, m7 ;H subpel_filters - pmaddubsw m0, m7 ;H subpel_filters - phaddw m3, m0 ;H 4 5 6 7 - pmulhrsw m3, w8192reg ;H pw_8192 - ; +%endif + PSHUFB_SUBPEL_H_4b m3, m5, m6, m1, m2, 0 ;H subpel_h_shuf4 4~5~ + PSHUFB_SUBPEL_H_4b m0, m4, m6, m1, m2, 0 ;H subpel_h_shuf4 6~6~ + PMADDUBSW m3, m7, m1, m2, 1 ;H subpel_filters + PMADDUBSW m0, m7, m1, m2, 0 ;H subpel_filters + PHADDW m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7 + PMULHRSW_8192 m3, m3, w8192reg +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m2, m14 + %else + mova m2, [esp+mmsize*4] + %endif +%endif %if ARCH_X86_32 lea srcq, [srcq+strideq*2] add srcq, strideq @@ -2988,7 +3538,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 add srcq, stride3q %endif ;process high - palignr m4, m3, m2, 4;V 1 2 3 4 + PALIGNR m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 @@ -3000,7 +3550,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, 
stride3 ;process low RESTORELINE_W4 m2, 2, 0 RESTORELINE_W4 m3, 3, 0 - palignr m4, m3, m2, 4;V 1 2 3 4 + PALIGNR m4, m3, m2, 4;V 1 2 3 4 punpcklwd m1, m2, m4 ; V 01 12 punpckhwd m2, m4 ; V 23 34 pshufd m0, m3, q2121;V 5 6 5 6 @@ -3014,18 +3564,35 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m5 + %else + mova [esp+mmsize*4], m5 + %define m15 m3 + %endif +%endif ; +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4] +%endif movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ - pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ - pmaddubsw m4, m7 ;H subpel_filters - phaddw m4, m4 ;H 7 8 7 8 - pmulhrsw m4, w8192reg ;H pw_8192 - palignr m3, m4, m0, 12 ; 6 7 8 7 + PSHUFB_SUBPEL_H_4a m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ + PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters + PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 + PMULHRSW_8192 m4, m4, w8192reg + PALIGNR m3, m4, m0, 12, m5 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m5, m14 + %else + mova m5, [esp+mmsize*4] + %endif +%endif paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m5, 6 @@ -3046,18 +3613,34 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 mova m2, m3 pmaddwd m3, subpelv2; V a2 b2 paddd m5, m3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m14, m5 + %else + mova [esp+0xA0], m5 + %endif +%endif ; +%if cpuflag(ssse3) mova m6, [base+subpel_h_shuf4+16] +%endif movq m4, [srcq+strideq*0] ; 7 movhps m4, [srcq+strideq*1] ; 7 _ 8 _ - pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~ - pmaddubsw m4, m7 ;H subpel_filters - phaddw m4, m4 ;H 7 8 7 8 - pmulhrsw m4, w8192reg ;H pw_8192 - palignr m3, m4, m0, 12 ; 6 7 8 7 + PSHUFB_SUBPEL_H_4b m4, m4, m6, m3, m5, 0 ; H subpel_h_shuf4 7~8~ + PMADDUBSW m4, m7, m3, m5, 1 ; H subpel_filters + PHADDW m4, m4, m15, ARCH_X86_32 ; H 7878 + PMULHRSW_8192 m4, m4, 
w8192reg + PALIGNR m3, m4, m0, 12, m5 ; 6787 mova m0, m4 punpcklwd m3, m4 ; 67 78 pmaddwd m4, m3, subpelv3; a3 b3 +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m5, m14 + %else + mova m5, [esp+0xA0] + %endif +%endif paddd m5, d32reg ; pd_32 paddd m5, m4 psrad m4, m5, 6 @@ -3084,8 +3667,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %undef subpelv2 %undef subpelv3 ; - - .hv_w8: %assign stack_offset org_stack_offset %define hv8_line_1 0 @@ -3104,27 +3685,35 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv3 [rsp+mmsize*10] %define accuv0 [rsp+mmsize*11] %define accuv1 [rsp+mmsize*12] - movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3] + movq m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] mov mxd, myd shr myd, 16 and mxd, 0x7f cmp hd, 6 cmovs myd, mxd - movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3] + movq m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] ALLOC_STACK -mmsize*13 -%if STACK_ALIGNMENT < mmsize + %if STACK_ALIGNMENT < mmsize mov rstk, r2m - %define tmpm [rsp+mmsize*13+gprsize*1] - %define srcm [rsp+mmsize*13+gprsize*2] - %define stridem [rsp+mmsize*13+gprsize*3] + %define tmpm [rsp+mmsize*13+gprsize*1] + %define srcm [rsp+mmsize*13+gprsize*2] + %define stridem [rsp+mmsize*13+gprsize*3] mov stridem, rstk -%endif + %endif mov r6, r2 -%define base_reg r6 + %define base_reg r6 pshufd m0, m1, q0000 pshufd m1, m1, q1111 punpcklbw m5, m5 - psraw m5, 8 ; sign-extend + %if notcpuflag(ssse3) + punpcklbw m0, m0 + punpcklbw m1, m1 + %endif + psraw m5, 8 + %if notcpuflag(ssse3) + psraw m0, 8 + psraw m1, 8 + %endif pshufd m2, m5, q0000 pshufd m3, m5, q1111 pshufd m4, m5, q2222 @@ -3151,20 +3740,31 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %define subpelv3 m15 %define accuv0 m8 %define accuv1 m9 - movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3] + movq m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX] movzx mxd, myb shr myd, 16 cmp hd, 6 cmovs myd, mxd - movq m1, 
[base_reg+myq*8+subpel_filters-prep_ssse3] + movq m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX] pshufd subpelh0, m0, q0000 pshufd subpelh1, m0, q1111 punpcklbw m1, m1 - psraw m1, 8 ; sign-extend + %if notcpuflag(ssse3) + punpcklbw subpelh0, subpelh0 + punpcklbw subpelh1, subpelh1 + %endif + psraw m1, 8 + %if notcpuflag(ssse3) + psraw subpelh0, 8 + psraw subpelh1, 8 + %endif pshufd subpelv0, m1, q0000 pshufd subpelv1, m1, q1111 pshufd subpelv2, m1, q2222 pshufd subpelv3, m1, q3333 + %if notcpuflag(ssse3) + mova m7, [base+pw_2] + %endif lea stride3q, [strideq*3] sub srcq, 3 sub srcq, stride3q @@ -3179,57 +3779,89 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 shl r5d, (16 - 2) mov r5w, hw .hv_w8_loop0: - movu m4, [srcq+strideq*0] ; 0 = _ _ - movu m5, [srcq+strideq*1] ; 1 = _ _ - lea srcq, [srcq+strideq*2] -%if ARCH_X86_64 +%if cpuflag(ssse3) + %if ARCH_X86_64 mova m7, [base+subpel_h_shufA] mova m8, [base+subpel_h_shufB] mova m9, [base+subpel_h_shufC] + %define shufA m7 + %define shufB m8 + %define shufC m9 + %else + %define shufA [base+subpel_h_shufA] + %define shufB [base+subpel_h_shufB] + %define shufC [base+subpel_h_shufC] + %endif %endif - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~ - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~ - movu m6, [srcq+strideq*0] ; 2 = _ _ - movu m0, [srcq+strideq*1] ; 3 = _ _ + PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 + PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 lea srcq, [srcq+strideq*2] - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~ - HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~ - ; +%if notcpuflag(ssse3) + %if ARCH_X86_64 + SWAP m9, m4 + %else + mova [esp], m4 + %endif +%endif + PREP_8TAP_HV m6, srcq+strideq*0, m7, m4 + PREP_8TAP_HV m0, srcq+strideq*1, m7, m4 + lea srcq, [srcq+strideq*2] +%if cpuflag(ssse3) mova m7, [base+pw_8192] - pmulhrsw m4, m7 ; H pw_8192 - pmulhrsw m5, m7 ; H pw_8192 - pmulhrsw m6, m7 ; H pw_8192 - pmulhrsw m0, m7 ; H pw_8192 - punpcklwd m1, m4, m5 ; 0 1 ~ - punpcklwd m2, m5, m6 ; 1 2 ~ 
- punpcklwd m3, m6, m0 ; 2 3 ~ +%else + mova m7, [base+pw_2] + %if ARCH_X86_64 + SWAP m4, m9 + %else + mova m4, [esp] + %endif +%endif + PMULHRSW_8192 m4, m4, m7 + PMULHRSW_8192 m5, m5, m7 + PMULHRSW_8192 m6, m6, m7 + PMULHRSW_8192 m0, m0, m7 + punpcklwd m1, m4, m5 ; 01 + punpcklwd m2, m5, m6 ; 12 + punpcklwd m3, m6, m0 ; 23 SAVELINE_W8 1, m1 SAVELINE_W8 2, m2 SAVELINE_W8 3, m3 - ; +%if cpuflag(ssse3) mova m7, [base+subpel_h_shufA] - movu m4, [srcq+strideq*0] ; 4 = _ _ - movu m5, [srcq+strideq*1] ; 5 = _ _ +%else + %if ARCH_X86_64 + SWAP m8, m7 + SWAP m9, m0 + %else + mova [esp+0x30], m0 + %endif +%endif + PREP_8TAP_HV m4, srcq+strideq*0, m7, m0 + PREP_8TAP_HV m5, srcq+strideq*1, m7, m0 + PREP_8TAP_HV m6, srcq+strideq*2, m7, m0 lea srcq, [srcq+strideq*2] - movu m6, [srcq+strideq*0] ; 6 = _ _ - HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~ - HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~ - HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~ +%if cpuflag(ssse3) mova m7, [base+pw_8192] - pmulhrsw m1, m4, m7 ; H pw_8192 4 ~ - pmulhrsw m2, m5, m7 ; H pw_8192 5 ~ - pmulhrsw m3, m6, m7 ; H pw_8192 6 ~ - punpcklwd m4, m0, m1 ; 3 4 ~ - punpcklwd m5, m1, m2 ; 4 5 ~ - punpcklwd m6, m2, m3 ; 5 6 ~ - ; +%else + %if ARCH_X86_64 + SWAP m0, m9 + SWAP m7, m8 + %else + mova m0, [esp+0x30] + mova m7, [base+pw_2] + %endif +%endif + PMULHRSW_8192 m1, m4, m7 + PMULHRSW_8192 m2, m5, m7 + PMULHRSW_8192 m3, m6, m7 + punpcklwd m4, m0, m1 ; 34 + punpcklwd m5, m1, m2 ; 45 + punpcklwd m6, m2, m3 ; 56 SAVELINE_W8 6, m3 RESTORELINE_W8 1, m1 RESTORELINE_W8 2, m2 RESTORELINE_W8 3, m3 .hv_w8_loop: - ; m8 accu for V a - ; m9 accu for V b SAVELINE_W8 1, m3 SAVELINE_W8 2, m4 SAVELINE_W8 3, m5 @@ -3246,46 +3878,53 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 paddd m0, m5 paddd m7, m6 mova m5, [base+pd_32] - paddd m0, m5 ; pd_512 - paddd m7, m5 ; pd_512 + paddd m0, m5 + paddd m7, m5 mova accuv0, m0 mova accuv1, m7 %else - pmaddwd m8, m1, subpelv0 ; a0 - pmaddwd m9, m2, subpelv0 ; b0 + 
pmaddwd accuv0, m1, subpelv0 ; a0 + pmaddwd accuv1, m2, subpelv0 ; b0 pmaddwd m3, subpelv1 ; a1 pmaddwd m4, subpelv1 ; b1 - paddd m8, m3 - paddd m9, m4 + paddd accuv0, m3 + paddd accuv1, m4 pmaddwd m5, subpelv2 ; a2 pmaddwd m6, subpelv2 ; b2 - paddd m8, m5 - paddd m9, m6 + paddd accuv0, m5 + paddd accuv1, m6 mova m7, [base+pd_32] - paddd m8, m7 ; pd_512 - paddd m9, m7 ; pd_512 + paddd accuv0, m7 + paddd accuv1, m7 + %if cpuflag(ssse3) mova m7, [base+subpel_h_shufB] mova m6, [base+subpel_h_shufC] mova m5, [base+subpel_h_shufA] + %define shufA m5 + %define shufB m7 + %define shufC m6 + %endif %endif - movu m0, [srcq+strideq*1] ; 7 - movu m4, [srcq+strideq*2] ; 8 + PREP_8TAP_HV m0, srcq+strideq*1, m5, m6 + PREP_8TAP_HV m4, srcq+strideq*2, m5, m6 lea srcq, [srcq+strideq*2] - HV_H_W8 m0, m1, m2, m3, m5, m7, m6 - HV_H_W8 m4, m1, m2, m3, m5, m7, m6 +%if cpuflag(ssse3) mova m5, [base+pw_8192] - pmulhrsw m0, m5 ; H pw_8192 - pmulhrsw m4, m5 ; H pw_8192 +%else + mova m5, [base+pw_2] +%endif + PMULHRSW_8192 m0, m0, m5 + PMULHRSW_8192 m4, m4, m5 RESTORELINE_W8 6, m6 - punpcklwd m5, m6, m0 ; 6 7 ~ - punpcklwd m6, m0, m4 ; 7 8 ~ + punpcklwd m5, m6, m0 ; 67 + punpcklwd m6, m0, m4 ; 78 pmaddwd m1, m5, subpelv3 ; a3 paddd m2, m1, accuv0 pmaddwd m1, m6, subpelv3 ; b3 - paddd m1, m1, accuv1 ; H + V + paddd m1, m1, accuv1 psrad m2, 6 psrad m1, 6 - packssdw m2, m1 ; d -> w + packssdw m2, m1 movq [tmpq+wq*0], m2 movhps [tmpq+wq*2], m2 lea tmpq, [tmpq+wq*4] @@ -3314,6 +3953,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 sub r5d, 1<<16 jg .hv_w8_loop0 RET +%endmacro %if ARCH_X86_32 %macro SAVE_ALPHA_BETA 0 @@ -3384,7 +4024,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3 %endmacro %macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7 - ; Can be done using gathers, but that's terribly slow on many CPU:s %if ARCH_X86_32 %define m8 m4 %define m9 m5 @@ -4022,20 +4661,6 @@ ALIGN function_align ret %endmacro -INIT_XMM sse4 -WARP_AFFINE_8X8 
-WARP_AFFINE_8X8T - -INIT_XMM ssse3 -WARP_AFFINE_8X8 -WARP_AFFINE_8X8T - -INIT_XMM sse2 -WARP_AFFINE_8X8 -WARP_AFFINE_8X8T - -INIT_XMM ssse3 - %if WIN64 DECLARE_REG_TMP 6, 4 %else @@ -4872,8 +5497,6 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \ mov r1, r1m %endif %if %1 - test leftextq, leftextq - jz .body_%3 ; left extension %if ARCH_X86_64 movd m0, [srcq] @@ -4889,7 +5512,6 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \ cmp r3, leftextq jl .left_loop_%3 ; body -.body_%3: lea reg_tmp, [dstq+leftextq] %endif xor r3, r3 @@ -4910,13 +5532,6 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \ jl .body_loop_%3 %if %2 ; right extension - %if ARCH_X86_64 - test rightextq, rightextq - %else - mov r1, r3m - test r1, r1 - %endif - jz .body_loop_end_%3 %if %1 add reg_tmp, centerwq %else @@ -4939,7 +5554,6 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \ cmp r3, r3m %endif jl .right_loop_%3 -.body_loop_end_%3: %endif %if ARCH_X86_64 add dstq, dstrideq @@ -5081,3 +5695,241 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \ %undef reg_dstride %undef reg_blkm %undef reg_tmp + +cextern resize_filter + +%macro SCRATCH 3 +%if ARCH_X86_32 + mova [rsp+%3*mmsize], m%1 +%define m%2 [rsp+%3*mmsize] +%else + SWAP %1, %2 +%endif +%endmacro + +%if ARCH_X86_64 +cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 +%elif STACK_ALIGNMENT >= 16 +cglobal resize, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 +%else +cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \ + dst_w, h, src_w, dx, mx0 +%endif + movifnidn dstq, dstmp + movifnidn srcq, srcmp +%if STACK_ALIGNMENT >= 16 + movifnidn dst_wd, dst_wm +%endif +%if ARCH_X86_64 + movifnidn hd, hm +%endif + sub dword mx0m, 4<<14 + sub dword src_wm, 8 + movd m7, dxm + movd m6, mx0m + movd m5, src_wm + pshufd m7, m7, q0000 + pshufd m6, m6, q0000 + pshufd m5, m5, q0000 + +%if ARCH_X86_64 + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, 
x, picptr + LEA r7, $$ +%define base r7-$$ +%else + DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x +%if STACK_ALIGNMENT >= 16 + LEA r6, $$ +%define base r6-$$ +%else + LEA r4, $$ +%define base r4-$$ +%endif +%endif + +%if ARCH_X86_64 + mova m12, [base+pw_m256] + mova m11, [base+pd_63] + mova m10, [base+pb_8x0_8x8] +%else +%define m12 [base+pw_m256] +%define m11 [base+pd_63] +%define m10 [base+pb_8x0_8x8] +%endif + pmaddwd m4, m7, [base+resize_mul] ; dx*[0,1,2,3] + pslld m7, 2 ; dx*4 + pslld m5, 14 + paddd m6, m4 ; mx+[0..3]*dx + SCRATCH 7, 15, 0 + SCRATCH 6, 14, 1 + SCRATCH 5, 13, 2 + + ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7 + ; m8 = mx+[0..3]*dx, m5 = dx*4, m6 = src_w, m7 = 0x3f, m15=0,8 + +.loop_y: + xor xd, xd + mova m0, m14 ; per-line working version of mx + +.loop_x: + pxor m1, m1 + pcmpgtd m1, m0 + pandn m1, m0 + psrad m2, m0, 8 ; filter offset (unmasked) + pcmpgtd m3, m13, m1 + pand m1, m3 + pandn m3, m13 + por m1, m3 + psubd m3, m0, m1 ; pshufb offset + psrad m1, 14 ; clipped src_x offset + psrad m3, 14 ; pshufb edge_emu offset + pand m2, m11 ; filter offset (masked) + + ; load source pixels +%if ARCH_X86_64 + movd r8d, xm1 + pshuflw xm1, xm1, q3232 + movd r9d, xm1 + punpckhqdq xm1, xm1 + movd r10d, xm1 + psrlq xm1, 32 + movd r11d, xm1 + movq xm4, [srcq+r8] + movq xm5, [srcq+r10] + movhps xm4, [srcq+r9] + movhps xm5, [srcq+r11] +%else + movd r3d, xm1 + pshufd xm1, xm1, q3312 + movd r1d, xm1 + pshuflw xm1, xm1, q3232 + movq xm4, [srcq+r3] + movq xm5, [srcq+r1] + movd r3d, xm1 + punpckhqdq xm1, xm1 + movd r1d, xm1 + movhps xm4, [srcq+r3] + movhps xm5, [srcq+r1] +%endif + + ; if no emulation is required, we don't need to shuffle or emulate edges + ; this also saves 2 quasi-vpgatherdqs + pxor m6, m6 + pcmpeqb m6, m3 +%if ARCH_X86_64 + pmovmskb r8d, m6 + cmp r8d, 0xffff +%else + pmovmskb r3d, m6 + cmp r3d, 0xffff +%endif + je .filter + +%if ARCH_X86_64 + movd r8d, xm3 + pshuflw xm3, xm3, q3232 + movd r9d, xm3 + punpckhqdq xm3, xm3 + movd r10d, 
xm3 + psrlq xm3, 32 + movd r11d, xm3 + movsxd r8, r8d + movsxd r9, r9d + movsxd r10, r10d + movsxd r11, r11d + movq xm6, [base+resize_shuf+4+r8] + movq xm7, [base+resize_shuf+4+r10] + movhps xm6, [base+resize_shuf+4+r9] + movhps xm7, [base+resize_shuf+4+r11] +%else + movd r3d, xm3 + pshufd xm3, xm3, q3312 + movd r1d, xm3 + pshuflw xm3, xm3, q3232 + movq xm6, [base+resize_shuf+4+r3] + movq xm7, [base+resize_shuf+4+r1] + movd r3d, xm3 + punpckhqdq xm3, xm3 + movd r1d, xm3 + movhps xm6, [base+resize_shuf+4+r3] + movhps xm7, [base+resize_shuf+4+r1] +%endif + + paddb m6, m10 + paddb m7, m10 + pshufb m4, m6 + pshufb m5, m7 + +.filter: +%if ARCH_X86_64 + movd r8d, xm2 + pshuflw xm2, xm2, q3232 + movd r9d, xm2 + punpckhqdq xm2, xm2 + movd r10d, xm2 + psrlq xm2, 32 + movd r11d, xm2 + movq xm6, [base+resize_filter+r8*8] + movq xm7, [base+resize_filter+r10*8] + movhps xm6, [base+resize_filter+r9*8] + movhps xm7, [base+resize_filter+r11*8] +%else + movd r3d, xm2 + pshufd xm2, xm2, q3312 + movd r1d, xm2 + pshuflw xm2, xm2, q3232 + movq xm6, [base+resize_filter+r3*8] + movq xm7, [base+resize_filter+r1*8] + movd r3d, xm2 + punpckhqdq xm2, xm2 + movd r1d, xm2 + movhps xm6, [base+resize_filter+r3*8] + movhps xm7, [base+resize_filter+r1*8] +%endif + + pmaddubsw m4, m6 + pmaddubsw m5, m7 + phaddw m4, m5 + phaddsw m4, m4 + pmulhrsw m4, m12 ; x=(x+64)>>7 + packuswb m4, m4 + movd [dstq+xq], m4 + + paddd m0, m15 + add xd, 4 +%if STACK_ALIGNMENT >= 16 + cmp xd, dst_wd +%else + cmp xd, dst_wm +%endif + jl .loop_x + +%if ARCH_X86_64 + add dstq, dst_strideq + add srcq, src_strideq + dec hd +%else + add dstq, dst_stridem + add srcq, src_stridem + dec dword r5m +%endif + jg .loop_y + RET + +INIT_XMM ssse3 +PREP_BILIN +PREP_8TAP +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T + +INIT_XMM sse4 +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T + +INIT_XMM sse2 +PREP_BILIN +PREP_8TAP +WARP_AFFINE_8X8 +WARP_AFFINE_8X8T diff --git a/ffmpeg/JNI/dav1d/src/x86/msac.asm b/ffmpeg/JNI/dav1d/src/x86/msac.asm index 
f67871483..756e19b4b 100644 --- a/ffmpeg/JNI/dav1d/src/x86/msac.asm +++ b/ffmpeg/JNI/dav1d/src/x86/msac.asm @@ -157,7 +157,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6 mov [t7+msac.rng], t2d not t4 sub t1d, ecx - jge .end ; no refill required + jae .end ; no refill required ; refill: mov t2, [t7+msac.buf] @@ -504,7 +504,7 @@ cglobal msac_decode_bool, 0, 6, 0 mov [t7+msac.rng], t2d not t4 sub t5d, ecx - jge %%end + jae %%end mov t2, [t7+msac.buf] mov rcx, [t7+msac.end] %if UNIX64 == 0 diff --git a/ffmpeg/JNI/dav1d/src/x86/msac_init.c b/ffmpeg/JNI/dav1d/src/x86/msac_init.c index a9dafc757..a634da27c 100644 --- a/ffmpeg/JNI/dav1d/src/x86/msac_init.c +++ b/ffmpeg/JNI/dav1d/src/x86/msac_init.c @@ -28,6 +28,7 @@ #include "src/msac.h" #include "src/x86/msac.h" +#if ARCH_X86_64 void dav1d_msac_init_x86(MsacContext *const s) { const unsigned flags = dav1d_get_cpu_flags(); @@ -39,4 +40,4 @@ void dav1d_msac_init_x86(MsacContext *const s) { s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2; } } - +#endif diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_32.S b/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_32.S index 0c4e31f40..a186ef8fc 100644 --- a/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_32.S +++ b/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_32.S @@ -48,6 +48,8 @@ error_message_gpr: .asciz "failed to preserve register r%d" error_message_vfp: .asciz "failed to preserve register d%d" +error_message_stack: + .asciz "failed to preserve stack" endconst @ max number of args used by any asm function. @@ -55,8 +57,9 @@ endconst #define ARG_STACK 4*(MAX_ARGS - 4) -@ align the used stack space to 8 to preserve the stack alignment -#define ARG_STACK_A (((ARG_STACK + pushed + 7) & ~7) - pushed) +@ Align the used stack space to 8 to preserve the stack alignment. +@ +8 for stack canary reference. 
+#define ARG_STACK_A (((ARG_STACK + pushed + 7) & ~7) - pushed + 8) .macro clobbercheck variant .equ pushed, 4*9 @@ -83,14 +86,37 @@ function checked_call_\variant, export=1 .equ pos, pos + 4 .endr + @ For stack overflows, the callee is free to overwrite the parameters + @ that were passed on the stack (if any), so we can only check after + @ that point. First figure out how many parameters the function + @ really took on the stack: + ldr r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)] + @ Load the first non-parameter value from the stack, that should be + @ left untouched by the function. Store a copy of it inverted, so that + @ e.g. overwriting everything with zero would be noticed. + ldr r12, [sp, r12, lsl #2] + mvn r12, r12 + str r12, [sp, #ARG_STACK_A - 4] + mov r12, r0 mov r0, r2 mov r1, r3 ldrd r2, r3, [sp, #ARG_STACK_A + pushed] + @ Call the target function blx r12 - add sp, sp, #ARG_STACK_A + @ Load the number of stack parameters, stack canary and its reference + ldr r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)] + ldr r2, [sp, r12, lsl #2] + ldr r3, [sp, #ARG_STACK_A - 4] + + add sp, sp, #ARG_STACK_A push {r0, r1} + + mvn r3, r3 + cmp r2, r3 + bne 5f + movrel r12, register_init .ifc \variant, vfp .macro check_reg_vfp, dreg, offset @@ -144,6 +170,9 @@ function checked_call_\variant, export=1 .purgem check_reg b 0f +5: + movrel r0, error_message_stack + b 1f 4: movrel r0, error_message_vfp b 1f @@ -154,9 +183,9 @@ function checked_call_\variant, export=1 movrel r0, error_message_gpr 1: #ifdef PREFIX - blx _checkasm_fail_func + bl _checkasm_fail_func #else - blx checkasm_fail_func + bl checkasm_fail_func #endif 0: pop {r0, r1} diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_64.S b/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_64.S index 11a342389..25749145a 100644 --- a/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_64.S +++ b/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_64.S @@ -53,8 +53,10 @@ const register_init, align=4 endconst -const 
error_message +const error_message_register .asciz "failed to preserve register" +error_message_stack: + .asciz "stack clobbered" endconst @@ -74,7 +76,8 @@ function stack_clobber, export=1 ret endfunc -#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15) +// + 16 for stack canary reference +#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15 + 16) function checked_call, export=1 stp x29, x30, [sp, #-16]! @@ -109,22 +112,56 @@ function checked_call, export=1 .equ pos, pos + 8 .endr + // Fill x8-x17 with garbage. This doesn't have to be preserved, + // but avoids relying on them having any particular value. + movrel x9, register_init + ldp x10, x11, [x9], #32 + ldp x12, x13, [x9], #32 + ldp x14, x15, [x9], #32 + ldp x16, x17, [x9], #32 + ldp x8, x9, [x9] + + // For stack overflows, the callee is free to overwrite the parameters + // that were passed on the stack (if any), so we can only check after + // that point. First figure out how many parameters the function + // really took on the stack: + ldr w2, [x29, #16 + 8*8 + (MAX_ARGS-8)*8] + // Load the first non-parameter value from the stack, that should be + // left untouched by the function. Store a copy of it inverted, so that + // e.g. overwriting everything with zero would be noticed. + ldr x2, [sp, x2, lsl #3] + mvn x2, x2 + str x2, [sp, #ARG_STACK-8] + + // Load the in-register arguments mov x12, x0 ldp x0, x1, [x29, #16] ldp x2, x3, [x29, #32] ldp x4, x5, [x29, #48] ldp x6, x7, [x29, #64] + // Call the target function blr x12 + + // Load the number of stack parameters, stack canary and its reference + ldr w2, [x29, #16 + 8*8 + (MAX_ARGS-8)*8] + ldr x2, [sp, x2, lsl #3] + ldr x3, [sp, #ARG_STACK-8] + add sp, sp, #ARG_STACK stp x0, x1, [sp, #-16]! 
+ + mvn x3, x3 + cmp x2, x3 + b.ne 2f + movrel x9, register_init movi v3.8h, #0 .macro check_reg_neon reg1, reg2 - ldr q0, [x9], #16 - uzp1 v1.2d, v\reg1\().2d, v\reg2\().2d - eor v0.16b, v0.16b, v1.16b - orr v3.16b, v3.16b, v0.16b + ldr q1, [x9], #16 + uzp1 v2.2d, v\reg1\().2d, v\reg2\().2d + eor v1.16b, v1.16b, v2.16b + orr v3.16b, v3.16b, v1.16b .endm check_reg_neon 8, 9 check_reg_neon 10, 11 @@ -148,7 +185,11 @@ function checked_call, export=1 cbz x3, 0f - movrel x0, error_message + movrel x0, error_message_register + b 1f +2: + movrel x0, error_message_stack +1: #ifdef PREFIX bl _checkasm_fail_func #else diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.c b/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.c index de9d733e0..ee52c8969 100644 --- a/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.c +++ b/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.c @@ -125,7 +125,7 @@ typedef struct CheckasmFunc { struct CheckasmFunc *child[2]; CheckasmFuncVersion versions; uint8_t color; /* 0 = red, 1 = black */ - char name[1]; + char name[]; } CheckasmFunc; /* Internal state */ @@ -142,7 +142,7 @@ static struct { unsigned cpu_flag; const char *cpu_flag_name; const char *test_name; - unsigned int seed; + unsigned seed; int bench_c; int verbose; int function_listing; @@ -159,7 +159,7 @@ typedef union { static uint32_t xs_state[4]; -static void xor128_srand(unsigned int seed) { +static void xor128_srand(unsigned seed) { xs_state[0] = seed; xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff); xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff); @@ -335,15 +335,15 @@ static int cmp_nop(const void *a, const void *b) { /* Measure the overhead of the timing code (in decicycles) */ static int measure_nop_time(void) { uint16_t nops[10000]; - int i, nop_sum = 0; + int nop_sum = 0; - for (i = 0; i < 10000; i++) { + for (int i = 0; i < 10000; i++) { uint64_t t = readtime(); nops[i] = (uint16_t) (readtime() - t); } qsort(nops, 10000, sizeof(uint16_t), cmp_nop); - for (i = 2500; i < 7500; i++) 
+ for (int i = 2500; i < 7500; i++) nop_sum += nops[i]; return nop_sum / 500; @@ -359,8 +359,8 @@ static void print_benchs(const CheckasmFunc *const f) { const CheckasmFuncVersion *v = &f->versions; do { if (v->iterations) { - int decicycles = (int) (10*v->cycles/v->iterations - - state.nop_time) / 4; + const int decicycles = (int) (10*v->cycles/v->iterations - + state.nop_time) / 4; printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu), decicycles/10, decicycles%10); } @@ -413,7 +413,7 @@ static CheckasmFunc *rotate_tree(CheckasmFunc *const f, const int dir) { #define is_red(f) ((f) && !(f)->color) /* Balance a left-leaning red-black tree at the specified node */ -static void balance_tree(CheckasmFunc **root) { +static void balance_tree(CheckasmFunc **const root) { CheckasmFunc *const f = *root; if (is_red(f->child[0]) && is_red(f->child[1])) { @@ -427,12 +427,12 @@ static void balance_tree(CheckasmFunc **root) { } /* Get a node with the specified name, creating it if it doesn't exist */ -static CheckasmFunc *get_func(CheckasmFunc **root, const char *const name) { +static CheckasmFunc *get_func(CheckasmFunc **const root, const char *const name) { CheckasmFunc *f = *root; if (f) { /* Search the tree for a matching node */ - int cmp = cmp_func_names(name, f->name); + const int cmp = cmp_func_names(name, f->name); if (cmp) { f = get_func(&f->child[cmp > 0], name); @@ -442,9 +442,9 @@ static CheckasmFunc *get_func(CheckasmFunc **root, const char *const name) { } } else { /* Allocate and insert a new node into the tree */ - const size_t name_length = strlen(name); - f = *root = checkasm_malloc(sizeof(CheckasmFunc) + name_length); - memcpy(f->name, name, name_length + 1); + const size_t name_length = strlen(name) + 1; + f = *root = checkasm_malloc(offsetof(CheckasmFunc, name) + name_length); + memcpy(f->name, name, name_length); } return f; @@ -559,28 +559,29 @@ int main(int argc, char *argv[]) { } else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) { 
state.verbose = 1; } else { - state.seed = (unsigned int) strtoul(argv[1], NULL, 10); + state.seed = (unsigned) strtoul(argv[1], NULL, 10); } argc--; argv++; } - fprintf(stderr, "checkasm: using random seed %u\n", state.seed); - dav1d_init_cpu(); + + if (!state.function_listing) { + fprintf(stderr, "checkasm: using random seed %u\n", state.seed); #if ARCH_X86_64 - void checkasm_warmup_avx2(void); - void checkasm_warmup_avx512(void); - unsigned cpu_flags = dav1d_get_cpu_flags(); - if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL) - state.simd_warmup = checkasm_warmup_avx512; - else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2) - state.simd_warmup = checkasm_warmup_avx2; - else - state.simd_warmup = NULL; - checkasm_simd_warmup(); + void checkasm_warmup_avx2(void); + void checkasm_warmup_avx512(void); + const unsigned cpu_flags = dav1d_get_cpu_flags(); + if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL) + state.simd_warmup = checkasm_warmup_avx512; + else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2) + state.simd_warmup = checkasm_warmup_avx2; + checkasm_simd_warmup(); #endif + } + check_cpu_flag(NULL, 0); if (state.function_listing) { diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.h b/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.h index c5191e242..27c28d7d2 100644 --- a/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.h +++ b/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.h @@ -201,23 +201,33 @@ void checkasm_stack_clobber(uint64_t clobber, ...); * those registers to keep them powered on. 
*/ void checkasm_simd_warmup(void); #define declare_new(ret, ...)\ - ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__) =\ + ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__,\ + int, int, int, int, int, int, int, int,\ + int, int, int, int, int, int, int) =\ (void *)checkasm_checked_call; #define CLOB (UINT64_C(0xdeadbeefdeadbeef)) +#ifdef _WIN32 +#define STACKARGS 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0 +#else +#define STACKARGS 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0 +#endif #define call_new(...)\ (checkasm_set_signal_handler_state(1),\ checkasm_simd_warmup(),\ checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\ CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\ CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\ - checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__));\ + checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__, STACKARGS));\ checkasm_set_signal_handler_state(0) #elif ARCH_X86_32 #define declare_new(ret, ...)\ - ret (*checked_call)(void *, __VA_ARGS__) = (void *)checkasm_checked_call; + ret (*checked_call)(void *, __VA_ARGS__, int, int, int, int, int, int,\ + int, int, int, int, int, int, int, int, int) =\ + (void *)checkasm_checked_call; #define call_new(...)\ (checkasm_set_signal_handler_state(1),\ - checked_call(func_new, __VA_ARGS__));\ + checked_call(func_new, __VA_ARGS__, 15, 14, 13, 12,\ + 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1));\ checkasm_set_signal_handler_state(0) #elif ARCH_ARM /* Use a dummy argument, to offset the real parameters by 2, not only 1. @@ -225,17 +235,20 @@ void checkasm_simd_warmup(void); * the same even when the extra parameters have been removed. 
*/ void checkasm_checked_call_vfp(void *func, int dummy, ...); #define declare_new(ret, ...)\ - ret (*checked_call)(void *, int dummy, __VA_ARGS__) =\ + ret (*checked_call)(void *, int dummy, __VA_ARGS__,\ + int, int, int, int, int, int, int, int,\ + int, int, int, int, int, int, int) =\ (void *)checkasm_checked_call_vfp; #define call_new(...)\ (checkasm_set_signal_handler_state(1),\ - checked_call(func_new, 0, __VA_ARGS__));\ + checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\ checkasm_set_signal_handler_state(0) #elif ARCH_AARCH64 && !defined(__APPLE__) void checkasm_stack_clobber(uint64_t clobber, ...); #define declare_new(ret, ...)\ ret (*checked_call)(void *, int, int, int, int, int, int, int,\ - __VA_ARGS__) =\ + __VA_ARGS__, int, int, int, int, int, int, int, int,\ + int, int, int, int, int, int, int) =\ (void *)checkasm_checked_call; #define CLOB (UINT64_C(0xdeadbeefdeadbeef)) #define call_new(...)\ @@ -244,7 +257,8 @@ void checkasm_stack_clobber(uint64_t clobber, ...); CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\ CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\ CLOB, CLOB, CLOB, CLOB, CLOB),\ - checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__));\ + checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\ + 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\ checkasm_set_signal_handler_state(0) #else #define declare_new(ret, ...) 
@@ -270,8 +284,8 @@ void checkasm_stack_clobber(uint64_t clobber, ...); checkasm_set_signal_handler_state(1);\ func_type *tfunc = func_new;\ uint64_t tsum = 0;\ - int ti, tcount = 0;\ - for (ti = 0; ti < BENCH_RUNS; ti++) {\ + int tcount = 0;\ + for (int ti = 0; ti < BENCH_RUNS; ti++) {\ uint64_t t = readtime();\ tfunc(__VA_ARGS__);\ tfunc(__VA_ARGS__);\ diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/filmgrain.c b/ffmpeg/JNI/dav1d/tests/checkasm/filmgrain.c index 5c9e0bfc2..1219ee7c8 100644 --- a/ffmpeg/JNI/dav1d/tests/checkasm/filmgrain.c +++ b/ffmpeg/JNI/dav1d/tests/checkasm/filmgrain.c @@ -181,8 +181,8 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { const int w = 1 + (rnd() & 127); const int h = 1 + (rnd() & 31); - for (int y = 0; y < h; y++) - for (int x = 0; x < w; x++) + for (int y = 0; y < 32; y++) + for (int x = 0; x < 128; x++) src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max; const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0; @@ -260,13 +260,12 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) { const int w = 1 + (rnd() & (127 >> ss_x)); const int h = 1 + (rnd() & (31 >> ss_y)); - const int lw = w << ss_x, lh = h << ss_y; - for (int y = 0; y < h; y++) - for (int x = 0; x < w; x++) + for (int y = 0; y < 32; y++) + for (int x = 0; x < 128; x++) src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max; - for (int y = 0; y < lh; y++) - for (int x = 0; x < lw; x++) + for (int y = 0; y < 32; y++) + for (int x = 0; x < 128; x++) luma_src[y * PXSTRIDE(lstride) + x] = rnd() & bitdepth_max; const int row_num = rnd() & 1 ? 
rnd() & 0x7ff : 0; diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/ipred.c b/ffmpeg/JNI/dav1d/tests/checkasm/ipred.c index d5955d20e..6b054a700 100644 --- a/ffmpeg/JNI/dav1d/tests/checkasm/ipred.c +++ b/ffmpeg/JNI/dav1d/tests/checkasm/ipred.c @@ -75,59 +75,66 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) { int width, int height, int angle, int max_width, int max_height HIGHBD_DECL_SUFFIX); - for (int mode = 0; mode < N_IMPL_INTRA_PRED_MODES; mode++) - for (int w = 4; w <= (mode == FILTER_PRED ? 32 : 64); w <<= 1) - if (check_func(c->intra_pred[mode], "intra_pred_%s_w%d_%dbpc", - intra_pred_mode_names[mode], w, BITDEPTH)) - { - for (int h = imax(w / 4, 4); h <= imin(w * 4, - (mode == FILTER_PRED ? 32 : 64)); h <<= 1) + for (int mode = 0; mode < N_IMPL_INTRA_PRED_MODES; mode++) { + int bpc_min = BITDEPTH, bpc_max = BITDEPTH; + if (mode == FILTER_PRED && BITDEPTH == 16) { + bpc_min = 10; + bpc_max = 12; + } + for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) + for (int w = 4; w <= (mode == FILTER_PRED ? 32 : 64); w <<= 1) + if (check_func(c->intra_pred[mode], "intra_pred_%s_w%d_%dbpc", + intra_pred_mode_names[mode], w, bpc)) { - const ptrdiff_t stride = w * sizeof(pixel); - - int a = 0, maxw = 0, maxh = 0; - if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */ - a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) | - (rnd() & 0x600); - if (mode == Z2_PRED) { - maxw = rnd(), maxh = rnd(); - maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1)); - maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1)); + for (int h = imax(w / 4, 4); h <= imin(w * 4, + (mode == FILTER_PRED ? 32 : 64)); h <<= 1) + { + const ptrdiff_t stride = w * sizeof(pixel); + + int a = 0, maxw = 0, maxh = 0; + if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */ + a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) | + (rnd() & 0x600); + if (mode == Z2_PRED) { + maxw = rnd(), maxh = rnd(); + maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1)); + maxh = 1 + (maxh & (maxh & 4096 ? 
4095 : h - 1)); + } + } else if (mode == FILTER_PRED) /* filter_idx */ + a = (rnd() % 5) | (rnd() & ~511); + + int bitdepth_max; + if (bpc == 16) + bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; + else + bitdepth_max = (1 << bpc) - 1; + + for (int i = -h * 2; i <= w * 2; i++) + topleft[i] = rnd() & bitdepth_max; + + call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh + HIGHBD_TAIL_SUFFIX); + call_new(a_dst, stride, topleft, w, h, a, maxw, maxh + HIGHBD_TAIL_SUFFIX); + if (checkasm_check_pixel(c_dst, stride, a_dst, stride, + w, h, "dst")) + { + if (mode == Z1_PRED || mode == Z3_PRED) + fprintf(stderr, "angle = %d (0x%03x)\n", + a & 0x1ff, a & 0x600); + else if (mode == Z2_PRED) + fprintf(stderr, "angle = %d (0x%03x), " + "max_width = %d, max_height = %d\n", + a & 0x1ff, a & 0x600, maxw, maxh); + else if (mode == FILTER_PRED) + fprintf(stderr, "filter_idx = %d\n", a & 0x1ff); } - } else if (mode == FILTER_PRED) /* filter_idx */ - a = (rnd() % 5) | (rnd() & ~511); - -#if BITDEPTH == 16 - const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; -#else - const int bitdepth_max = 0xff; -#endif - - for (int i = -h * 2; i <= w * 2; i++) - topleft[i] = rnd() & bitdepth_max; - call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh - HIGHBD_TAIL_SUFFIX); - call_new(a_dst, stride, topleft, w, h, a, maxw, maxh - HIGHBD_TAIL_SUFFIX); - if (checkasm_check_pixel(c_dst, stride, a_dst, stride, - w, h, "dst")) - { - if (mode == Z1_PRED || mode == Z3_PRED) - fprintf(stderr, "angle = %d (0x%03x)\n", - a & 0x1ff, a & 0x600); - else if (mode == Z2_PRED) - fprintf(stderr, "angle = %d (0x%03x), " - "max_width = %d, max_height = %d\n", - a & 0x1ff, a & 0x600, maxw, maxh); - else if (mode == FILTER_PRED) - fprintf(stderr, "filter_idx = %d\n", a & 0x1ff); + bench_new(a_dst, stride, topleft, w, h, a, 128, 128 + HIGHBD_TAIL_SUFFIX); } - - bench_new(a_dst, stride, topleft, w, h, a, 128, 128 - HIGHBD_TAIL_SUFFIX); } - } + } report("intra_pred"); } @@ -142,14 +149,21 @@ static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) { for (int layout = 1; layout <= DAV1D_PIXEL_LAYOUT_I444; layout++) { const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420; const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444; + const int h_step = 2 >> ss_hor, v_step = 2 >> ss_ver; for (int w = 4; w <= (32 >> ss_hor); w <<= 1) if (check_func(c->cfl_ac[layout - 1], "cfl_ac_%s_w%d_%dbpc", cfl_ac_names[layout - 1], w, BITDEPTH)) { - for (int h = imax(w / 4, 4); h <= imin(w * 4, (32 >> ss_ver)); h <<= 1) { + for (int h = imax(w / 4, 4); + h <= imin(w * 4, (32 >> ss_ver)); h <<= 1) + { const ptrdiff_t stride = 32 * sizeof(pixel); - for (int w_pad = (w >> 2) - 1; w_pad >= 0; w_pad--) { - for (int h_pad = (h >> 2) - 1; h_pad >= 0; h_pad--) { + for (int w_pad = imax((w >> 2) - h_step, 0); + w_pad >= 0; w_pad -= h_step) + { + for (int h_pad = imax((h >> 2) - v_step, 0); + h_pad >= 0; h_pad -= v_step) + { #if BITDEPTH == 16 const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; #else diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/itx.c b/ffmpeg/JNI/dav1d/tests/checkasm/itx.c index 9d715c8f8..01f5e0533 100644 --- a/ffmpeg/JNI/dav1d/tests/checkasm/itx.c +++ b/ffmpeg/JNI/dav1d/tests/checkasm/itx.c @@ -223,12 +223,16 @@ static int ftx(coef *const buf, const enum RectTxfmSize tx, } void bitfn(checkasm_check_itx)(void) { - Dav1dInvTxfmDSPContext c; - bitfn(dav1d_itx_dsp_init)(&c); +#if BITDEPTH == 16 + const int bpc_min = 10, bpc_max = 12; +#else + const int bpc_min = 8, bpc_max = 8; +#endif ALIGN_STK_64(coef, coeff, 2, [32 * 32]); ALIGN_STK_64(pixel, c_dst, 64 * 64,); ALIGN_STK_64(pixel, a_dst, 64 * 64,); + Dav1dInvTxfmDSPContext c = { { { 0 } } }; /* Zero unused function pointer elements. */ static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = { TX_4X4, RTX_4X8, RTX_4X16, @@ -250,39 +254,38 @@ void bitfn(checkasm_check_itx)(void) { const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw, dav1d_txfm_dimensions[tx].lh)]; - for (enum TxfmType txtp = 0; txtp < N_TX_TYPES_PLUS_LL; txtp++) - for (int subsh = 0; subsh < subsh_max; subsh++) - if (check_func(c.itxfm_add[tx][txtp], - "inv_txfm_add_%dx%d_%s_%s_%d_%dbpc", - w, h, itx_1d_names[itx_1d_types[txtp][0]], - itx_1d_names[itx_1d_types[txtp][1]], subsh, - BITDEPTH)) - { -#if BITDEPTH == 16 - const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; -#else - const int bitdepth_max = 0xff; -#endif - const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max); - memcpy(coeff[1], coeff[0], sizeof(*coeff)); - - for (int j = 0; j < w * h; j++) - c_dst[j] = a_dst[j] = rnd() & bitdepth_max; - - call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob - HIGHBD_TAIL_SUFFIX); - call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob - HIGHBD_TAIL_SUFFIX); - - checkasm_check_pixel(c_dst, w * sizeof(*c_dst), - a_dst, w * sizeof(*a_dst), - w, h, "dst"); - if (memcmp(coeff[0], coeff[1], sizeof(*coeff))) - fail(); - - bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob - HIGHBD_TAIL_SUFFIX); - } + for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) { + bitfn(dav1d_itx_dsp_init)(&c, bpc); + for (enum TxfmType txtp = 0; txtp < N_TX_TYPES_PLUS_LL; txtp++) + for (int subsh = 0; subsh < subsh_max; subsh++) + if (check_func(c.itxfm_add[tx][txtp], + "inv_txfm_add_%dx%d_%s_%s_%d_%dbpc", + w, h, itx_1d_names[itx_1d_types[txtp][0]], + itx_1d_names[itx_1d_types[txtp][1]], subsh, + bpc)) + { + const int bitdepth_max = (1 << bpc) - 1; + const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max); + memcpy(coeff[1], coeff[0], sizeof(*coeff)); + + for (int j = 0; j < w * h; j++) + c_dst[j] = a_dst[j] = rnd() & bitdepth_max; + + call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob + HIGHBD_TAIL_SUFFIX); + call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob + HIGHBD_TAIL_SUFFIX); + + checkasm_check_pixel(c_dst, w * sizeof(*c_dst), + a_dst, w * sizeof(*a_dst), + w, h, "dst"); + if (memcmp(coeff[0], coeff[1], sizeof(*coeff))) + fail(); + + bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob + HIGHBD_TAIL_SUFFIX); + } + } report("add_%dx%d", w, h); } } diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/mc.c b/ffmpeg/JNI/dav1d/tests/checkasm/mc.c index e820bda88..ff8680d10 100644 --- a/ffmpeg/JNI/dav1d/tests/checkasm/mc.c +++ b/ffmpeg/JNI/dav1d/tests/checkasm/mc.c @@ -38,6 +38,7 @@ static const char *const filter_names[] = { }; static const 
char *const mxy_names[] = { "0", "h", "v", "hv" }; +static const char *const scaled_paths[] = { "", "_dy1", "_dy2" }; static int mc_h_next(const int h) { switch (h) { @@ -161,6 +162,112 @@ static void check_mct(Dav1dMCDSPContext *const c) { report("mct"); } +static void check_mc_scaled(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, src_buf, 263 * 263,); + ALIGN_STK_64(pixel, c_dst, 128 * 128,); + ALIGN_STK_64(pixel, a_dst, 128 * 128,); + const pixel *src = src_buf + 263 * 3 + 3; + const ptrdiff_t src_stride = 263 * sizeof(pixel); +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src, + ptrdiff_t src_stride, int w, int h, + int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX); + + for (int filter = 0; filter < N_2D_FILTERS; filter++) + for (int w = 2; w <= 128; w <<= 1) { + const ptrdiff_t dst_stride = w * sizeof(pixel); + for (int p = 0; p < 3; ++p) { + if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc", + filter_names[filter], w, scaled_paths[p], BITDEPTH)) + { + const int h_min = w <= 32 ? 2 : w / 4; + const int h_max = imax(imin(w * 4, 128), 32); + for (int h = h_min; h <= h_max; h = mc_h_next(h)) { + const int mx = rnd() % 1024; + const int my = rnd() % 1024; + const int dx = rnd() % 2048 + 1; + const int dy = !p + ? 
rnd() % 2048 + 1 + : p << 10; // ystep=1.0 and ystep=2.0 paths + + for (int k = 0; k < 263 * 263; k++) + src_buf[k] = rnd() & bitdepth_max; + + call_ref(c_dst, dst_stride, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + call_new(a_dst, dst_stride, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel(c_dst, dst_stride, + a_dst, dst_stride, w, h, "dst"); + + if (filter == FILTER_2D_8TAP_REGULAR || + filter == FILTER_2D_BILINEAR) + bench_new(a_dst, dst_stride, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + } + } + } + } + report("mc_scaled"); +} + +static void check_mct_scaled(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, src_buf, 263 * 263,); + ALIGN_STK_64(int16_t, c_tmp, 128 * 128,); + ALIGN_STK_64(int16_t, a_tmp, 128 * 128,); + const pixel *src = src_buf + 263 * 3 + 3; + const ptrdiff_t src_stride = 263 * sizeof(pixel); +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride, + int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX); + + for (int filter = 0; filter < N_2D_FILTERS; filter++) + for (int w = 4; w <= 128; w <<= 1) + for (int p = 0; p < 3; ++p) { + if (check_func(c->mct_scaled[filter], "mct_scaled_%s_w%d%s_%dbpc", + filter_names[filter], w, scaled_paths[p], BITDEPTH)) + { + const int h_min = imax(w / 4, 4); + const int h_max = imin(w * 4, 128); + for (int h = h_min; h <= h_max; h = mc_h_next(h)) { + const int mx = rnd() % 1024; + const int my = rnd() % 1024; + const int dx = rnd() % 2048 + 1; + const int dy = !p + ? 
rnd() % 2048 + 1 + : p << 10; // ystep=1.0 and ystep=2.0 paths + + for (int k = 0; k < 263 * 263; k++) + src_buf[k] = rnd() & bitdepth_max; + + call_ref(c_tmp, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + call_new(a_tmp, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp), + a_tmp, w * sizeof(*a_tmp), + w, h, "tmp"); + + if (filter == FILTER_2D_8TAP_REGULAR || + filter == FILTER_2D_BILINEAR) + bench_new(a_tmp, src, src_stride, + w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX); + } + } + } + report("mct_scaled"); +} + static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf, int16_t (*const tmp)[128 * 128], const int bitdepth_max) { @@ -573,12 +680,68 @@ static void check_emuedge(Dav1dMCDSPContext *const c) { report("emu_edge"); } +static int get_upscale_x0(const int in_w, const int out_w, const int step) { + const int err = out_w * step - (in_w << 14); + const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1); + return x0 & 0x3fff; +} + +static void check_resize(Dav1dMCDSPContext *const c) { + ALIGN_STK_64(pixel, c_dst, 1024 * 64,); + ALIGN_STK_64(pixel, a_dst, 1024 * 64,); + ALIGN_STK_64(pixel, src, 512 * 64,); + + const int height = 64; + const int max_src_width = 512; + const ptrdiff_t dst_stride = 1024 * sizeof(pixel); + const ptrdiff_t src_stride = 512 * sizeof(pixel); + + declare_func(void, pixel *dst, ptrdiff_t dst_stride, + const pixel *src, ptrdiff_t src_stride, + int dst_w, int src_w, int h, int dx, int mx0 + HIGHBD_DECL_SUFFIX); + + if (check_func(c->resize, "resize_%dbpc", BITDEPTH)) { +#if BITDEPTH == 16 + const int bitdepth_max = rnd() & 1 ? 
0x3ff : 0xfff; +#else + const int bitdepth_max = 0xff; +#endif + + for (int i = 0; i < max_src_width * height; i++) + src[i] = rnd() & bitdepth_max; + + const int w_den = 9 + (rnd() & 7); + const int src_w = 16 + (rnd() % (max_src_width - 16 + 1)); + const int dst_w = w_den * src_w >> 3; +#define scale_fac(ref_sz, this_sz) \ + ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz)) + const int dx = scale_fac(src_w, dst_w); +#undef scale_fac + const int mx0 = get_upscale_x0(src_w, dst_w, dx); + + call_ref(c_dst, dst_stride, src, src_stride, + dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX); + call_new(a_dst, dst_stride, src, src_stride, + dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX); + checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride, + dst_w, height, "dst"); + + bench_new(a_dst, dst_stride, src, src_stride, + 512, height, 512 * 8 / w_den, dx, mx0 HIGHBD_TAIL_SUFFIX); + } + + report("resize"); +} + void bitfn(checkasm_check_mc)(void) { Dav1dMCDSPContext c; bitfn(dav1d_mc_dsp_init)(&c); check_mc(&c); check_mct(&c); + check_mc_scaled(&c); + check_mct_scaled(&c); check_avg(&c); check_w_avg(&c); check_mask(&c); @@ -589,4 +752,5 @@ void bitfn(checkasm_check_mc)(void) { check_warp8x8(&c); check_warp8x8t(&c); check_emuedge(&c); + check_resize(&c); } diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/msac.c b/ffmpeg/JNI/dav1d/tests/checkasm/msac.c index 482d3af90..cdaf0de81 100644 --- a/ffmpeg/JNI/dav1d/tests/checkasm/msac.c +++ b/ffmpeg/JNI/dav1d/tests/checkasm/msac.c @@ -239,7 +239,7 @@ void checkasm_check_msac(void) { c.bool = dav1d_msac_decode_bool_c; c.hi_tok = dav1d_msac_decode_hi_tok_c; -#if ARCH_AARCH64 && HAVE_ASM +#if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) { c.symbol_adapt4 = dav1d_msac_decode_symbol_adapt4_neon; c.symbol_adapt8 = dav1d_msac_decode_symbol_adapt8_neon; @@ -247,6 +247,7 @@ void checkasm_check_msac(void) { c.bool_adapt = dav1d_msac_decode_bool_adapt_neon; c.bool_equi = 
dav1d_msac_decode_bool_equi_neon; c.bool = dav1d_msac_decode_bool_neon; + c.hi_tok = dav1d_msac_decode_hi_tok_neon; } #elif ARCH_X86 && HAVE_ASM if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) { diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/x86/checkasm.asm b/ffmpeg/JNI/dav1d/tests/checkasm/x86/checkasm.asm index e00e5bd0d..bc7ec2201 100644 --- a/ffmpeg/JNI/dav1d/tests/checkasm/x86/checkasm.asm +++ b/ffmpeg/JNI/dav1d/tests/checkasm/x86/checkasm.asm @@ -27,13 +27,11 @@ %include "config.asm" %include "ext/x86/x86inc.asm" -SECTION_RODATA - -error_message: db "failed to preserve register", 0 +SECTION_RODATA 16 %if ARCH_X86_64 ; just random numbers to reduce the chance of incidental match -ALIGN 16 +%if WIN64 x6: dq 0x1a1b2550a612b48c,0x79445c159ce79064 x7: dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636 x8: dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e @@ -46,6 +44,7 @@ x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5 n7: dq 0x21f86d66c8ca00ce n8: dq 0x75b6ba21077c48ad +%endif n9: dq 0xed56bb2dcb3c7736 n10: dq 0x8bda43d3fd1a7e06 n11: dq 0xb64a9c9e5d318408 @@ -54,6 +53,9 @@ n13: dq 0x4a75479abd64e097 n14: dq 0x249214109d5d1c88 %endif +errmsg_reg: db "failed to preserve register", 0 +errmsg_stack: db "stack corruption", 0 + SECTION .text cextern fail_func @@ -67,7 +69,7 @@ cextern fail_func ;----------------------------------------------------------------------------- ; int checkasm_stack_clobber(uint64_t clobber, ...) 
;----------------------------------------------------------------------------- -cglobal stack_clobber, 1,2 +cglobal stack_clobber, 1, 2 ; Clobber the stack with junk below the stack pointer %define argsize (max_args+6)*8 SUB rsp, argsize @@ -81,9 +83,13 @@ cglobal stack_clobber, 1,2 %if WIN64 %assign free_regs 7 + %define stack_param rsp+32 ; shadow space + %define num_stack_params rsp+stack_offset+22*8 DECLARE_REG_TMP 4 %else %assign free_regs 9 + %define stack_param rsp + %define num_stack_params rsp+stack_offset+16*8 DECLARE_REG_TMP 7 %endif @@ -91,7 +97,7 @@ cglobal stack_clobber, 1,2 ; void checkasm_checked_call(void *func, ...) ;----------------------------------------------------------------------------- INIT_XMM -cglobal checked_call, 2,15,16,max_args*8+8 +cglobal checked_call, 2, 15, 16, max_args*8+64+8 mov t0, r0 ; All arguments have been pushed on the stack instead of registers in @@ -104,20 +110,7 @@ cglobal checked_call, 2,15,16,max_args*8+8 %if UNIX64 mov r4, r10mp mov r5, r11mp - %assign i 6 - %rep max_args-6 - mov r9, [rsp+stack_offset+(i+1)*8] - mov [rsp+(i-6)*8], r9 - %assign i i+1 - %endrep %else ; WIN64 - %assign i 4 - %rep max_args-4 - mov r9, [rsp+stack_offset+(i+7)*8] - mov [rsp+i*8], r9 - %assign i i+1 - %endrep - ; Move possible floating-point arguments to the correct registers movq m0, r0 movq m1, r1 @@ -131,22 +124,44 @@ cglobal checked_call, 2,15,16,max_args*8+8 %endrep %endif + ; write stack canaries to the area above parameters passed on the stack + mov r9d, [num_stack_params] + mov r8, [rsp+stack_offset] ; return address + not r8 +%assign i 0 +%rep 8 ; 64 bytes + mov [stack_param+(r9+i)*8], r8 + %assign i i+1 +%endrep + dec r9d + jl .stack_setup_done ; no stack parameters +.copy_stack_parameter: + mov r8, [stack_param+stack_offset+7*8+r9*8] + mov [stack_param+r9*8], r8 + dec r9d + jge .copy_stack_parameter +.stack_setup_done: + %assign i 14 %rep 15-free_regs mov r %+ i, [n %+ i] %assign i i-1 %endrep call t0 -%assign i 14 -%rep 
15-free_regs + + ; check for failure to preserve registers + xor r14, [n14] + lea r0, [errmsg_reg] +%assign i 13 +%rep 14-free_regs xor r %+ i, [n %+ i] or r14, r %+ i %assign i i-1 %endrep - %if WIN64 - %assign i 6 - %rep 16-6 + pxor m6, [x6] + %assign i 7 + %rep 16-7 pxor m %+ i, [x %+ i] por m6, m %+ i %assign i i+1 @@ -155,14 +170,30 @@ cglobal checked_call, 2,15,16,max_args*8+8 movq r5, m6 or r14, r5 %endif + jnz .fail - ; Call fail_func() with a descriptive message to mark it as a failure - ; if the called function didn't preserve all callee-saved registers. - ; Save the return value located in rdx:rax first to prevent clobbering. + ; check for stack corruption + mov r9d, [num_stack_params] + mov r8, [rsp+stack_offset] + mov r4, [stack_param+r9*8] + not r8 + xor r4, r8 +%assign i 1 +%rep 6 + mov r5, [stack_param+(r9+i)*8] + xor r5, r8 + or r4, r5 + %assign i i+1 +%endrep + xor r8, [stack_param+(r9+7)*8] + or r4, r8 jz .ok + add r0, errmsg_stack-errmsg_reg +.fail: + ; Call fail_func() with a descriptive message to mark it as a failure. + ; Save the return value located in rdx:rax first to prevent clobbering. mov r9, rax mov r10, rdx - lea r0, [error_message] xor eax, eax call fail_func mov rdx, r10 @@ -186,40 +217,70 @@ WARMUP %else ; just random numbers to reduce the chance of incidental match -%define n3 dword 0x6549315c -%define n4 dword 0xe02f3e23 -%define n5 dword 0xb78d0d1d -%define n6 dword 0x33627ba7 +%assign n3 0x6549315c +%assign n4 0xe02f3e23 +%assign n5 0xb78d0d1d +%assign n6 0x33627ba7 ;----------------------------------------------------------------------------- ; void checkasm_checked_call(void *func, ...) 
;----------------------------------------------------------------------------- -cglobal checked_call, 1,7 +cglobal checked_call, 1, 7 + mov r3, [esp+stack_offset] ; return address + mov r1, [esp+stack_offset+17*4] ; num_stack_params + mov r2, 27 + not r3 + sub r2, r1 +.push_canary: + push r3 + dec r2 + jg .push_canary +.push_parameter: + push dword [esp+32*4] + dec r1 + jg .push_parameter mov r3, n3 mov r4, n4 mov r5, n5 mov r6, n6 -%rep max_args - PUSH dword [esp+20+max_args*4] -%endrep call r0 + + ; check for failure to preserve registers xor r3, n3 xor r4, n4 xor r5, n5 xor r6, n6 or r3, r4 or r5, r6 + LEA r1, errmsg_reg or r3, r5 + jnz .fail + + ; check for stack corruption + mov r3, [esp+48*4] ; num_stack_params + mov r6, [esp+31*4] ; return address + mov r4, [esp+r3*4] + sub r3, 26 + not r6 + xor r4, r6 +.check_canary: + mov r5, [esp+(r3+27)*4] + xor r5, r6 + or r4, r5 + inc r3 + jl .check_canary + test r4, r4 jz .ok + add r1, errmsg_stack-errmsg_reg +.fail: mov r3, eax mov r4, edx - LEA r0, error_message - mov [esp], r0 + mov [esp], r1 call fail_func - mov edx, r4 - mov eax, r3 + mov edx, r4 + mov eax, r3 .ok: - add esp, max_args*4 + add esp, 27*4 RET %endif ; ARCH_X86_64 diff --git a/ffmpeg/JNI/dav1d/tests/libfuzzer/dav1d_fuzzer.c b/ffmpeg/JNI/dav1d/tests/libfuzzer/dav1d_fuzzer.c index 9d8b3852a..4506d2f9f 100644 --- a/ffmpeg/JNI/dav1d/tests/libfuzzer/dav1d_fuzzer.c +++ b/ffmpeg/JNI/dav1d/tests/libfuzzer/dav1d_fuzzer.c @@ -69,13 +69,6 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) dav1d_version(); - // memory sanitizer is inherently incompatible with asm -#if defined(__has_feature) - #if __has_feature(memory_sanitizer) - dav1d_set_cpu_flags_mask(0); - #endif -#endif - if (size < 32) goto end; #ifdef DAV1D_ALLOC_FAIL unsigned h = djb_xor(ptr, 32); diff --git a/ffmpeg/JNI/dav1d/tools/dav1d.c b/ffmpeg/JNI/dav1d/tools/dav1d.c index 97c780146..4b97a9f20 100644 --- a/ffmpeg/JNI/dav1d/tools/dav1d.c +++ b/ffmpeg/JNI/dav1d/tools/dav1d.c @@ -63,7 
+63,9 @@ static uint64_t get_time_nanos(void) { QueryPerformanceFrequency(&frequency); LARGE_INTEGER t; QueryPerformanceCounter(&t); - return 1000000000 * t.QuadPart / frequency.QuadPart; + uint64_t seconds = t.QuadPart / frequency.QuadPart; + uint64_t fractions = t.QuadPart % frequency.QuadPart; + return 1000000000 * seconds + 1000000000 * fractions / frequency.QuadPart; #elif defined(HAVE_CLOCK_GETTIME) struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); @@ -245,7 +247,7 @@ int main(const int argc, char *const *const argv) { if ((res = output_write(out, &p)) < 0) break; n_out++; - if (nspf) { + if (nspf || !cli_settings.quiet) { synchronize(cli_settings.realtime, cli_settings.realtime_cache, n_out, nspf, tfirst, &elapsed, frametimes); } @@ -282,7 +284,7 @@ int main(const int argc, char *const *const argv) { if ((res = output_write(out, &p)) < 0) break; n_out++; - if (nspf) { + if (nspf || !cli_settings.quiet) { synchronize(cli_settings.realtime, cli_settings.realtime_cache, n_out, nspf, tfirst, &elapsed, frametimes); } diff --git a/ffmpeg/JNI/dav1d/tools/dav1d_cli_parse.c b/ffmpeg/JNI/dav1d/tools/dav1d_cli_parse.c index 98b425317..f363033ed 100644 --- a/ffmpeg/JNI/dav1d/tools/dav1d_cli_parse.c +++ b/ffmpeg/JNI/dav1d/tools/dav1d_cli_parse.c @@ -84,6 +84,8 @@ static const struct option long_opts[] = { #if ARCH_AARCH64 || ARCH_ARM #define ALLOWED_CPU_MASKS " or 'neon'" +#elif ARCH_PPC64LE +#define ALLOWED_CPU_MASKS " or 'vsx'" #elif ARCH_X86 #define ALLOWED_CPU_MASKS \ ", 'sse2', 'ssse3', 'sse41', 'avx2' or 'avx512icl'" @@ -116,7 +118,7 @@ static void usage(const char *const app, const char *const reason, ...) 
{ " --framethreads $num: number of frame threads (default: 1)\n" " --tilethreads $num: number of tile threads (default: 1)\n" " --filmgrain $num: enable film grain application (default: 1, except if muxer is md5)\n" - " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 32)\n" + " --oppoint $num: select an operating point of a scalable AV1 bitstream (0 - 31)\n" " --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n" " --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n" " --verify $md5: verify decoded md5. implies --muxer md5, no output\n" @@ -187,6 +189,8 @@ enum CpuMask { static const EnumParseTable cpu_mask_tbl[] = { #if ARCH_AARCH64 || ARCH_ARM { "neon", DAV1D_ARM_CPU_FLAG_NEON }, +#elif ARCH_PPC64LE + { "vsx", DAV1D_PPC_CPU_FLAG_VSX }, #elif ARCH_X86 { "sse2", X86_CPU_MASK_SSE2 }, { "ssse3", X86_CPU_MASK_SSSE3 }, diff --git a/ffmpeg/JNI/dav1d/tools/input/input.c b/ffmpeg/JNI/dav1d/tools/input/input.c index d8a56c182..3ed6983ac 100644 --- a/ffmpeg/JNI/dav1d/tools/input/input.c +++ b/ffmpeg/JNI/dav1d/tools/input/input.c @@ -82,6 +82,10 @@ int input_open(DemuxerContext **const c_out, return DAV1D_ERR(ENOMEM); } FILE *f = fopen(filename, "rb"); + if (!f) { + fprintf(stderr, "Failed to open input file %s: %s\n", filename, strerror(errno)); + return errno ? 
DAV1D_ERR(errno) : DAV1D_ERR(EIO); + } res = !!fread(probe_data, 1, probe_sz, f); fclose(f); if (!res) { diff --git a/ffmpeg/JNI/dav1d/tools/input/ivf.c b/ffmpeg/JNI/dav1d/tools/input/ivf.c index 746391d4c..7b572ee73 100644 --- a/ffmpeg/JNI/dav1d/tools/input/ivf.c +++ b/ffmpeg/JNI/dav1d/tools/input/ivf.c @@ -28,6 +28,7 @@ #include "config.h" #include +#include #include #include #include @@ -92,8 +93,27 @@ static int ivf_open(IvfInputContext *const c, const char *const file, break; // EOF fseeko(c->f, rl32(data) + 8, SEEK_CUR); } - fps[0] = timebase[0] * *num_frames; - fps[1] = timebase[1] * duration; + + uint64_t fps_num = (uint64_t) timebase[0] * *num_frames; + uint64_t fps_den = (uint64_t) timebase[1] * duration; + if (fps_num && fps_den) { /* Reduce fraction */ + uint64_t gcd = fps_num; + for (uint64_t a = fps_den, b; (b = a % gcd); a = gcd, gcd = b); + fps_num /= gcd; + fps_den /= gcd; + + while ((fps_num | fps_den) > UINT_MAX) { + fps_num >>= 1; + fps_den >>= 1; + } + } + if (fps_num && fps_den) { + fps[0] = (unsigned) fps_num; + fps[1] = (unsigned) fps_den; + } else { + fps[0] = fps[1] = 0; + } + fseeko(c->f, 32, SEEK_SET); return 0;