diff --git a/ffmpeg/JNI/dav1d/.gitignore b/ffmpeg/JNI/dav1d/.gitignore
new file mode 100644
index 000000000..2bbd7c48a
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/.gitignore
@@ -0,0 +1,8 @@
+/build*
+/Session.vim
+[._]*.swp
+*~
+tags
+.DS_Store
+/tests/dav1d-test-data
+*.snap
diff --git a/ffmpeg/JNI/dav1d/.gitlab-ci.yml b/ffmpeg/JNI/dav1d/.gitlab-ci.yml
new file mode 100644
index 000000000..c921b6a12
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/.gitlab-ci.yml
@@ -0,0 +1,579 @@
+stages:
+    - style
+    - build
+    - test
+
+.debian-amd64-common:
+    image: registry.videolan.org/dav1d-debian-unstable:20200602183013
+    stage: build
+    tags:
+        - docker
+        - amd64
+
+.debian-llvm-mingw-common:
+    image: registry.videolan.org/vlc-debian-llvm-mingw:20190218133533
+    stage: build
+    tags:
+        - docker
+        - amd64
+
+.debian-aarch64-common:
+    image: registry.videolan.org/dav1d-debian-buster-aarch64:20200218203017
+    stage: build
+    tags:
+        - docker
+        - aarch64
+
+.debian-armv7-common:
+    image: registry.videolan.org/dav1d-debian-unstable-armv7:20190202101732
+    stage: build
+    tags:
+        - docker
+        - armv7
+
+.debian-ppc64le-common:
+    image: registry.videolan.org/dav1d-debian-unstable-ppc64le:20190606105121
+    stage: build
+    tags:
+        - docker
+        - ppc64le
+
+.ubuntu-common:
+    image: registry.videolan.org/dav1d-ubuntu-bionic:20200121182340
+    stage: build
+    tags:
+        - docker
+        - amd64
+
+.android-common:
+    image: registry.videolan.org/vlc-debian-android:20200323093226
+    stage: build
+    tags:
+        - docker
+        - amd64
+
+
+style-check:
+    extends: .debian-amd64-common
+    stage: style
+    script:
+        - git grep -I -n -P "\t|\r| $" -- . ':(exclude)*/compat/*' && echo "Trailing whitespace" && exit 1
+        - git grep -I -n -i -e 'david' --and --not -e 'copyright' -- . ':(exclude)THANKS.md' ':(exclude).gitlab-ci.yml' && echo "Misspelled dav1d" && exit 1
+        - git grep -I -l -z "" -- . ':(exclude)*/compat/*' | while IFS= read -r -d '' i; do
+              if [ -n "$(tail -c 1 "$i")" ]; then
+                  echo "No newline at end of $i";
+                  exit 1;
+              fi;
+          done
+        - git remote rm upstream 2> /dev/null || true
+        - git remote add upstream https://code.videolan.org/videolan/dav1d.git
+        - git fetch -q upstream master
+        - for i in $(git rev-list HEAD ^upstream/master); do
+              echo "Checking commit message of $i";
+              msg="$(git log --format=%B -n 1 $i)";
+              if [ -n "$(echo "$msg" | awk "NR==2")" ]; then
+                  echo "Malformed commit message in $i, second line must be empty";
+                  exit 1;
+              fi;
+              if echo "$msg" | head -1 | grep -q '\.$'; then
+                  echo "Malformed commit message in $i, trailing period in subject line";
+                  exit 1;
+              fi;
+          done
+
+
+build-debian:
+    extends: .debian-amd64-common
+    tags:
+        - docker
+        - avx2
+        - amd64
+    script:
+        - meson build --buildtype release
+                      --werror
+        - ninja -C build
+        - cd build && meson test -v
+    artifacts:
+        paths:
+            - build/
+        expire_in: 1 day
+
+build-debian-static:
+    extends: .debian-amd64-common
+    script:
+        - meson build --buildtype release
+                      --default-library static
+                      --werror
+        - ninja -C build
+        - cd build && meson test -v
+        - nm -A -g src/libdav1d.a | grep " [ABCDGRST] " | (! grep -v " _*dav1d_")
+
+build-debian32:
+    extends: .debian-amd64-common
+    script:
+        - meson build --buildtype release
+                      --werror
+                      --cross-file package/crossfiles/i686-linux32.meson
+        - ninja -C build
+        - cd build && meson test -v
+    artifacts:
+        paths:
+            - build/
+        expire_in: 1 day
+
+build-debian-examples:
+    extends: .debian-amd64-common
+    script:
+        - meson build --buildtype release
+                      --werror
+                      -Denable_examples=true
+        - ninja -C build
+
+build-win32:
+    extends: .debian-amd64-common
+    script:
+        - wineserver -p && wine wineboot
+        - meson build --buildtype release
+                      --werror
+                      --libdir lib
+                      --prefix "$(pwd)/build/dav1d_install"
+                      --cross-file package/crossfiles/i686-w64-mingw32.meson
+                      -Ddefault_library=both
+        - ninja -C build
+        - ninja -C build install
+        - cd build && meson test -v
+        - i686-w64-mingw32-nm -A -g src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_")
+    artifacts:
+        name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
+        paths:
+            - build/dav1d_install/
+        expire_in: 1 week
+
+build-win32-unaligned-stack:
+    extends: .debian-llvm-mingw-common
+    script:
+        - wineserver -p && wine wineboot
+        - meson build --buildtype release
+                      --werror
+                      --cross-file package/crossfiles/i686-w64-mingw32.meson
+                      -Dstack_alignment=4
+        - ninja -C build
+        - cd build && meson test -v
+
+build-win64:
+    extends: .debian-amd64-common
+    script:
+        - wineserver -p && wine wineboot
+        - meson build --buildtype release
+                      --werror
+                      --libdir lib
+                      --prefix "$(pwd)/build/dav1d_install"
+                      --cross-file package/crossfiles/x86_64-w64-mingw32.meson
+                      -Ddefault_library=both
+        - ninja -C build
+        - ninja -C build install
+        - cd build && meson test -v
+        - x86_64-w64-mingw32-nm -A -g src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_")
+    artifacts:
+        name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
+        paths:
+            - build/dav1d_install/
+        expire_in: 1 week
+
+build-win-arm32:
+    extends: .debian-llvm-mingw-common
+    script:
+        - meson build --buildtype release
+                      --werror
+                      --libdir lib
+                      --prefix "$(pwd)/build/dav1d_install"
+                      --cross-file /opt/crossfiles/armv7-w64-mingw32.meson
+                      -Ddefault_library=both
+        - ninja -C build
+        - armv7-w64-mingw32-nm -A -g build/src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_")
+
+build-win-arm64:
+    extends: .debian-llvm-mingw-common
+    script:
+        - meson build --buildtype release
+                      --werror
+                      --libdir lib
+                      --prefix "$(pwd)/build/dav1d_install"
+                      --cross-file /opt/crossfiles/aarch64-w64-mingw32.meson
+                      -Ddefault_library=both
+        - ninja -C build
+        - ninja -C build install
+        - aarch64-w64-mingw32-nm -A -g build/src/libdav1d.a | grep " [ABCDGRST] " | (! grep -E -v " \.| _*dav1d_")
+    artifacts:
+        name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
+        paths:
+            - build/dav1d_install/
+        expire_in: 1 week
+
+.build-android-common:
+    extends: .android-common
+    script:
+        - meson build --buildtype release
+                      --werror
+                      --libdir lib
+                      --prefix "$(pwd)/build/dav1d_install"
+                      --cross-file $CROSSFILE
+                      -Ddefault_library=both
+        - ninja -C build
+        - ninja -C build install
+
+build-android-armv7:
+    extends: .build-android-common
+    variables:
+        CROSSFILE: package/crossfiles/arm-android.meson
+    except:
+        - tags
+
+build-android-aarch64:
+    extends: .build-android-common
+    variables:
+        CROSSFILE: package/crossfiles/aarch64-android.meson
+    except:
+        - tags
+
+build-android-armv7-release:
+    extends: build-android-armv7
+    except:
+    only:
+        refs:
+            - tags@videolan/dav1d
+    artifacts:
+        name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
+        paths:
+            - build/dav1d_install/
+        expire_in: 1 week
+
+build-android-aarch64-release:
+    extends: build-android-aarch64
+    except:
+    only:
+        refs:
+            - tags@videolan/dav1d
+    artifacts:
+        name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
+        paths:
+            - build/dav1d_install/
+        expire_in: 1 week
+
+build-debian-aarch64:
+    extends: .debian-aarch64-common
+    script:
+        - meson build --buildtype debugoptimized
+                      --werror
+        - ninja -C build
+        - cd build && meson test -v
+
+build-debian-aarch64-clang-5:
+    extends: .debian-aarch64-common
+    variables:
+        CC: clang-5.0
+        CFLAGS: '-integrated-as'
+    script:
+        - meson build --buildtype release
+        - ninja -C build
+        - cd build && meson test -v
+
+build-macos:
+    stage: build
+    tags:
+        - macos
+    script:
+        - meson build --buildtype release
+                      -Ddefault_library=both
+                      --werror
+        - ninja -C build
+        - cd build && meson test -v
+
+build-debian-werror:
+    extends: .debian-aarch64-common
+    variables:
+        CC: clang-7
+    script:
+        - meson build --buildtype debug
+                      --werror
+        - ninja -C build
+
+build-debian-armv7:
+    extends: .debian-armv7-common
+    script:
+        - linux32 meson build --buildtype debugoptimized
+                              --werror
+        - ninja -C build
+        - cd build && meson test -v
+
+build-debian-armv7-clang-5:
+    extends: .debian-armv7-common
+    variables:
+        CC: clang-5.0
+        CFLAGS: '-integrated-as'
+    script:
+        - linux32 meson build --buildtype release
+        - ninja -C build
+        - cd build && meson test -v
+
+build-ubuntu-snap:
+    extends: .ubuntu-common
+    script:
+        - cd package/snap && snapcraft snap
+        - |
+           if [ "$CI_PROJECT_NAMESPACE" = "videolan" ]; then
+            echo $SNAP_LOGIN | base64 --decode | snapcraft login --with -
+            snapcraft push dav1d_*.snap --release edge
+            snapcraft logout
+           fi
+    artifacts:
+        name: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
+        paths:
+            - package/snap/dav1d_*.snap
+        expire_in: 1 week
+    allow_failure: true
+
+build-debian-ppc64le:
+    extends: .debian-ppc64le-common
+    script:
+        - meson build --buildtype release
+                      --werror
+        - ninja -C build
+        - cd build && meson test -v
+
+
+.test-common:
+    stage: test
+    cache:
+        key: testdata.git-20190215
+        paths:
+            - cache/dav1d-test-data.git/
+    before_script:
+        - test -d cache || mkdir cache
+        - test -d cache/dav1d-test-data.git && GIT_DIR=cache/dav1d-test-data.git git fetch --refmap=refs/heads/master:refs/heads/master origin master
+        - test -d cache/dav1d-test-data.git || git clone --bare https://code.videolan.org/videolan/dav1d-test-data.git cache/dav1d-test-data.git
+        - git clone cache/dav1d-test-data.git tests/dav1d-test-data
+    dependencies: []
+
+.test-asm-common:
+    extends:
+        - .debian-amd64-common
+        - .test-common
+    tags:
+        - docker
+        - amd64
+        - avx2
+    script:
+        - meson configure build -Dtestdata_tests=true
+        - cd build
+        - exit_code=0
+        - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask 0"     || exit_code=$((exit_code + $?))
+        - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse2"  || exit_code=$((exit_code + $?))
+        - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask ssse3" || exit_code=$((exit_code + $?))
+        - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask sse41" || exit_code=$((exit_code + $?))
+        - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--cpumask avx2"  || exit_code=$((exit_code + $?))
+        - if [ $exit_code -ne 0 ]; then exit $exit_code; fi
+
+test-debian:
+    extends:
+        - .debian-amd64-common
+        - .test-common
+    needs: ["build-debian"]
+    script:
+        - meson build --buildtype release
+                      -Dtestdata_tests=true
+                      -Dlogging=false
+                      -Db_coverage=true
+        - ninja -C build
+        - cd build && time meson test -v
+        - ninja coverage-html
+        - mv meson-logs/coveragereport ../coverage
+        - ninja coverage-xml
+        - grep -Eo 'line-rate="[^"]+"' meson-logs/coverage.xml | head -n 1 |
+          grep -Eo '[0-9.]+' | awk '{ print "coverage:", $1 * 100 } '
+    coverage: '/^coverage: (\d+.\d+)$/'
+    artifacts:
+        expose_as: 'Coverage HTML report'
+        paths:
+            - coverage/
+        reports:
+            cobertura: build/meson-logs/coverage.xml
+
+test-debian-asm:
+    extends:
+        - .test-asm-common
+    needs: ["build-debian"]
+    dependencies: ["build-debian"]
+
+test-debian32-asm:
+    extends:
+        - .test-asm-common
+    needs: ["build-debian32"]
+    dependencies: ["build-debian32"]
+
+test-debian-mt:
+    extends:
+        - .debian-amd64-common
+        - .test-common
+    needs: ["build-debian"]
+    dependencies: ["build-debian"]
+    script:
+        - meson configure build -Dtestdata_tests=true
+        - cd build
+        - exit_code=0
+        - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 1 --framethreads 2" || exit_code=$((exit_code + $?))
+        - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 1" || exit_code=$((exit_code + $?))
+        - time meson test -q --suite testdata-8 --suite testdata-10 --suite testdata-12 --test-args "--tilethreads 2 --framethreads 2" || exit_code=$((exit_code + $?))
+        - if [ $exit_code -ne 0 ]; then exit $exit_code; fi
+
+test-debian-unaligned-stack:
+    extends:
+        - .debian-amd64-common
+        - .test-common
+    needs: ["build-debian"]
+    tags:
+        - docker
+        - avx2
+        - amd64
+    script:
+        - meson build --buildtype release
+                      -Dtestdata_tests=true
+                      -Dlogging=false
+                      -Dstack_alignment=16
+        - ninja -C build
+        - cd build && time meson test -v
+
+test-debian-asan:
+    extends:
+        - .debian-amd64-common
+        - .test-common
+    needs: ["build-debian"]
+    variables:
+        ASAN_OPTIONS: 'detect_leaks=0'
+    script:
+        - meson build --buildtype debugoptimized
+                      -Dtestdata_tests=true
+                      -Dlogging=false
+                      -Db_sanitize=address
+                      -Denable_asm=false
+        - ninja -C build
+        - cd build && time meson test -v --setup=sanitizer
+
+test-debian-msan:
+    extends:
+        - .debian-amd64-common
+        - .test-common
+    needs: ["build-debian"]
+    variables:
+        MSAN_OPTIONS: 'exitcode=1'
+        CC: clang
+    script:
+        - meson build --buildtype debugoptimized
+                      -Dtestdata_tests=true
+                      -Dlogging=false
+                      -Db_sanitize=memory
+                      -Db_lundef=false
+                      -Denable_asm=false
+        - ninja -C build
+        - cd build && time meson test -v --setup=sanitizer
+
+test-debian-ubsan:
+    extends:
+        - .debian-amd64-common
+        - .test-common
+    needs: ["build-debian"]
+    variables:
+        UBSAN_OPTIONS: 'print_stacktrace=1:halt_on_error=1'
+        CC: clang
+    script:
+        - meson build --buildtype debugoptimized
+                      -Dtestdata_tests=true
+                      -Dlogging=false
+                      -Db_sanitize=undefined
+                      -Db_lundef=false
+                      -Denable_asm=false
+        - ninja -C build
+        - cd build && time meson test -v --setup=sanitizer
+
+test-win64:
+    extends:
+        - .debian-amd64-common
+        - .test-common
+    needs: ["build-win64"]
+    tags:
+        - docker
+        - avx2
+        - amd64
+    script:
+        - wineserver -p && wine wineboot
+        - meson build --buildtype release
+                      -Dtestdata_tests=true
+                      -Dlogging=false
+                      --cross-file package/crossfiles/x86_64-w64-mingw32.meson
+        - ninja -C build
+        - cd build && time meson test -v
+
+test-debian-aarch64:
+    extends:
+        - .debian-aarch64-common
+        - .test-common
+    needs: ["build-debian-aarch64"]
+    script:
+        - meson build --buildtype release
+                      -Dtestdata_tests=true
+                      -Dlogging=false
+        - ninja -C build
+        - cd build && time meson test -v
+
+test-debian-ppc64le:
+    extends:
+        - .debian-ppc64le-common
+        - .test-common
+    needs: ["build-debian-ppc64le"]
+    script:
+        - meson build --buildtype release
+                      -Dtestdata_tests=true
+                      -Dlogging=false
+        - ninja -C build
+        - cd build && time meson test -v
+
+test-debian-armv7-clang-5:
+    extends:
+        - .debian-armv7-common
+        - .test-common
+    needs: ["build-debian-armv7-clang-5"]
+    variables:
+        CC: clang-5.0
+        CFLAGS: '-integrated-as'
+    script:
+        - linux32 meson build --buildtype release
+                              -Dtestdata_tests=true
+                              -Dlogging=false
+        - ninja -C build
+        - cd build && time meson test -v
+
+
+.pages-common:
+    extends: .debian-amd64-common
+    script:
+        - meson build --buildtype release
+                      --werror
+        - ninja -C build doc/html
+        - mv build/doc/html public
+    artifacts:
+        paths:
+            - public
+
+build-pages:
+    extends: .pages-common
+    except:
+        refs:
+            - master
+
+pages:
+    extends: .pages-common
+    only:
+        refs:
+            - master
+        changes:
+            - include/dav1d/*
diff --git a/ffmpeg/JNI/dav1d/NEWS b/ffmpeg/JNI/dav1d/NEWS
index 46695fd7e..1294dc52c 100644
--- a/ffmpeg/JNI/dav1d/NEWS
+++ b/ffmpeg/JNI/dav1d/NEWS
@@ -1,3 +1,33 @@
+Changes for 0.7.1 'Frigatebird':
+------------------------------
+
+0.7.1 is a minor update on 0.7.0:
+ - ARM32 NEON optimizations for itxfm (which can give up to a 28% speedup) and MSAC
+ - SSE2 optimizations for prep_bilin and prep_8tap
+ - AVX2 optimizations for MC scaled
+ - Fix a clamping issue in motion vector projection
+ - Fix an issue in the ipred_z AVX2 functions on some specific Haswell CPUs
+ - Improvements to the dav1dplay utility player to support resizing
+
+
+Changes for 0.7.0 'Frigatebird':
+------------------------------
+
+0.7.0 is a major release for dav1d:
+ - Faster refmv implementation, gaining up to 12% in speed while using 25% less RAM (single thread)
+ - 10b/12b ARM64 optimizations are mostly complete:
+   - ipred (paeth, smooth, dc, pal, filter, cfl)
+   - itxfm (only 10b)
+ - AVX2/SSSE3 for non-4:2:0 film grain and for mc.resize
+ - AVX2 for cfl4:4:4
+ - AVX-512 CDEF filter
+ - ARM64 8b improvements for cfl_ac and itxfm
+ - ARM64 implementation for emu_edge in 8b/10b/12b
+ - ARM32 implementation for emu_edge in 8b
+ - Improvements to the dav1dplay utility player to support 10 bit,
+   non-4:2:0 pixel formats and film grain on the GPU
+
+
 Changes for 0.6.0 'Gyrfalcon':
 ------------------------------
 
diff --git a/ffmpeg/JNI/dav1d/builddir/.ninja_deps b/ffmpeg/JNI/dav1d/builddir/.ninja_deps
deleted file mode 100644
index d6da49ae7..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/.ninja_deps and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/.ninja_log b/ffmpeg/JNI/dav1d/builddir/.ninja_log
deleted file mode 100644
index 5ca13a284..000000000
--- a/ffmpeg/JNI/dav1d/builddir/.ninja_log
+++ /dev/null
@@ -1,107 +0,0 @@
-# ninja log v5
-1	134	1584948901715376510	src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o	63af13494d15f9e
-1	199	1584948901779084976	src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o	57e06d2e8b8a986d
-1	212	1584948901773226508	include/vcs_version.h	720dab2031fdd723
-1	234	1584948901815039592	src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o	4ac5d155afbcb60a
-135	244	1584948901825083166	src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o	54afce5a6ebd5420
-234	307	1584948901888586245	src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o	2a411c45413fefeb
-244	320	1584948901901022470	src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o	6692e2267b8bbf1e
-212	379	1584948901960473837	src/25a6634@@dav1d_entrypoint@sta/lib.c.o	199037861e6ad90f
-199	493	1584948902074129009	src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o	fa6108b1cab13f0d
-379	498	1584948902078574214	src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o	c6905b7df8c0b378
-307	596	1584948902176437182	src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o	527c79441fca8395
-498	606	1584948902186764947	src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o	6e5791172d4a6d12
-493	703	1584948902284140586	src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o	506378dcde1797c7
-703	758	1584948902340028880	src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o	45b1bd1750ea59dc
-758	797	1584948902378937784	src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o	54bfa84920242e6d
-797	838	1584948902419225754	src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o	cb71905026629629
-838	874	1584948902456382121	src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o	fc7f31cdf6ac6866
-874	932	1584948902513985277	src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o	97f89106ddf15b2e
-932	968	1584948902550096862	src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o	a78d4f76c4cd24bf
-320	1068	1584948902647783370	src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o	8e06c30e33d45a74
-596	1161	1584948902741902452	src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o	c2e88fb1780dcb1b
-1161	1261	1584948902842725034	src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o	52071763b5fb7c8c
-606	1263	1584948902843761879	src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o	8620471446f1a24c
-1263	1341	1584948902922273122	src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o	c8ce7a0514dd6f4a
-1068	1367	1584948902948149672	src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o	dd63d26939d67b49
-1367	1472	1584948903053372842	src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o	8ea862ebcecc14a2
-1261	1485	1584948903064012494	src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o	2884ac8fec1268dc
-968	1584	1584948903163358420	src/25a6634@@dav1d@sta/cdf.c.o	9fa40041e89c2397
-1472	1650	1584948903231730373	src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o	741537d11aadd8f6
-1341	1669	1584948903248710277	src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o	41382fed1ab53767
-1584	1705	1584948903286615965	src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o	1f7e42678bb55bf0
-1705	1822	1584948903392510393	src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o	b74a0dcb55ddba0a
-1650	1859	1584948903440845056	src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o	f03ad3a8e9e5d87a
-1859	1894	1584948903476364274	src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o	b41dcd9bd9e1e2f5
-1894	1927	1584948903508419851	src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o	25a23a02ca5267eb
-1927	1960	1584948903542059401	src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o	6246bd8d9842a39d
-1960	1994	1584948903576063730	src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o	9ce4054292d62405
-1994	2029	1584948903610462476	src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o	23904c1926deefb4
-2029	2064	1584948903645956346	src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o	864659b4941d840d
-2064	2119	1584948903699972440	src/25a6634@@dav1d@sta/data.c.o	c97c3991b5c52447
-2119	2153	1584948903734869534	src/25a6634@@dav1d@sta/cpu.c.o	c319040c90560143
-2153	2184	1584948903766132440	src/25a6634@@dav1d@sta/dequant_tables.c.o	98b7f39779894c3f
-1485	2198	1584948903777209672	src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o	a49e54bdd88ad78f
-2198	2252	1584948903833839133	src/25a6634@@dav1d@sta/intra_edge.c.o	921abe41ade11d4a
-2252	2334	1584948903915674470	src/25a6634@@dav1d@sta/getbits.c.o	7edd9c77eb0463eb
-1669	2336	1584948903915988522	src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o	d1b9576ced0ee792
-2334	2393	1584948903973850148	src/25a6634@@dav1d@sta/log.c.o	ed43381c58dc7191
-1822	2432	1584948904012019086	src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o	d5764b68221e0015
-2393	2502	1584948904083057236	src/25a6634@@dav1d@sta/msac.c.o	683ef2041609e4f7
-2336	2544	1584948904125387224	src/25a6634@@dav1d@sta/lf_mask.c.o	6326820028ef07d9
-2502	2593	1584948904174567562	src/25a6634@@dav1d@sta/picture.c.o	8073857895928851
-2593	2638	1584948904220475362	src/25a6634@@dav1d@sta/ref.c.o	1357cb8fe319538d
-2544	2690	1584948904271216502	src/25a6634@@dav1d@sta/qm.c.o	3f6359d4c064cc1b
-2690	2726	1584948904307960763	src/25a6634@@dav1d@sta/scan.c.o	9dd2404340150d4d
-2726	2766	1584948904348759953	src/25a6634@@dav1d@sta/tables.c.o	5e5f4d508b28c29b
-2766	2841	1584948904422572982	src/25a6634@@dav1d@sta/warpmv.c.o	8fbe369b6cc02b72
-2432	2855	1584948904433868019	src/25a6634@@dav1d@sta/obu.c.o	532939c420dedc6e
-2841	2872	1584948904454410064	src/25a6634@@dav1d@sta/arm_cpu.c.o	1dda6294bf784f6a
-2872	2908	1584948904491245148	src/25a6634@@dav1d@sta/arm_64_ipred.S.o	1bb8ad968f2aa0b1
-2908	2946	1584948904529304426	src/25a6634@@dav1d@sta/arm_64_cdef.S.o	bc4f68ed4ef1fa4a
-2855	2955	1584948904536297638	src/25a6634@@dav1d@sta/wedge.c.o	f558f98ac1f399b5
-2946	2990	1584948904572818358	src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o	3a0ad77f72e591f
-2990	3036	1584948904618300703	src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o	6aeb6615068e3bbb
-2955	3048	1584948904630063973	src/25a6634@@dav1d@sta/arm_64_itx.S.o	21e1d3bc160da695
-3048	3083	1584948904665979461	src/25a6634@@dav1d@sta/arm_64_msac.S.o	353617e90a5feb52
-3036	3109	1584948904691642850	src/25a6634@@dav1d@sta/arm_64_mc.S.o	12bdcabecee75a2a
-3083	3138	1584948904719926233	tools/f9d35d4@@dav1d_output@sta/output_output.c.o	f24d74ba2e61804c
-3109	3160	1584948904742150171	tools/f9d35d4@@dav1d_input@sta/input_input.c.o	63fef28132c33045
-3160	3216	1584948904798285359	tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o	c961479845bef020
-3138	3223	1584948904804592066	tools/f9d35d4@@dav1d@exe/dav1d.c.o	e663372975514dc9
-2638	3243	1584948904823108461	src/25a6634@@dav1d@sta/ref_mvs.c.o	7cf9413db12b89e2
-3216	3273	1584948904854510220	tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o	17146dd873f61bd2
-3243	3281	1584948904863250787	tools/f9d35d4@@dav1d_output@sta/output_null.c.o	799e46c7044fbba7
-3273	3285	1584948904869336372	tools/libdav1d_input.a	d1d19e74400c63af
-3223	3321	1584948904902946406	tools/f9d35d4@@dav1d_output@sta/output_md5.c.o	78839ae8fbbb6080
-3285	3335	1584948904916510195	tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o	18111b437e7df2e3
-3281	3336	1584948904918189031	tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o	437fb9d279e9395d
-3336	3349	1584948904932959283	tools/libdav1d_output.a	bfd1e8ba347863dc
-3321	3408	1584948904990211551	tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o	c14bf3f2fed757f3
-3335	3424	1584948905005005629	tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o	a6b5260fbee7f276
-3424	3539	1584948905120383885	tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o	74622b2bd3d21aa1
-3349	3559	1584948905139162384	tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o	5f77c640558b74f7
-3409	3571	1584948905151863556	tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o	bd69fbe58c0cfb15
-3571	3659	1584948905240602812	tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o	c9faeb5401644bda
-3559	3672	1584948905252122144	tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o	84b5a3753976f436
-3539	3712	1584948905293499168	tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o	aa739c03a8364c81
-3659	3747	1584948905328092322	tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o	d48f538430ebe5c6
-3712	3865	1584948905445168732	tests/59830eb@@checkasm@exe/checkasm_msac.c.o	c3b04750215fba44
-3672	3874	1584948905453727244	tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o	43e69bc880cd888e
-3747	3903	1584948905483961705	tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o	b7e1b02b93437599
-2184	3912	1584948905490730646	src/25a6634@@dav1d@sta/decode.c.o	7c51ea4d4642b07e
-3912	3937	1584948905520897311	src/libdav1d.a	70adcc6d3ba339f8
-3937	3969	1584948905552837065	tools/dav1d	840267a2e6502530
-3865	3987	1584948905568148256	tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o	38c12806557f32a5
-3903	4027	1584948905608624537	tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o	b6e2f5cdd75acbc2
-3874	4059	1584948905639930259	tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o	883b1b14b9f32683
-3969	4068	1584948905649021650	tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o	a3100295f88374d3
-4027	4080	1584948905661374067	tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o	5e90bb020839ec94
-4059	4093	1584948905676363368	tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o	21398b174c915c0e
-4068	4112	1584948905694249629	tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o	ef706d11756679b
-4080	4130	1584948905711752152	tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o	53bf4f318fdf96c6
-4093	4136	1584948905718431021	tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o	4338f5f1f147ab3f
-4130	4162	1584948905745600609	tests/dav1d_fuzzer_mt	e068c7811100cda3
-4136	4168	1584948905751526156	tests/dav1d_fuzzer	8bf38da49ef16c7c
-3987	4202	1584948905783451792	tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o	5d4ac59be93c6b36
-4202	4245	1584948905828566274	tests/checkasm	b72edfee09127526
diff --git a/ffmpeg/JNI/dav1d/builddir/build.ninja b/ffmpeg/JNI/dav1d/builddir/build.ninja
deleted file mode 100644
index 4fb0b2e19..000000000
--- a/ffmpeg/JNI/dav1d/builddir/build.ninja
+++ /dev/null
@@ -1,575 +0,0 @@
-# This is the build file for project "dav1d"
-# It is autogenerated by the Meson build system.
-# Do not edit by hand.
-
-ninja_required_version = 1.5.1
-
-# Rules for compiling.
-
-rule c_COMPILER
- command = /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang $ARGS -MD -MQ $out -MF '$DEPFILE' -o $out -c $in
- deps = gcc
- depfile = $DEPFILE
- description = Compiling C object $out.
-
-# Rules for linking.
-
-rule STATIC_LINKER
- command = rm -f $out && /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-ar $LINK_ARGS $out $in
- description = Linking static target $out.
-
-rule c_LINKER
- command = /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang $ARGS -o $out $in $LINK_ARGS
- description = Linking target $out.
-
-# Other rules
-
-rule CUSTOM_COMMAND
- command = $COMMAND
- description = $DESC
- restat = 1
-
-rule REGENERATE_BUILD
- command = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal regenerate /Users/zlin/workspace/mxcore/media_player/jni/dav1d /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir --backend ninja
- description = Regenerating build files.
- generator = 1
-
-# Phony build target, always out of date
-
-build PHONY: phony 
-
-# Build rules for targets
-
-build include/vcs_version.h: CUSTOM_COMMAND ../include/vcs_version.h.in | PHONY
- COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal vcstagger ../include/vcs_version.h.in include/vcs_version.h 0.4.0 /Users/zlin/workspace/mxcore/media_player/jni/dav1d/include @VCS_TAG@ '(.*)' /Library/Developer/CommandLineTools/usr/bin/git --git-dir /Users/zlin/workspace/mxcore/media_player/jni/dav1d/.git describe --tags --long --match '?.*.*' --always
- description = Generating$ vcs_version.h$ with$ a$ custom$ command.
-
-build src/25a6634@@dav1d_entrypoint@sta/lib.c.o: c_COMPILER ../src/lib.c || include/vcs_version.h
- DEPFILE = src/25a6634@@dav1d_entrypoint@sta/lib.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_entrypoint@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC
-
-build src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o: c_COMPILER ../src/thread_task.c || include/vcs_version.h
- DEPFILE = src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_entrypoint@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC
-
-build src/libdav1d_entrypoint.a: STATIC_LINKER src/25a6634@@dav1d_entrypoint@sta/lib.c.o src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o
- LINK_ARGS = csrD
-
-build src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o: c_COMPILER ../src/cdef_apply_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o: c_COMPILER ../src/cdef_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o: c_COMPILER ../src/fg_apply_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o: c_COMPILER ../src/film_grain_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o: c_COMPILER ../src/ipred_prepare_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o: c_COMPILER ../src/ipred_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o: c_COMPILER ../src/itx_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o: c_COMPILER ../src/lf_apply_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o: c_COMPILER ../src/loopfilter_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o: c_COMPILER ../src/looprestoration_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o: c_COMPILER ../src/lr_apply_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o: c_COMPILER ../src/mc_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o: c_COMPILER ../src/recon_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o: c_COMPILER ../src/arm/cdef_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o: c_COMPILER ../src/arm/ipred_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o: c_COMPILER ../src/arm/itx_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o: c_COMPILER ../src/arm/loopfilter_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o: c_COMPILER ../src/arm/looprestoration_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o: c_COMPILER ../src/arm/mc_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build src/libdav1d_bitdepth_8.a: STATIC_LINKER src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o
- LINK_ARGS = csrD
-
-build src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o: c_COMPILER ../src/cdef_apply_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o: c_COMPILER ../src/cdef_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o: c_COMPILER ../src/fg_apply_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o: c_COMPILER ../src/film_grain_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o: c_COMPILER ../src/ipred_prepare_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o: c_COMPILER ../src/ipred_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o: c_COMPILER ../src/itx_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o: c_COMPILER ../src/lf_apply_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o: c_COMPILER ../src/loopfilter_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o: c_COMPILER ../src/looprestoration_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o: c_COMPILER ../src/lr_apply_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o: c_COMPILER ../src/mc_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o: c_COMPILER ../src/recon_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o: c_COMPILER ../src/arm/cdef_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o: c_COMPILER ../src/arm/ipred_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o: c_COMPILER ../src/arm/itx_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o: c_COMPILER ../src/arm/loopfilter_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o: c_COMPILER ../src/arm/looprestoration_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o: c_COMPILER ../src/arm/mc_init_tmpl.c
- DEPFILE = src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o.d
- ARGS = -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build src/libdav1d_bitdepth_16.a: STATIC_LINKER src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o
- LINK_ARGS = csrD
-
-build src/libdav1d_arch_bitdepth_8.a: STATIC_LINKER 
- LINK_ARGS = csrD
-
-build src/libdav1d_arch_bitdepth_16.a: STATIC_LINKER 
- LINK_ARGS = csrD
-
-build src/25a6634@@dav1d@sta/cdf.c.o: c_COMPILER ../src/cdf.c
- DEPFILE = src/25a6634@@dav1d@sta/cdf.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/cpu.c.o: c_COMPILER ../src/cpu.c
- DEPFILE = src/25a6634@@dav1d@sta/cpu.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/data.c.o: c_COMPILER ../src/data.c
- DEPFILE = src/25a6634@@dav1d@sta/data.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/decode.c.o: c_COMPILER ../src/decode.c
- DEPFILE = src/25a6634@@dav1d@sta/decode.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/dequant_tables.c.o: c_COMPILER ../src/dequant_tables.c
- DEPFILE = src/25a6634@@dav1d@sta/dequant_tables.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/getbits.c.o: c_COMPILER ../src/getbits.c
- DEPFILE = src/25a6634@@dav1d@sta/getbits.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/intra_edge.c.o: c_COMPILER ../src/intra_edge.c
- DEPFILE = src/25a6634@@dav1d@sta/intra_edge.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/lf_mask.c.o: c_COMPILER ../src/lf_mask.c
- DEPFILE = src/25a6634@@dav1d@sta/lf_mask.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/log.c.o: c_COMPILER ../src/log.c
- DEPFILE = src/25a6634@@dav1d@sta/log.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/msac.c.o: c_COMPILER ../src/msac.c
- DEPFILE = src/25a6634@@dav1d@sta/msac.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/obu.c.o: c_COMPILER ../src/obu.c
- DEPFILE = src/25a6634@@dav1d@sta/obu.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/picture.c.o: c_COMPILER ../src/picture.c
- DEPFILE = src/25a6634@@dav1d@sta/picture.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/qm.c.o: c_COMPILER ../src/qm.c
- DEPFILE = src/25a6634@@dav1d@sta/qm.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/ref.c.o: c_COMPILER ../src/ref.c
- DEPFILE = src/25a6634@@dav1d@sta/ref.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/ref_mvs.c.o: c_COMPILER ../src/ref_mvs.c
- DEPFILE = src/25a6634@@dav1d@sta/ref_mvs.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/scan.c.o: c_COMPILER ../src/scan.c
- DEPFILE = src/25a6634@@dav1d@sta/scan.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/tables.c.o: c_COMPILER ../src/tables.c
- DEPFILE = src/25a6634@@dav1d@sta/tables.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/warpmv.c.o: c_COMPILER ../src/warpmv.c
- DEPFILE = src/25a6634@@dav1d@sta/warpmv.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/wedge.c.o: c_COMPILER ../src/wedge.c
- DEPFILE = src/25a6634@@dav1d@sta/wedge.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/arm_cpu.c.o: c_COMPILER ../src/arm/cpu.c
- DEPFILE = src/25a6634@@dav1d@sta/arm_cpu.c.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/arm_64_cdef.S.o: c_COMPILER ../src/arm/64/cdef.S
- DEPFILE = src/25a6634@@dav1d@sta/arm_64_cdef.S.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/arm_64_ipred.S.o: c_COMPILER ../src/arm/64/ipred.S
- DEPFILE = src/25a6634@@dav1d@sta/arm_64_ipred.S.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/arm_64_itx.S.o: c_COMPILER ../src/arm/64/itx.S
- DEPFILE = src/25a6634@@dav1d@sta/arm_64_itx.S.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o: c_COMPILER ../src/arm/64/loopfilter.S
- DEPFILE = src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o: c_COMPILER ../src/arm/64/looprestoration.S
- DEPFILE = src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/arm_64_mc.S.o: c_COMPILER ../src/arm/64/mc.S
- DEPFILE = src/25a6634@@dav1d@sta/arm_64_mc.S.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/25a6634@@dav1d@sta/arm_64_msac.S.o: c_COMPILER ../src/arm/64/msac.S
- DEPFILE = src/25a6634@@dav1d@sta/arm_64_msac.S.o.d
- ARGS = -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread
-
-build src/libdav1d.a: STATIC_LINKER src/25a6634@@dav1d@sta/cdf.c.o src/25a6634@@dav1d@sta/cpu.c.o src/25a6634@@dav1d@sta/data.c.o src/25a6634@@dav1d@sta/decode.c.o src/25a6634@@dav1d@sta/dequant_tables.c.o src/25a6634@@dav1d@sta/getbits.c.o src/25a6634@@dav1d@sta/intra_edge.c.o src/25a6634@@dav1d@sta/lf_mask.c.o src/25a6634@@dav1d@sta/log.c.o src/25a6634@@dav1d@sta/msac.c.o src/25a6634@@dav1d@sta/obu.c.o src/25a6634@@dav1d@sta/picture.c.o src/25a6634@@dav1d@sta/qm.c.o src/25a6634@@dav1d@sta/ref.c.o src/25a6634@@dav1d@sta/ref_mvs.c.o src/25a6634@@dav1d@sta/scan.c.o src/25a6634@@dav1d@sta/tables.c.o src/25a6634@@dav1d@sta/warpmv.c.o src/25a6634@@dav1d@sta/wedge.c.o src/25a6634@@dav1d@sta/arm_cpu.c.o src/25a6634@@dav1d@sta/arm_64_cdef.S.o src/25a6634@@dav1d@sta/arm_64_ipred.S.o src/25a6634@@dav1d@sta/arm_64_itx.S.o src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o src/25a6634@@dav1d@sta/arm_64_mc.S.o src/25a6634@@dav1d@sta/arm_64_msac.S.o src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o 
src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o src/25a6634@@dav1d_entrypoint@sta/lib.c.o src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o
- LINK_ARGS = csrD
-
-build tools/f9d35d4@@dav1d_input@sta/input_input.c.o: c_COMPILER ../tools/input/input.c
- DEPFILE = tools/f9d35d4@@dav1d_input@sta/input_input.c.o.d
- ARGS = -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC
-
-build tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o: c_COMPILER ../tools/input/annexb.c
- DEPFILE = tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o.d
- ARGS = -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC
-
-build tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o: c_COMPILER ../tools/input/ivf.c
- DEPFILE = tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o.d
- ARGS = -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC
-
-build tools/libdav1d_input.a: STATIC_LINKER tools/f9d35d4@@dav1d_input@sta/input_input.c.o tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o
- LINK_ARGS = csrD
-
-build tools/f9d35d4@@dav1d_output@sta/output_md5.c.o: c_COMPILER ../tools/output/md5.c
- DEPFILE = tools/f9d35d4@@dav1d_output@sta/output_md5.c.o.d
- ARGS = -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC
-
-build tools/f9d35d4@@dav1d_output@sta/output_null.c.o: c_COMPILER ../tools/output/null.c
- DEPFILE = tools/f9d35d4@@dav1d_output@sta/output_null.c.o.d
- ARGS = -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC
-
-build tools/f9d35d4@@dav1d_output@sta/output_output.c.o: c_COMPILER ../tools/output/output.c
- DEPFILE = tools/f9d35d4@@dav1d_output@sta/output_output.c.o.d
- ARGS = -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC
-
-build tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o: c_COMPILER ../tools/output/y4m2.c
- DEPFILE = tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o.d
- ARGS = -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC
-
-build tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o: c_COMPILER ../tools/output/yuv.c
- DEPFILE = tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o.d
- ARGS = -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC
-
-build tools/libdav1d_output.a: STATIC_LINKER tools/f9d35d4@@dav1d_output@sta/output_md5.c.o tools/f9d35d4@@dav1d_output@sta/output_null.c.o tools/f9d35d4@@dav1d_output@sta/output_output.c.o tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o
- LINK_ARGS = csrD
-
-build tools/f9d35d4@@dav1d@exe/dav1d.c.o: c_COMPILER ../tools/dav1d.c || include/vcs_version.h
- DEPFILE = tools/f9d35d4@@dav1d@exe/dav1d.c.o.d
- ARGS = -Itools/f9d35d4@@dav1d@exe -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread
-
-build tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o: c_COMPILER ../tools/dav1d_cli_parse.c || include/vcs_version.h
- DEPFILE = tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o.d
- ARGS = -Itools/f9d35d4@@dav1d@exe -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread
-
-build tools/dav1d: c_LINKER tools/f9d35d4@@dav1d@exe/dav1d.c.o tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o | src/libdav1d.a tools/libdav1d_input.a tools/libdav1d_output.a
- LINK_ARGS = -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -pie -O2 -march=armv8-a -Wl,--start-group src/libdav1d.a tools/libdav1d_input.a tools/libdav1d_output.a -Wl,--end-group -pthread '-Wl,-rpath,$$ORIGIN/../src:$$ORIGIN/' -Wl,-rpath-link,/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src -Wl,-rpath-link,/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools
-
-build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o: c_COMPILER ../tests/checkasm/cdef.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o: c_COMPILER ../tests/checkasm/filmgrain.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o: c_COMPILER ../tests/checkasm/ipred.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o: c_COMPILER ../tests/checkasm/itx.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o: c_COMPILER ../tests/checkasm/loopfilter.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o: c_COMPILER ../tests/checkasm/looprestoration.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o: c_COMPILER ../tests/checkasm/mc.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8
-
-build tests/libcheckasm_bitdepth_8.a: STATIC_LINKER tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o
- LINK_ARGS = csrD
-
-build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o: c_COMPILER ../tests/checkasm/cdef.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o: c_COMPILER ../tests/checkasm/filmgrain.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o: c_COMPILER ../tests/checkasm/ipred.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o: c_COMPILER ../tests/checkasm/itx.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o: c_COMPILER ../tests/checkasm/loopfilter.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o: c_COMPILER ../tests/checkasm/looprestoration.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o: c_COMPILER ../tests/checkasm/mc.c
- DEPFILE = tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o.d
- ARGS = -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16
-
-build tests/libcheckasm_bitdepth_16.a: STATIC_LINKER tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o
- LINK_ARGS = csrD
-
-build tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o: c_COMPILER ../tests/checkasm/checkasm.c
- DEPFILE = tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o.d
- ARGS = -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread
-
-build tests/59830eb@@checkasm@exe/checkasm_msac.c.o: c_COMPILER ../tests/checkasm/msac.c
- DEPFILE = tests/59830eb@@checkasm@exe/checkasm_msac.c.o.d
- ARGS = -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread
-
-build tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o: c_COMPILER ../tests/checkasm/arm/checkasm_64.S
- DEPFILE = tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o.d
- ARGS = -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread
-
-build tests/checkasm: c_LINKER tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o tests/59830eb@@checkasm@exe/checkasm_msac.c.o tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o 
src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o src/25a6634@@dav1d_entrypoint@sta/lib.c.o src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o src/25a6634@@dav1d@sta/cdf.c.o src/25a6634@@dav1d@sta/cpu.c.o src/25a6634@@dav1d@sta/data.c.o src/25a6634@@dav1d@sta/decode.c.o src/25a6634@@dav1d@sta/dequant_tables.c.o src/25a6634@@dav1d@sta/getbits.c.o src/25a6634@@dav1d@sta/intra_edge.c.o src/25a6634@@dav1d@sta/lf_mask.c.o src/25a6634@@dav1d@sta/log.c.o src/25a6634@@dav1d@sta/msac.c.o src/25a6634@@dav1d@sta/obu.c.o src/25a6634@@dav1d@sta/picture.c.o src/25a6634@@dav1d@sta/qm.c.o src/25a6634@@dav1d@sta/ref.c.o src/25a6634@@dav1d@sta/ref_mvs.c.o src/25a6634@@dav1d@sta/scan.c.o src/25a6634@@dav1d@sta/tables.c.o src/25a6634@@dav1d@sta/warpmv.c.o src/25a6634@@dav1d@sta/wedge.c.o src/25a6634@@dav1d@sta/arm_cpu.c.o src/25a6634@@dav1d@sta/arm_64_cdef.S.o 
src/25a6634@@dav1d@sta/arm_64_ipred.S.o src/25a6634@@dav1d@sta/arm_64_itx.S.o src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o src/25a6634@@dav1d@sta/arm_64_mc.S.o src/25a6634@@dav1d@sta/arm_64_msac.S.o | /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/sysroot/usr/lib/aarch64-linux-android/21/libm.a /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/sysroot/usr/lib/aarch64-linux-android/21/libm.so
- LINK_ARGS = -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -pie -O2 -march=armv8-a -pthread -Wl,--start-group -lm -Wl,--end-group
-
-build tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o: c_COMPILER ../tests/libfuzzer/dav1d_fuzzer.c
- DEPFILE = tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o.d
- ARGS = -Itests/59830eb@@dav1d_fuzzer@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread
-
-build tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o: c_COMPILER ../tests/libfuzzer/main.c
- DEPFILE = tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o.d
- ARGS = -Itests/59830eb@@dav1d_fuzzer@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread
-
-build tests/dav1d_fuzzer: c_LINKER tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o | src/libdav1d.a
- LINK_ARGS = -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -pie -O2 -march=armv8-a -Wl,--start-group src/libdav1d.a -Wl,--end-group -pthread '-Wl,-rpath,$$ORIGIN/../src' -Wl,-rpath-link,/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src
-
-build tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o: c_COMPILER ../tests/libfuzzer/dav1d_fuzzer.c
- DEPFILE = tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o.d
- ARGS = -Itests/59830eb@@dav1d_fuzzer_mt@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_MT_FUZZING
-
-build tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o: c_COMPILER ../tests/libfuzzer/main.c
- DEPFILE = tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o.d
- ARGS = -Itests/59830eb@@dav1d_fuzzer_mt@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_MT_FUZZING
-
-build tests/dav1d_fuzzer_mt: c_LINKER tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o | src/libdav1d.a
- LINK_ARGS = -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -pie -O2 -march=armv8-a -Wl,--start-group src/libdav1d.a -Wl,--end-group -pthread '-Wl,-rpath,$$ORIGIN/../src' -Wl,-rpath-link,/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src
-
-build tests/libdav1d_af.a: CUSTOM_COMMAND src/libdav1d.a | /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-objcopy src/libdav1d.a
- COMMAND = /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-objcopy --redefine-sym malloc=__wrap_malloc --redefine-sym posix_memalign=__wrap_posix_memalign --redefine-sym pthread_create=__wrap_pthread_create --redefine-sym pthread_cond_init=__wrap_pthread_cond_init --redefine-sym pthread_mutex_init=__wrap_pthread_mutex_init src/libdav1d.a tests/libdav1d_af.a
- description = Generating$ libdav1d_af$ with$ a$ custom$ command.
-
-build tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o: c_COMPILER ../tests/libfuzzer/dav1d_fuzzer.c
- DEPFILE = tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o.d
- ARGS = -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL
-
-build tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o: c_COMPILER ../tests/libfuzzer/main.c
- DEPFILE = tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o.d
- ARGS = -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL
-
-build tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o: c_COMPILER ../tests/libfuzzer/alloc_fail.c
- DEPFILE = tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o.d
- ARGS = -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL
-
-build tests/dav1d_fuzzer_mem: c_LINKER tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o | tests/libdav1d_af.a
- LINK_ARGS = -Wl,--as-needed -Wl,--no-undefined -Wl,-O1 -pie -O2 -march=armv8-a -Wl,--start-group /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/libdav1d_af.a -Wl,--end-group -pthread
-
-# Test rules
-
-build meson-test: CUSTOM_COMMAND all PHONY
- COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson test --no-rebuild --print-errorlogs
- DESC = Running$ all$ tests.
- pool = console
-
-build test: phony meson-test
-
-build meson-benchmark: CUSTOM_COMMAND all PHONY
- COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson test --benchmark --logbase benchmarklog --num-processes=1 --no-rebuild
- DESC = Running$ benchmark$ suite.
- pool = console
-
-build benchmark: phony meson-benchmark
-
-# Install rules
-
-build meson-install: CUSTOM_COMMAND PHONY | all
- DESC = Installing$ files.
- COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson install --no-rebuild
- pool = console
-
-build install: phony meson-install
-
-build meson-dist: CUSTOM_COMMAND PHONY
- DESC = Creating$ source$ packages
- COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson dist
- pool = console
-
-build dist: phony meson-dist
-
-# Suffix
-
-build meson-TAGS: CUSTOM_COMMAND PHONY
- COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal tags etags /Users/zlin/workspace/mxcore/media_player/jni/dav1d
- pool = console
-
-build TAGS: phony meson-TAGS
-
-build meson-ctags: CUSTOM_COMMAND PHONY
- COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal tags ctags /Users/zlin/workspace/mxcore/media_player/jni/dav1d
- pool = console
-
-build ctags: phony meson-ctags
-
-build meson-uninstall: CUSTOM_COMMAND PHONY
- COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal uninstall
- pool = console
-
-build uninstall: phony meson-uninstall
-
-build all: phony include/vcs_version.h src/libdav1d.a tools/dav1d tests/dav1d_fuzzer tests/dav1d_fuzzer_mt tests/checkasm
-
-build clean: phony meson-clean
-
-build meson-clean-ctlist: CUSTOM_COMMAND PHONY
- COMMAND = /Library/Frameworks/Python.framework/Versions/3.8/bin/meson --internal cleantrees /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/cleantrees.dat
- description = Cleaning$ custom$ target$ directories.
-
-build clean-ctlist: phony meson-clean-ctlist
-
-build meson-clean: CUSTOM_COMMAND PHONY | clean-ctlist
- COMMAND = /Users/zlin/workspace/ninja/ninja -t clean
- description = Cleaning.
-
-build build.ninja: REGENERATE_BUILD ../meson.build ../include/meson.build ../include/dav1d/meson.build ../include/dav1d/version.h.in ../doc/meson.build ../src/meson.build ../tools/meson.build ../examples/meson.build ../tests/meson.build /Users/zlin/workspace/mxcore/media_player/jni/dav1d/cross_file.txt meson-private/coredata.dat ../meson_options.txt
- pool = console
-
-build reconfigure: REGENERATE_BUILD PHONY
- pool = console
-
-build ../meson.build ../include/meson.build ../include/dav1d/meson.build ../include/dav1d/version.h.in ../doc/meson.build ../src/meson.build ../tools/meson.build ../examples/meson.build ../tests/meson.build /Users/zlin/workspace/mxcore/media_player/jni/dav1d/cross_file.txt meson-private/coredata.dat ../meson_options.txt: phony 
-
-default all
-
diff --git a/ffmpeg/JNI/dav1d/builddir/compile_commands.json b/ffmpeg/JNI/dav1d/builddir/compile_commands.json
deleted file mode 100644
index 92b2b9bf8..000000000
--- a/ffmpeg/JNI/dav1d/builddir/compile_commands.json
+++ /dev/null
@@ -1,608 +0,0 @@
-[
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_entrypoint@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'src/25a6634@@dav1d_entrypoint@sta/lib.c.o' -MF 'src/25a6634@@dav1d_entrypoint@sta/lib.c.o.d' -o 'src/25a6634@@dav1d_entrypoint@sta/lib.c.o' -c ../src/lib.c",
-    "file": "../src/lib.c",
-    "output": "src/25a6634@@dav1d_entrypoint@sta/lib.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_entrypoint@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o' -MF 'src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o.d' -o 'src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o' -c ../src/thread_task.c",
-    "file": "../src/thread_task.c",
-    "output": "src/25a6634@@dav1d_entrypoint@sta/thread_task.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o' -c ../src/cdef_apply_tmpl.c",
-    "file": "../src/cdef_apply_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/cdef_apply_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o' -c ../src/cdef_tmpl.c",
-    "file": "../src/cdef_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/cdef_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o' -c ../src/fg_apply_tmpl.c",
-    "file": "../src/fg_apply_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/fg_apply_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o' -c ../src/film_grain_tmpl.c",
-    "file": "../src/film_grain_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/film_grain_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o' -c ../src/ipred_prepare_tmpl.c",
-    "file": "../src/ipred_prepare_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/ipred_prepare_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o' -c ../src/ipred_tmpl.c",
-    "file": "../src/ipred_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/ipred_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o' -c ../src/itx_tmpl.c",
-    "file": "../src/itx_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/itx_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o' -c ../src/lf_apply_tmpl.c",
-    "file": "../src/lf_apply_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/lf_apply_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o' -c ../src/loopfilter_tmpl.c",
-    "file": "../src/loopfilter_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/loopfilter_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o' -c ../src/looprestoration_tmpl.c",
-    "file": "../src/looprestoration_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/looprestoration_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o' -c ../src/lr_apply_tmpl.c",
-    "file": "../src/lr_apply_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/lr_apply_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o' -c ../src/mc_tmpl.c",
-    "file": "../src/mc_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/mc_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o' -c ../src/recon_tmpl.c",
-    "file": "../src/recon_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/recon_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o' -c ../src/arm/cdef_init_tmpl.c",
-    "file": "../src/arm/cdef_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_cdef_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o' -c ../src/arm/ipred_init_tmpl.c",
-    "file": "../src/arm/ipred_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_ipred_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o' -c ../src/arm/itx_init_tmpl.c",
-    "file": "../src/arm/itx_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_itx_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o' -c ../src/arm/loopfilter_init_tmpl.c",
-    "file": "../src/arm/loopfilter_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_loopfilter_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o' -c ../src/arm/looprestoration_init_tmpl.c",
-    "file": "../src/arm/looprestoration_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_looprestoration_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_8@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o' -c ../src/arm/mc_init_tmpl.c",
-    "file": "../src/arm/mc_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_8@sta/arm_mc_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o' -c ../src/cdef_apply_tmpl.c",
-    "file": "../src/cdef_apply_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/cdef_apply_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o' -c ../src/cdef_tmpl.c",
-    "file": "../src/cdef_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/cdef_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o' -c ../src/fg_apply_tmpl.c",
-    "file": "../src/fg_apply_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/fg_apply_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o' -c ../src/film_grain_tmpl.c",
-    "file": "../src/film_grain_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/film_grain_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o' -c ../src/ipred_prepare_tmpl.c",
-    "file": "../src/ipred_prepare_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/ipred_prepare_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o' -c ../src/ipred_tmpl.c",
-    "file": "../src/ipred_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/ipred_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o' -c ../src/itx_tmpl.c",
-    "file": "../src/itx_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/itx_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o' -c ../src/lf_apply_tmpl.c",
-    "file": "../src/lf_apply_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/lf_apply_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o' -c ../src/loopfilter_tmpl.c",
-    "file": "../src/loopfilter_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/loopfilter_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o' -c ../src/looprestoration_tmpl.c",
-    "file": "../src/looprestoration_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/looprestoration_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o' -c ../src/lr_apply_tmpl.c",
-    "file": "../src/lr_apply_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/lr_apply_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o' -c ../src/mc_tmpl.c",
-    "file": "../src/mc_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/mc_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o' -c ../src/recon_tmpl.c",
-    "file": "../src/recon_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/recon_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o' -c ../src/arm/cdef_init_tmpl.c",
-    "file": "../src/arm/cdef_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_cdef_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o' -c ../src/arm/ipred_init_tmpl.c",
-    "file": "../src/arm/ipred_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_ipred_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o' -c ../src/arm/itx_init_tmpl.c",
-    "file": "../src/arm/itx_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_itx_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o' -c ../src/arm/loopfilter_init_tmpl.c",
-    "file": "../src/arm/loopfilter_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_loopfilter_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o' -c ../src/arm/looprestoration_init_tmpl.c",
-    "file": "../src/arm/looprestoration_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_looprestoration_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d_bitdepth_16@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o' -MF 'src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o.d' -o 'src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o' -c ../src/arm/mc_init_tmpl.c",
-    "file": "../src/arm/mc_init_tmpl.c",
-    "output": "src/25a6634@@dav1d_bitdepth_16@sta/arm_mc_init_tmpl.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/cdf.c.o' -MF 'src/25a6634@@dav1d@sta/cdf.c.o.d' -o 'src/25a6634@@dav1d@sta/cdf.c.o' -c ../src/cdf.c",
-    "file": "../src/cdf.c",
-    "output": "src/25a6634@@dav1d@sta/cdf.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/cpu.c.o' -MF 'src/25a6634@@dav1d@sta/cpu.c.o.d' -o 'src/25a6634@@dav1d@sta/cpu.c.o' -c ../src/cpu.c",
-    "file": "../src/cpu.c",
-    "output": "src/25a6634@@dav1d@sta/cpu.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/data.c.o' -MF 'src/25a6634@@dav1d@sta/data.c.o.d' -o 'src/25a6634@@dav1d@sta/data.c.o' -c ../src/data.c",
-    "file": "../src/data.c",
-    "output": "src/25a6634@@dav1d@sta/data.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/decode.c.o' -MF 'src/25a6634@@dav1d@sta/decode.c.o.d' -o 'src/25a6634@@dav1d@sta/decode.c.o' -c ../src/decode.c",
-    "file": "../src/decode.c",
-    "output": "src/25a6634@@dav1d@sta/decode.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/dequant_tables.c.o' -MF 'src/25a6634@@dav1d@sta/dequant_tables.c.o.d' -o 'src/25a6634@@dav1d@sta/dequant_tables.c.o' -c ../src/dequant_tables.c",
-    "file": "../src/dequant_tables.c",
-    "output": "src/25a6634@@dav1d@sta/dequant_tables.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/getbits.c.o' -MF 'src/25a6634@@dav1d@sta/getbits.c.o.d' -o 'src/25a6634@@dav1d@sta/getbits.c.o' -c ../src/getbits.c",
-    "file": "../src/getbits.c",
-    "output": "src/25a6634@@dav1d@sta/getbits.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/intra_edge.c.o' -MF 'src/25a6634@@dav1d@sta/intra_edge.c.o.d' -o 'src/25a6634@@dav1d@sta/intra_edge.c.o' -c ../src/intra_edge.c",
-    "file": "../src/intra_edge.c",
-    "output": "src/25a6634@@dav1d@sta/intra_edge.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/lf_mask.c.o' -MF 'src/25a6634@@dav1d@sta/lf_mask.c.o.d' -o 'src/25a6634@@dav1d@sta/lf_mask.c.o' -c ../src/lf_mask.c",
-    "file": "../src/lf_mask.c",
-    "output": "src/25a6634@@dav1d@sta/lf_mask.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/log.c.o' -MF 'src/25a6634@@dav1d@sta/log.c.o.d' -o 'src/25a6634@@dav1d@sta/log.c.o' -c ../src/log.c",
-    "file": "../src/log.c",
-    "output": "src/25a6634@@dav1d@sta/log.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/msac.c.o' -MF 'src/25a6634@@dav1d@sta/msac.c.o.d' -o 'src/25a6634@@dav1d@sta/msac.c.o' -c ../src/msac.c",
-    "file": "../src/msac.c",
-    "output": "src/25a6634@@dav1d@sta/msac.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/obu.c.o' -MF 'src/25a6634@@dav1d@sta/obu.c.o.d' -o 'src/25a6634@@dav1d@sta/obu.c.o' -c ../src/obu.c",
-    "file": "../src/obu.c",
-    "output": "src/25a6634@@dav1d@sta/obu.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/picture.c.o' -MF 'src/25a6634@@dav1d@sta/picture.c.o.d' -o 'src/25a6634@@dav1d@sta/picture.c.o' -c ../src/picture.c",
-    "file": "../src/picture.c",
-    "output": "src/25a6634@@dav1d@sta/picture.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/qm.c.o' -MF 'src/25a6634@@dav1d@sta/qm.c.o.d' -o 'src/25a6634@@dav1d@sta/qm.c.o' -c ../src/qm.c",
-    "file": "../src/qm.c",
-    "output": "src/25a6634@@dav1d@sta/qm.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/ref.c.o' -MF 'src/25a6634@@dav1d@sta/ref.c.o.d' -o 'src/25a6634@@dav1d@sta/ref.c.o' -c ../src/ref.c",
-    "file": "../src/ref.c",
-    "output": "src/25a6634@@dav1d@sta/ref.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/ref_mvs.c.o' -MF 'src/25a6634@@dav1d@sta/ref_mvs.c.o.d' -o 'src/25a6634@@dav1d@sta/ref_mvs.c.o' -c ../src/ref_mvs.c",
-    "file": "../src/ref_mvs.c",
-    "output": "src/25a6634@@dav1d@sta/ref_mvs.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/scan.c.o' -MF 'src/25a6634@@dav1d@sta/scan.c.o.d' -o 'src/25a6634@@dav1d@sta/scan.c.o' -c ../src/scan.c",
-    "file": "../src/scan.c",
-    "output": "src/25a6634@@dav1d@sta/scan.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/tables.c.o' -MF 'src/25a6634@@dav1d@sta/tables.c.o.d' -o 'src/25a6634@@dav1d@sta/tables.c.o' -c ../src/tables.c",
-    "file": "../src/tables.c",
-    "output": "src/25a6634@@dav1d@sta/tables.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/warpmv.c.o' -MF 'src/25a6634@@dav1d@sta/warpmv.c.o.d' -o 'src/25a6634@@dav1d@sta/warpmv.c.o' -c ../src/warpmv.c",
-    "file": "../src/warpmv.c",
-    "output": "src/25a6634@@dav1d@sta/warpmv.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/wedge.c.o' -MF 'src/25a6634@@dav1d@sta/wedge.c.o.d' -o 'src/25a6634@@dav1d@sta/wedge.c.o' -c ../src/wedge.c",
-    "file": "../src/wedge.c",
-    "output": "src/25a6634@@dav1d@sta/wedge.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_cpu.c.o' -MF 'src/25a6634@@dav1d@sta/arm_cpu.c.o.d' -o 'src/25a6634@@dav1d@sta/arm_cpu.c.o' -c ../src/arm/cpu.c",
-    "file": "../src/arm/cpu.c",
-    "output": "src/25a6634@@dav1d@sta/arm_cpu.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_cdef.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_cdef.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_cdef.S.o' -c ../src/arm/64/cdef.S",
-    "file": "../src/arm/64/cdef.S",
-    "output": "src/25a6634@@dav1d@sta/arm_64_cdef.S.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_ipred.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_ipred.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_ipred.S.o' -c ../src/arm/64/ipred.S",
-    "file": "../src/arm/64/ipred.S",
-    "output": "src/25a6634@@dav1d@sta/arm_64_ipred.S.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_itx.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_itx.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_itx.S.o' -c ../src/arm/64/itx.S",
-    "file": "../src/arm/64/itx.S",
-    "output": "src/25a6634@@dav1d@sta/arm_64_itx.S.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o' -c ../src/arm/64/loopfilter.S",
-    "file": "../src/arm/64/loopfilter.S",
-    "output": "src/25a6634@@dav1d@sta/arm_64_loopfilter.S.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o' -c ../src/arm/64/looprestoration.S",
-    "file": "../src/arm/64/looprestoration.S",
-    "output": "src/25a6634@@dav1d@sta/arm_64_looprestoration.S.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_mc.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_mc.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_mc.S.o' -c ../src/arm/64/mc.S",
-    "file": "../src/arm/64/mc.S",
-    "output": "src/25a6634@@dav1d@sta/arm_64_mc.S.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Isrc/25a6634@@dav1d@sta -Isrc -I../src -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -pthread -MD -MQ 'src/25a6634@@dav1d@sta/arm_64_msac.S.o' -MF 'src/25a6634@@dav1d@sta/arm_64_msac.S.o.d' -o 'src/25a6634@@dav1d@sta/arm_64_msac.S.o' -c ../src/arm/64/msac.S",
-    "file": "../src/arm/64/msac.S",
-    "output": "src/25a6634@@dav1d@sta/arm_64_msac.S.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_input@sta/input_input.c.o' -MF 'tools/f9d35d4@@dav1d_input@sta/input_input.c.o.d' -o 'tools/f9d35d4@@dav1d_input@sta/input_input.c.o' -c ../tools/input/input.c",
-    "file": "../tools/input/input.c",
-    "output": "tools/f9d35d4@@dav1d_input@sta/input_input.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o' -MF 'tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o.d' -o 'tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o' -c ../tools/input/annexb.c",
-    "file": "../tools/input/annexb.c",
-    "output": "tools/f9d35d4@@dav1d_input@sta/input_annexb.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_input@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o' -MF 'tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o.d' -o 'tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o' -c ../tools/input/ivf.c",
-    "file": "../tools/input/ivf.c",
-    "output": "tools/f9d35d4@@dav1d_input@sta/input_ivf.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_output@sta/output_md5.c.o' -MF 'tools/f9d35d4@@dav1d_output@sta/output_md5.c.o.d' -o 'tools/f9d35d4@@dav1d_output@sta/output_md5.c.o' -c ../tools/output/md5.c",
-    "file": "../tools/output/md5.c",
-    "output": "tools/f9d35d4@@dav1d_output@sta/output_md5.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_output@sta/output_null.c.o' -MF 'tools/f9d35d4@@dav1d_output@sta/output_null.c.o.d' -o 'tools/f9d35d4@@dav1d_output@sta/output_null.c.o' -c ../tools/output/null.c",
-    "file": "../tools/output/null.c",
-    "output": "tools/f9d35d4@@dav1d_output@sta/output_null.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_output@sta/output_output.c.o' -MF 'tools/f9d35d4@@dav1d_output@sta/output_output.c.o.d' -o 'tools/f9d35d4@@dav1d_output@sta/output_output.c.o' -c ../tools/output/output.c",
-    "file": "../tools/output/output.c",
-    "output": "tools/f9d35d4@@dav1d_output@sta/output_output.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o' -MF 'tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o.d' -o 'tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o' -c ../tools/output/y4m2.c",
-    "file": "../tools/output/y4m2.c",
-    "output": "tools/f9d35d4@@dav1d_output@sta/output_y4m2.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d_output@sta -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -MD -MQ 'tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o' -MF 'tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o.d' -o 'tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o' -c ../tools/output/yuv.c",
-    "file": "../tools/output/yuv.c",
-    "output": "tools/f9d35d4@@dav1d_output@sta/output_yuv.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d@exe -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tools/f9d35d4@@dav1d@exe/dav1d.c.o' -MF 'tools/f9d35d4@@dav1d@exe/dav1d.c.o.d' -o 'tools/f9d35d4@@dav1d@exe/dav1d.c.o' -c ../tools/dav1d.c",
-    "file": "../tools/dav1d.c",
-    "output": "tools/f9d35d4@@dav1d@exe/dav1d.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itools/f9d35d4@@dav1d@exe -Itools -I../tools -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o' -MF 'tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o.d' -o 'tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o' -c ../tools/dav1d_cli_parse.c",
-    "file": "../tools/dav1d_cli_parse.c",
-    "output": "tools/f9d35d4@@dav1d@exe/dav1d_cli_parse.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o' -c ../tests/checkasm/cdef.c",
-    "file": "../tests/checkasm/cdef.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_cdef.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o' -c ../tests/checkasm/filmgrain.c",
-    "file": "../tests/checkasm/filmgrain.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_filmgrain.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o' -c ../tests/checkasm/ipred.c",
-    "file": "../tests/checkasm/ipred.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_ipred.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o' -c ../tests/checkasm/itx.c",
-    "file": "../tests/checkasm/itx.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_itx.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o' -c ../tests/checkasm/loopfilter.c",
-    "file": "../tests/checkasm/loopfilter.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_loopfilter.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o' -c ../tests/checkasm/looprestoration.c",
-    "file": "../tests/checkasm/looprestoration.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_looprestoration.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_8@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=8 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o' -c ../tests/checkasm/mc.c",
-    "file": "../tests/checkasm/mc.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_8@sta/checkasm_mc.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o' -c ../tests/checkasm/cdef.c",
-    "file": "../tests/checkasm/cdef.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_cdef.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o' -c ../tests/checkasm/filmgrain.c",
-    "file": "../tests/checkasm/filmgrain.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_filmgrain.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o' -c ../tests/checkasm/ipred.c",
-    "file": "../tests/checkasm/ipred.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_ipred.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o' -c ../tests/checkasm/itx.c",
-    "file": "../tests/checkasm/itx.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_itx.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o' -c ../tests/checkasm/loopfilter.c",
-    "file": "../tests/checkasm/loopfilter.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_loopfilter.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o' -c ../tests/checkasm/looprestoration.c",
-    "file": "../tests/checkasm/looprestoration.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_looprestoration.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm_bitdepth_16@sta -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIC -DBITDEPTH=16 -MD -MQ 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o' -MF 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o.d' -o 'tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o' -c ../tests/checkasm/mc.c",
-    "file": "../tests/checkasm/mc.c",
-    "output": "tests/59830eb@@checkasm_bitdepth_16@sta/checkasm_mc.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o' -MF 'tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o.d' -o 'tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o' -c ../tests/checkasm/checkasm.c",
-    "file": "../tests/checkasm/checkasm.c",
-    "output": "tests/59830eb@@checkasm@exe/checkasm_checkasm.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tests/59830eb@@checkasm@exe/checkasm_msac.c.o' -MF 'tests/59830eb@@checkasm@exe/checkasm_msac.c.o.d' -o 'tests/59830eb@@checkasm@exe/checkasm_msac.c.o' -c ../tests/checkasm/msac.c",
-    "file": "../tests/checkasm/msac.c",
-    "output": "tests/59830eb@@checkasm@exe/checkasm_msac.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@checkasm@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o' -MF 'tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o.d' -o 'tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o' -c ../tests/checkasm/arm/checkasm_64.S",
-    "file": "../tests/checkasm/arm/checkasm_64.S",
-    "output": "tests/59830eb@@checkasm@exe/checkasm_arm_checkasm_64.S.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o' -MF 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o' -c ../tests/libfuzzer/dav1d_fuzzer.c",
-    "file": "../tests/libfuzzer/dav1d_fuzzer.c",
-    "output": "tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_dav1d_fuzzer.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -MD -MQ 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o' -MF 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o' -c ../tests/libfuzzer/main.c",
-    "file": "../tests/libfuzzer/main.c",
-    "output": "tests/59830eb@@dav1d_fuzzer@exe/libfuzzer_main.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer_mt@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_MT_FUZZING -MD -MQ 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o' -MF 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o' -c ../tests/libfuzzer/dav1d_fuzzer.c",
-    "file": "../tests/libfuzzer/dav1d_fuzzer.c",
-    "output": "tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_dav1d_fuzzer.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer_mt@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_MT_FUZZING -MD -MQ 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o' -MF 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o' -c ../tests/libfuzzer/main.c",
-    "file": "../tests/libfuzzer/main.c",
-    "output": "tests/59830eb@@dav1d_fuzzer_mt@exe/libfuzzer_main.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL -MD -MQ 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o' -MF 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o' -c ../tests/libfuzzer/dav1d_fuzzer.c",
-    "file": "../tests/libfuzzer/dav1d_fuzzer.c",
-    "output": "tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_dav1d_fuzzer.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL -MD -MQ 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o' -MF 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o' -c ../tests/libfuzzer/main.c",
-    "file": "../tests/libfuzzer/main.c",
-    "output": "tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_main.c.o"
-  },
-  {
-    "directory": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir",
-    "command": "/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang -Itests/59830eb@@dav1d_fuzzer_mem@exe -Itests -I../tests -I. -I../ -Iinclude/dav1d -I../include/dav1d -Iinclude -I../include -Xclang -fcolor-diagnostics -DNDEBUG -pipe -D_FILE_OFFSET_BITS=64 -Wall -Winvalid-pch -Wextra -std=c99 -O3 -D_POSIX_C_SOURCE=200112L -fvisibility=hidden -Wundef -Werror=vla -Wno-missing-field-initializers -Wno-unused-parameter -Werror=missing-prototypes -Wshorten-64-to-32 -fomit-frame-pointer -ffast-math -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -fPIE -pthread -DDAV1D_ALLOC_FAIL -MD -MQ 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o' -MF 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o.d' -o 'tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o' -c ../tests/libfuzzer/alloc_fail.c",
-    "file": "../tests/libfuzzer/alloc_fail.c",
-    "output": "tests/59830eb@@dav1d_fuzzer_mem@exe/libfuzzer_alloc_fail.c.o"
-  }
-]
diff --git a/ffmpeg/JNI/dav1d/builddir/config.h b/ffmpeg/JNI/dav1d/builddir/config.h
deleted file mode 100644
index 07f70ca96..000000000
--- a/ffmpeg/JNI/dav1d/builddir/config.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Autogenerated by the Meson build system.
- * Do not edit, your changes will be lost.
- */
-
-#pragma once
-
-#define ARCH_AARCH64 1
-
-#define ARCH_ARM 0
-
-#define ARCH_PPC64LE 0
-
-#define ARCH_X86 0
-
-#define ARCH_X86_32 0
-
-#define ARCH_X86_64 0
-
-#define CONFIG_16BPC 1
-
-#define CONFIG_8BPC 1
-
-#define CONFIG_LOG 1
-
-#define ENDIANNESS_BIG 0
-
-#define HAVE_ASM 1
-
-#define HAVE_AS_FUNC 0
-
-#define HAVE_GETAUXVAL 1
-
-#define HAVE_POSIX_MEMALIGN 1
-
-#define HAVE_UNISTD_H 1
-
-#define PIC 3
-
diff --git a/ffmpeg/JNI/dav1d/builddir/include/vcs_version.h b/ffmpeg/JNI/dav1d/builddir/include/vcs_version.h
deleted file mode 100644
index cafd1f1a4..000000000
--- a/ffmpeg/JNI/dav1d/builddir/include/vcs_version.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* auto-generated, do not edit */
-#define DAV1D_VERSION "0.4.0"
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-benchmarks.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-benchmarks.json
deleted file mode 100644
index 0637a088a..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-benchmarks.json
+++ /dev/null
@@ -1 +0,0 @@
-[]
\ No newline at end of file
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildoptions.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildoptions.json
deleted file mode 100644
index d4794fd4e..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildoptions.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"name": "auto_features", "value": "auto", "section": "core", "machine": "any", "choices": ["enabled", "disabled", "auto"], "type": "combo", "description": "Override value of all 'auto' features"}, {"name": "backend", "value": "ninja", "section": "core", "machine": "any", "choices": ["ninja", "vs", "vs2010", "vs2015", "vs2017", "vs2019", "xcode"], "type": "combo", "description": "Backend to use"}, {"name": "buildtype", "value": "release", "section": "core", "machine": "any", "choices": ["plain", "debug", "debugoptimized", "release", "minsize", "custom"], "type": "combo", "description": "Build type to use"}, {"name": "debug", "value": false, "section": "core", "machine": "any", "type": "boolean", "description": "Debug"}, {"name": "default_library", "value": "static", "section": "core", "machine": "any", "choices": ["shared", "static", "both"], "type": "combo", "description": "Default library type"}, {"name": "install_umask", "value": 18, "section": "core", "machine": "any", "type": "integer", "description": "Default umask to apply on permissions of installed files"}, {"name": "layout", "value": "mirror", "section": "core", "machine": "any", "choices": ["mirror", "flat"], "type": "combo", "description": "Build directory layout"}, {"name": "optimization", "value": "3", "section": "core", "machine": "any", "choices": ["0", "g", "1", "2", "3", "s"], "type": "combo", "description": "Optimization level"}, {"name": "strip", "value": false, "section": "core", "machine": "any", "type": "boolean", "description": "Strip targets on install"}, {"name": "unity", "value": "off", "section": "core", "machine": "any", "choices": ["on", "off", "subprojects"], "type": "combo", "description": "Unity build"}, {"name": "warning_level", "value": "2", "section": "core", "machine": "any", "choices": ["0", "1", "2", "3"], "type": "combo", "description": "Compiler warning level to use"}, {"name": "werror", "value": false, "section": "core", "machine": "any", "type": "boolean", "description": 
"Treat warnings as errors"}, {"name": "wrap_mode", "value": "default", "section": "core", "machine": "any", "choices": ["default", "nofallback", "nodownload", "forcefallback"], "type": "combo", "description": "Wrap mode"}, {"name": "cmake_prefix_path", "value": [], "section": "core", "machine": "host", "type": "array", "description": "List of additional prefixes for cmake to search"}, {"name": "pkg_config_path", "value": [], "section": "core", "machine": "host", "type": "array", "description": "List of additional paths for pkg-config to search"}, {"name": "build.cmake_prefix_path", "value": [], "section": "core", "machine": "build", "type": "array", "description": "List of additional prefixes for cmake to search"}, {"name": "build.pkg_config_path", "value": [], "section": "core", "machine": "build", "type": "array", "description": "List of additional paths for pkg-config to search"}, {"name": "backend_max_links", "value": 0, "section": "backend", "machine": "any", "type": "integer", "description": "Maximum number of linker processes to run or 0 for no limit"}, {"name": "b_asneeded", "value": true, "section": "base", "machine": "any", "type": "boolean", "description": "Use -Wl,--as-needed when linking"}, {"name": "b_bitcode", "value": false, "section": "base", "machine": "any", "type": "boolean", "description": "Generate and embed bitcode (only macOS/iOS/tvOS)"}, {"name": "b_colorout", "value": "always", "section": "base", "machine": "any", "choices": ["auto", "always", "never"], "type": "combo", "description": "Use colored output"}, {"name": "b_coverage", "value": false, "section": "base", "machine": "any", "type": "boolean", "description": "Enable coverage tracking."}, {"name": "b_lto", "value": false, "section": "base", "machine": "any", "type": "boolean", "description": "Use link time optimization"}, {"name": "b_lundef", "value": true, "section": "base", "machine": "any", "type": "boolean", "description": "Use -Wl,--no-undefined when linking"}, {"name": 
"b_ndebug", "value": "if-release", "section": "base", "machine": "any", "choices": ["true", "false", "if-release"], "type": "combo", "description": "Disable asserts"}, {"name": "b_pch", "value": true, "section": "base", "machine": "any", "type": "boolean", "description": "Use precompiled headers"}, {"name": "b_pgo", "value": "off", "section": "base", "machine": "any", "choices": ["off", "generate", "use"], "type": "combo", "description": "Use profile guided optimization"}, {"name": "b_pie", "value": false, "section": "base", "machine": "any", "type": "boolean", "description": "Build executables as position independent"}, {"name": "b_sanitize", "value": "none", "section": "base", "machine": "any", "choices": ["none", "address", "thread", "undefined", "memory", "address,undefined"], "type": "combo", "description": "Code sanitizer to use"}, {"name": "b_staticpic", "value": true, "section": "base", "machine": "any", "type": "boolean", "description": "Build static libraries as position independent"}, {"name": "c_args", "value": ["-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function"], "section": "compiler", "machine": "host", "type": "array", "description": "Extra arguments passed to the C compiler"}, {"name": "c_link_args", "value": ["-O2", "-march=armv8-a"], "section": "compiler", "machine": "host", "type": "array", "description": "Extra arguments passed to the C linker"}, {"name": "c_std", "value": "c99", "section": "compiler", "machine": "host", "choices": ["none", "c89", "c99", "c11", "c17", "c18", "gnu89", "gnu99", "gnu11", "gnu17", "gnu18"], "type": "combo", "description": "C language standard to use"}, {"name": "build.c_args", "value": [], "section": "compiler", "machine": "build", "type": "array", "description": "Extra arguments passed to the C compiler"}, {"name": "build.c_link_args", "value": [], "section": "compiler", "machine": "build", "type": "array", "description": "Extra arguments passed 
to the C linker"}, {"name": "build.c_std", "value": "none", "section": "compiler", "machine": "build", "choices": ["none", "c89", "c99", "c11", "c17", "gnu89", "gnu99", "gnu11", "gnu17"], "type": "combo", "description": "C language standard to use"}, {"name": "bindir", "value": "bin", "section": "directory", "machine": "any", "type": "string", "description": "Executable directory"}, {"name": "datadir", "value": "share", "section": "directory", "machine": "any", "type": "string", "description": "Data file directory"}, {"name": "includedir", "value": "include", "section": "directory", "machine": "any", "type": "string", "description": "Header file directory"}, {"name": "infodir", "value": "share/info", "section": "directory", "machine": "any", "type": "string", "description": "Info page directory"}, {"name": "libdir", "value": "lib", "section": "directory", "machine": "any", "type": "string", "description": "Library directory"}, {"name": "libexecdir", "value": "libexec", "section": "directory", "machine": "any", "type": "string", "description": "Library executable directory"}, {"name": "localedir", "value": "share/locale", "section": "directory", "machine": "any", "type": "string", "description": "Locale data directory"}, {"name": "localstatedir", "value": "var", "section": "directory", "machine": "any", "type": "string", "description": "Localstate data directory"}, {"name": "mandir", "value": "share/man", "section": "directory", "machine": "any", "type": "string", "description": "Manual page directory"}, {"name": "prefix", "value": "/usr/local", "section": "directory", "machine": "any", "type": "string", "description": "Installation prefix"}, {"name": "sbindir", "value": "sbin", "section": "directory", "machine": "any", "type": "string", "description": "System executable directory"}, {"name": "sharedstatedir", "value": "com", "section": "directory", "machine": "any", "type": "string", "description": "Architecture-independent data directory"}, {"name": "sysconfdir", 
"value": "etc", "section": "directory", "machine": "any", "type": "string", "description": "Sysconf data directory"}, {"name": "bitdepths", "value": ["8", "16"], "section": "user", "machine": "any", "type": "array", "description": "Enable only specified bitdepths"}, {"name": "enable_asm", "value": true, "section": "user", "machine": "any", "type": "boolean", "description": "Build asm files, if available"}, {"name": "enable_examples", "value": false, "section": "user", "machine": "any", "type": "boolean", "description": "Build dav1d examples"}, {"name": "enable_tests", "value": true, "section": "user", "machine": "any", "type": "boolean", "description": "Build dav1d tests"}, {"name": "enable_tools", "value": true, "section": "user", "machine": "any", "type": "boolean", "description": "Build dav1d cli tools"}, {"name": "fuzzer_ldflags", "value": "", "section": "user", "machine": "any", "type": "string", "description": "Extra LDFLAGS used during linking of fuzzing binaries"}, {"name": "fuzzing_engine", "value": "none", "section": "user", "machine": "any", "choices": ["none", "libfuzzer", "oss-fuzz"], "type": "combo", "description": "Select the fuzzing engine"}, {"name": "logging", "value": true, "section": "user", "machine": "any", "type": "boolean", "description": "Print error log messages using the provided callback function"}, {"name": "stack_alignment", "value": 0, "section": "user", "machine": "any", "type": "integer", "description": "stack_alignment"}, {"name": "testdata_tests", "value": false, "section": "user", "machine": "any", "type": "boolean", "description": "Run tests requiring the test data repository"}, {"name": "errorlogs", "value": true, "section": "test", "machine": "any", "type": "boolean", "description": "Whether to print the logs from failing tests"}, {"name": "stdsplit", "value": true, "section": "test", "machine": "any", "type": "boolean", "description": "Split stdout and stderr in test logs"}]
\ No newline at end of file
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildsystem_files.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildsystem_files.json
deleted file mode 100644
index 88c4bacbb..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-buildsystem_files.json
+++ /dev/null
@@ -1 +0,0 @@
-["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/meson_options.txt", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/examples/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/doc/meson.build", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build"]
\ No newline at end of file
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-dependencies.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-dependencies.json
deleted file mode 100644
index 9c1e6b55a..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-dependencies.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"name": "threads", "compile_args": ["-pthread"], "link_args": ["-pthread"]}]
\ No newline at end of file
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-installed.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-installed.json
deleted file mode 100644
index bd1580bb8..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-installed.json
+++ /dev/null
@@ -1 +0,0 @@
-{"/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d.a": "/usr/local/lib/libdav1d.a", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/dav1d": "/usr/local/bin/dav1d", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/dav1d.pc": "/usr/local/lib/pkgconfig/dav1d.pc", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/common.h": "/usr/local/include/dav1d/common.h", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/data.h": "/usr/local/include/dav1d/data.h", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/dav1d.h": "/usr/local/include/dav1d/dav1d.h", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/headers.h": "/usr/local/include/dav1d/headers.h", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d/picture.h": "/usr/local/include/dav1d/picture.h", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d/version.h": "/usr/local/include/dav1d/version.h"}
\ No newline at end of file
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-projectinfo.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-projectinfo.json
deleted file mode 100644
index 77d5b5642..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-projectinfo.json
+++ /dev/null
@@ -1 +0,0 @@
-{"version": "0.4.0", "descriptive_name": "dav1d", "subproject_dir": "subprojects", "subprojects": []}
\ No newline at end of file
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-targets.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-targets.json
deleted file mode 100644
index f4efb192a..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-targets.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"name": "vcs_version.h", "id": "c0cbff0@@vcs_version.h@cus", "type": "custom", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/vcs_version.h"], "build_by_default": true, "target_sources": [{"language": "unknown", "compiler": ["/Library/Frameworks/Python.framework/Versions/3.8/bin/meson", "--internal", "vcstagger", "@INPUT0@", "@OUTPUT0@", "0.4.0", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "@VCS_TAG@", "(.*)", "/Library/Developer/CommandLineTools/usr/bin/git", "--git-dir", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/.git", "describe", "--tags", "--long", "--match", "?.*.*", "--always"], "parameters": [], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/vcs_version.h.in"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_entrypoint", "id": "25a6634@@dav1d_entrypoint@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d_entrypoint.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/25a6634@@dav1d_entrypoint@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", 
"-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lib.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/thread_task.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_bitdepth_8", "id": "25a6634@@dav1d_bitdepth_8@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d_bitdepth_8.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/25a6634@@dav1d_bitdepth_8@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", 
"-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC", "-DBITDEPTH=8"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cdef_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cdef_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/fg_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/film_grain_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ipred_prepare_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ipred_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/itx_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lf_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/loopfilter_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/looprestoration_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lr_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/mc_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/recon_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/cdef_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/ipred_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/itx_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/loopfilter_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/looprestoration_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/mc_init_tmpl.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": 
"dav1d_bitdepth_16", "id": "25a6634@@dav1d_bitdepth_16@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d_bitdepth_16.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/25a6634@@dav1d_bitdepth_16@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC", "-DBITDEPTH=16"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cdef_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cdef_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/fg_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/film_grain_tmpl.c", 
"/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ipred_prepare_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ipred_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/itx_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lf_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/loopfilter_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/looprestoration_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lr_apply_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/mc_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/recon_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/cdef_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/ipred_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/itx_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/loopfilter_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/looprestoration_init_tmpl.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/mc_init_tmpl.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_arch_bitdepth_8", "id": "25a6634@@dav1d_arch_bitdepth_8@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d_arch_bitdepth_8.a"], "build_by_default": false, "target_sources": [{"language": "unknown", "compiler": [], "parameters": [], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/config.h"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_arch_bitdepth_16", "id": "25a6634@@dav1d_arch_bitdepth_16@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": 
["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d_arch_bitdepth_16.a"], "build_by_default": false, "target_sources": [{"language": "unknown", "compiler": [], "parameters": [], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/config.h"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d", "id": "25a6634@@dav1d@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/libdav1d.a"], "build_by_default": true, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src/25a6634@@dav1d@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC", "-pthread"], "sources": 
["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cdf.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/cpu.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/data.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/decode.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/dequant_tables.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/getbits.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/intra_edge.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/lf_mask.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/log.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/msac.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/obu.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/picture.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/qm.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ref.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/ref_mvs.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/scan.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/tables.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/warpmv.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/wedge.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/cpu.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/cdef.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/ipred.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/itx.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/loopfilter.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/looprestoration.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/mc.S", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/src/arm/64/msac.S"], "generated_sources": []}], "subproject": null, "installed": true, "install_filename": ["/usr/local/lib/libdav1d.a"]}, 
{"name": "dav1d_input", "id": "f9d35d4@@dav1d_input@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/libdav1d_input.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/f9d35d4@@dav1d_input@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/input/input.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/input/annexb.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/input/ivf.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_output", "id": "f9d35d4@@dav1d_output@sta", "type": 
"static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/libdav1d_output.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/f9d35d4@@dav1d_output@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/output/md5.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/output/null.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/output/output.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/output/y4m2.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/output/yuv.c"], "generated_sources": []}], "subproject": null, 
"installed": false}, {"name": "dav1d", "id": "f9d35d4@@dav1d@exe", "type": "executable", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/dav1d"], "build_by_default": true, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools/f9d35d4@@dav1d@exe", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIE", "-pthread"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/dav1d.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tools/dav1d_cli_parse.c"], "generated_sources": []}], "subproject": null, "installed": true, "install_filename": ["/usr/local/bin/dav1d"]}, {"name": "checkasm_bitdepth_8", "id": "59830eb@@checkasm_bitdepth_8@sta", "type": "static library", 
"defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/libcheckasm_bitdepth_8.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@checkasm_bitdepth_8@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC", "-DBITDEPTH=8"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/cdef.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/filmgrain.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/ipred.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/itx.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/loopfilter.c", 
"/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/looprestoration.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/mc.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "checkasm_bitdepth_16", "id": "59830eb@@checkasm_bitdepth_16@sta", "type": "static library", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/libcheckasm_bitdepth_16.a"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@checkasm_bitdepth_16@sta", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIC", "-DBITDEPTH=16"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/cdef.c", 
"/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/filmgrain.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/ipred.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/itx.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/loopfilter.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/looprestoration.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/mc.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "checkasm", "id": "59830eb@@checkasm@exe", "type": "executable", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/checkasm"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@checkasm@exe", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", 
"-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIE", "-pthread"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/checkasm.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/msac.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/checkasm/arm/checkasm_64.S"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_fuzzer", "id": "59830eb@@dav1d_fuzzer@exe", "type": "executable", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/dav1d_fuzzer"], "build_by_default": true, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@dav1d_fuzzer@exe", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", 
"-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIE", "-pthread"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/dav1d_fuzzer.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/main.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_fuzzer_mt", "id": "59830eb@@dav1d_fuzzer_mt@exe", "type": "executable", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/dav1d_fuzzer_mt"], "build_by_default": true, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@dav1d_fuzzer_mt@exe", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", 
"-Wno-unused-function", "-fPIE", "-pthread", "-DDAV1D_MT_FUZZING"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/dav1d_fuzzer.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/main.c"], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "libdav1d_af", "id": "59830eb@@libdav1d_af@cus", "type": "custom", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/libdav1d_af.a"], "build_by_default": false, "target_sources": [{"language": "unknown", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-objcopy", "--redefine-sym", "malloc=__wrap_malloc", "--redefine-sym", "posix_memalign=__wrap_posix_memalign", "--redefine-sym", "pthread_create=__wrap_pthread_create", "--redefine-sym", "pthread_cond_init=__wrap_pthread_cond_init", "--redefine-sym", "pthread_mutex_init=__wrap_pthread_mutex_init", "@INPUT@", "@OUTPUT@"], "parameters": [], "sources": [], "generated_sources": []}], "subproject": null, "installed": false}, {"name": "dav1d_fuzzer_mem", "id": "59830eb@@dav1d_fuzzer_mem@exe", "type": "executable", "defined_in": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/meson.build", "filename": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/dav1d_fuzzer_mem"], "build_by_default": false, "target_sources": [{"language": "c", "compiler": ["/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang"], "parameters": ["-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/59830eb@@dav1d_fuzzer_mem@exe", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", 
"-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include/dav1d", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/include", "-I/Users/zlin/workspace/mxcore/media_player/jni/dav1d/include", "-Xclang", "-fcolor-diagnostics", "-DNDEBUG", "-pipe", "-D_FILE_OFFSET_BITS=64", "-Wall", "-Winvalid-pch", "-Wextra", "-std=c99", "-O3", "-D_POSIX_C_SOURCE=200112L", "-fvisibility=hidden", "-Wundef", "-Werror=vla", "-Wno-missing-field-initializers", "-Wno-unused-parameter", "-Werror=missing-prototypes", "-Wshorten-64-to-32", "-fomit-frame-pointer", "-ffast-math", "-fstack-protector", "-fstrict-aliasing", "-Wno-deprecated-declarations", "-Wno-unused-variable", "-Wno-unused-function", "-fPIE", "-pthread", "-DDAV1D_ALLOC_FAIL"], "sources": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/dav1d_fuzzer.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/main.c", "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/tests/libfuzzer/alloc_fail.c"], "generated_sources": []}], "subproject": null, "installed": false}]
\ No newline at end of file
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-tests.json b/ffmpeg/JNI/dav1d/builddir/meson-info/intro-tests.json
deleted file mode 100644
index d2b35d2fa..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-info/intro-tests.json
+++ /dev/null
@@ -1 +0,0 @@
-[{"cmd": ["/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/tests/checkasm"], "env": {}, "name": "checkasm", "workdir": null, "timeout": 30, "suite": ["dav1d"], "is_parallel": false, "priority": 0}]
\ No newline at end of file
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-info/meson-info.json b/ffmpeg/JNI/dav1d/builddir/meson-info/meson-info.json
deleted file mode 100644
index a66097271..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-info/meson-info.json
+++ /dev/null
@@ -1 +0,0 @@
-{"meson_version": {"full": "0.52.999", "major": 0, "minor": 52, "patch": 999}, "directories": {"source": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d", "build": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir", "info": "/Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-info"}, "introspection": {"version": {"full": "1.0.0", "major": 1, "minor": 0, "patch": 0}, "information": {"benchmarks": {"file": "intro-benchmarks.json", "updated": true}, "buildoptions": {"file": "intro-buildoptions.json", "updated": true}, "buildsystem_files": {"file": "intro-buildsystem_files.json", "updated": true}, "dependencies": {"file": "intro-dependencies.json", "updated": true}, "installed": {"file": "intro-installed.json", "updated": true}, "projectinfo": {"file": "intro-projectinfo.json", "updated": true}, "targets": {"file": "intro-targets.json", "updated": true}, "tests": {"file": "intro-tests.json", "updated": true}}}, "build_files_updated": true, "error": false}
\ No newline at end of file
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-logs/meson-log.txt b/ffmpeg/JNI/dav1d/builddir/meson-logs/meson-log.txt
deleted file mode 100644
index 4b5a3428c..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-logs/meson-log.txt
+++ /dev/null
@@ -1,456 +0,0 @@
-Build started at 2020-03-23T15:35:00.168028
-Main binary: /Library/Frameworks/Python.framework/Versions/3.8/bin/python3
-Build Options: -Ddefault_library=static '--cross-file cross_file.txt'
-Python system: Darwin
-The Meson build system
-Version: 0.52.999
-Source dir: /Users/zlin/workspace/mxcore/media_player/jni/dav1d
-Build dir: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir
-Build type: cross build
-Project name: dav1d
-Project version: 0.4.0
-No CFLAGS in the environment, not changing global flags.
-No LDFLAGS in the environment, not changing global flags.
-No CPPFLAGS in the environment, not changing global flags.
-Sanity testing C compiler: cc
-Is cross compiler: False.
-Sanity check compiler command line: cc /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/sanitycheckc.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/sanitycheckc.exe -pipe
-Sanity check compile stdout:
-
------
-Sanity check compile stderr:
-
------
-Running test binary command: /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/sanitycheckc.exe
-C compiler for the build machine: cc (clang 10.0.1 "Apple LLVM version 10.0.1 (clang-1001.0.46.4)")
-C linker for the build machine: APPLE ld 450.3
-Sanity testing C compiler: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang
-Is cross compiler: True.
-Sanity check compiler command line: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/sanitycheckc.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/sanitycheckc_cross.exe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -pipe -D_FILE_OFFSET_BITS=64 -c
-Sanity check compile stdout:
-
------
-Sanity check compile stderr:
-
------
-C compiler for the host machine: /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang (clang 8.0.7 "Android (5220042 based on r346389c) clang version 8.0.7 (https://android.googlesource.com/toolchain/clang b55f2d4ebfd35bf643d27dbca1bb228957008617) (https://android.googlesource.com/toolchain/llvm 3c393fe7a7e13b0fba4ac75a01aa683d7a5b11cd) (based on LLVM 8.0.7svn)")
-C linker for the host machine: GNU ld.bfd 2.27.0.20170315
-Build machine cpu family: x86_64
-Build machine cpu: x86_64
-Host machine cpu family: aarch64
-Host machine cpu: armv8-a
-Target machine cpu family: aarch64
-Target machine cpu: armv8-a
-Run-time dependency threads found: YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprufufw_i
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprufufw_i/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprufufw_i/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 
-
-Code:
- 
-        #include <stdatomic.h>
-Compiler stdout:
- 
-Compiler stderr:
- 
-Check usable header "stdatomic.h" : YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpxcnv6_zq
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpxcnv6_zq/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpxcnv6_zq/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 
-
-Code:
- 
-        #include <unistd.h>
-Compiler stdout:
- 
-Compiler stderr:
- 
-Check usable header "unistd.h" : YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp48omqtt0
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp48omqtt0/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp48omqtt0/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 
-
-Code:
- 
-        #include <io.h>
-Compiler stdout:
- 
-Compiler stderr:
- /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp48omqtt0/testfile.c:2:18: fatal error: 'io.h' file not found
-        #include <io.h>
-                 ^~~~~~
-1 error generated.
-
-Check usable header "io.h" : NO 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2d5m5mp2
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2d5m5mp2/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2d5m5mp2/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a 
-
-Code:
- #include <getopt.h>
-#include <limits.h>
-
-        #if defined __stub_getopt_long || defined __stub___getopt_long
-        fail fail fail this function is not going to work
-        #endif
-        
-int main() {
-            void *a = (void*) &getopt_long;
-            long b = (long) a;
-            return (int) b;
-        }
-Compiler stdout:
- 
-Compiler stderr:
- 
-Checking for function "getopt_long" : YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkjff4dua
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkjff4dua/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkjff4dua/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a 
-
-Code:
- #include <stdlib.h>
-#include <limits.h>
-
-        #if defined __stub_posix_memalign || defined __stub___posix_memalign
-        fail fail fail this function is not going to work
-        #endif
-        
-int main() {
-            void *a = (void*) &posix_memalign;
-            long b = (long) a;
-            return (int) b;
-        }
-Compiler stdout:
- 
-Compiler stderr:
- 
-Checking for function "posix_memalign" : YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkggrff7i
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkggrff7i/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkggrff7i/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a 
-
-Code:
- #include <sys/auxv.h>
-#include <limits.h>
-
-        #if defined __stub_getauxval || defined __stub___getauxval
-        fail fail fail this function is not going to work
-        #endif
-        
-int main() {
-            void *a = (void*) &getauxval;
-            long b = (long) a;
-            return (int) b;
-        }
-Compiler stdout:
- 
-Compiler stderr:
- 
-Checking for function "getauxval" : YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2o5yf6wc
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2o5yf6wc/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2o5yf6wc/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a 
-
-Code:
- #include <sys/auxv.h>
-#include <limits.h>
-
-        #if defined __stub_elf_aux_info || defined __stub___elf_aux_info
-        fail fail fail this function is not going to work
-        #endif
-        
-int main() {
-            void *a = (void*) &elf_aux_info;
-            long b = (long) a;
-            return (int) b;
-        }
-Compiler stdout:
- 
-Compiler stderr:
- /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2o5yf6wc/testfile.c:9:32: error: use of undeclared identifier 'elf_aux_info'
-            void *a = (void*) &elf_aux_info;
-                               ^
-1 error generated.
-
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpzkegp55e
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpzkegp55e/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpzkegp55e/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a 
-
-Code:
- #include <sys/auxv.h>
-        int main() {
-        #ifdef __has_builtin
-            #if !__has_builtin(__builtin_elf_aux_info)
-                #error "__builtin_elf_aux_info not found"
-            #endif
-        #elif ! defined(elf_aux_info)
-            /* Check for __builtin_elf_aux_info only if no includes were added to the
-             * prefix above, which means no definition of elf_aux_info can be found.
-             * We would always check for this, but we get false positives on
-             * MSYS2 if we do. Their toolchain is broken, but we can at least
-             * give them a workaround. */
-            #if 0
-                __builtin_elf_aux_info;
-            #else
-                #error "No definition for __builtin_elf_aux_info found in the prefix"
-            #endif
-        #endif
-        return 0;
-        }
-Compiler stdout:
- 
-Compiler stderr:
- /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpzkegp55e/testfile.c:5:18: error: "__builtin_elf_aux_info not found"
-                #error "__builtin_elf_aux_info not found"
-                 ^
-1 error generated.
-
-Checking for function "elf_aux_info" : NO 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprulkzab9
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprulkzab9/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmprulkzab9/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -fvisibility=hidden 
-
-Code:
- int i;
-
-Compiler stdout:
- 
-Compiler stderr:
- 
-Compiler for C supports arguments -fvisibility=hidden: YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpmdzrl4a8
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpmdzrl4a8/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpmdzrl4a8/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Wundef 
-
-Code:
- int i;
-
-Compiler stdout:
- 
-Compiler stderr:
- 
-Compiler for C supports arguments -Wundef: YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpr4sgx2cq
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpr4sgx2cq/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpr4sgx2cq/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Werror=vla 
-
-Code:
- int i;
-
-Compiler stdout:
- 
-Compiler stderr:
- 
-Compiler for C supports arguments -Werror=vla: YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2rmq3j9k
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2rmq3j9k/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp2rmq3j9k/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Wno-maybe-uninitialized -Wmaybe-uninitialized 
-
-Code:
- int i;
-
-Compiler stdout:
- 
-Compiler stderr:
- error: unknown warning option '-Wno-maybe-uninitialized'; did you mean '-Wno-uninitialized'? [-Werror,-Wunknown-warning-option]
-error: unknown warning option '-Wmaybe-uninitialized'; did you mean '-Wuninitialized'? [-Werror,-Wunknown-warning-option]
-
-Compiler for C supports arguments -Wno-maybe-uninitialized: NO 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpfxlqgycz
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpfxlqgycz/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpfxlqgycz/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Wno-missing-field-initializers -Wmissing-field-initializers 
-
-Code:
- int i;
-
-Compiler stdout:
- 
-Compiler stderr:
- 
-Compiler for C supports arguments -Wno-missing-field-initializers: YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpb8_6epq5
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpb8_6epq5/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpb8_6epq5/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Wno-unused-parameter -Wunused-parameter 
-
-Code:
- int i;
-
-Compiler stdout:
- 
-Compiler stderr:
- 
-Compiler for C supports arguments -Wno-unused-parameter: YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpk46ovwwt
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpk46ovwwt/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpk46ovwwt/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Werror=missing-prototypes 
-
-Code:
- int i;
-
-Compiler stdout:
- 
-Compiler stderr:
- 
-Compiler for C supports arguments -Werror=missing-prototypes: YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpuw44p2eo
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpuw44p2eo/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpuw44p2eo/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -Wshorten-64-to-32 
-
-Code:
- int i;
-
-Compiler stdout:
- 
-Compiler stderr:
- 
-Compiler for C supports arguments -Wshorten-64-to-32: YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp7sukpzez
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp7sukpzez/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp7sukpzez/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -fomit-frame-pointer 
-
-Code:
- int i;
-
-Compiler stdout:
- 
-Compiler stderr:
- 
-Compiler for C supports arguments -fomit-frame-pointer: YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp1ru1ei4f
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp1ru1ei4f/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp1ru1ei4f/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Werror=unknown-warning-option -Werror=unused-command-line-argument -Werror=ignored-optimization-argument -ffast-math 
-
-Code:
- int i;
-
-Compiler stdout:
- 
-Compiler stderr:
- 
-Compiler for C supports arguments -ffast-math: YES 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp4hyo7jwr
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp4hyo7jwr/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmp4hyo7jwr/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 
-
-Code:
- __asm__ (
-".func meson_test"
-".endfunc"
-);
-
-Compiler stdout:
- 
-Compiler stderr:
- <inline asm>:1:1: error: unknown directive
-.func meson_test.endfunc
-^
-1 error generated.
-
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkzjeycwv
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkzjeycwv/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkzjeycwv/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 
-
-Code:
- 
-#if defined(PIC)
-#error "PIC already defined"
-#elif !(defined(__PIC__) || defined(__pic__))
-#error "no pic"
-#endif
-
-Compiler stdout:
- 
-Compiler stderr:
- 
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpicm365yp
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpicm365yp/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpicm365yp/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -O0 
-
-Code:
- #ifdef __cplusplus
-        extern "C" {
-        #endif
-        void meson_uscore_prefix () {}
-        #ifdef __cplusplus
-        }
-        #endif
-        
-Compiler stdout:
- 
-Compiler stderr:
- 
-Symbols have underscore prefix: NO
-Configuring config.h using configuration
-Configuring version.h using configuration
-Program doxygen found: NO
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpx54mpw0t
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpx54mpw0t/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpx54mpw0t/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a 
-
-Code:
- #include <time.h>
-#include <limits.h>
-
-        #if defined __stub_clock_gettime || defined __stub___clock_gettime
-        fail fail fail this function is not going to work
-        #endif
-        
-int main() {
-            void *a = (void*) &clock_gettime;
-            long b = (long) a;
-            return (int) b;
-        }
-Compiler stdout:
- 
-Compiler stderr:
- 
-Checking for function "clock_gettime" : YES 
-Configuring cli_config.h using configuration
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpqp9f8eru
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpqp9f8eru/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpqp9f8eru/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -Wl,--start-group -lm -Wl,--end-group -Wl,--allow-shlib-undefined -O2 -march=armv8-a 
-
-Code:
- int main() { return 0; }
-Compiler stdout:
- 
-Compiler stderr:
- 
-Library m found: YES
-Adding test "checkasm"
-Using cached compile:
-Cached command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkjff4dua/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpkjff4dua/output.exe -pipe -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 -std=c99 -D_POSIX_C_SOURCE=200112L -O2 -march=armv8-a 
-
-Code:
- #include <stdlib.h>
-#include <limits.h>
-
-        #if defined __stub_posix_memalign || defined __stub___posix_memalign
-        fail fail fail this function is not going to work
-        #endif
-        
-int main() {
-            void *a = (void*) &posix_memalign;
-            long b = (long) a;
-            return (int) b;
-        }
-Cached compiler stdout:
- 
-Cached compiler stderr:
- 
-Checking for function "posix_memalign" : YES (cached)
-Build targets in project: 17
-Found ninja-1.9.0.git at /Users/zlin/workspace/ninja/ninja
-Running compile:
-Working directory:  /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpjdmh12ol
-Command line:  /Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpjdmh12ol/testfile.c -o /Users/zlin/workspace/mxcore/media_player/jni/dav1d/builddir/meson-private/tmpjdmh12ol/output.obj -pipe -c -fstack-protector -fstrict-aliasing -Wno-deprecated-declarations -Wno-unused-variable -Wno-unused-function -D_FILE_OFFSET_BITS=64 -O0 --print-search-dirs 
-
-Code:
- 
-Compiler stdout:
- programs: =/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../lib/gcc/aarch64-linux-android/4.9.x/../../../../aarch64-linux-android/bin
-libraries: =/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/lib64/clang/8.0.7:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/lib64/clang/8.0.7/lib/linux/aarch64:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../lib/gcc/aarch64-linux-android/4.9.x:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../lib/gcc/aarch64-linux-android/4.9.x/../../../../aarch64-linux-android/lib/../lib64:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../sysroot/usr/lib/aarch64-linux-android/21:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../sysroot/usr/lib/aarch64-linux-android:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../lib/gcc/aarch64-linux-android/4.9.x/../../../../aarch64-linux-android/lib:/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/../sysroot/usr/lib
-
-Compiler stderr:
- 
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/build.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/build.dat
deleted file mode 100644
index bacb3788e..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/build.dat and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/cleantrees.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/cleantrees.dat
deleted file mode 100644
index 7ee7736b4..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/cleantrees.dat and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/cmd_line.txt b/ffmpeg/JNI/dav1d/builddir/meson-private/cmd_line.txt
deleted file mode 100644
index 061fe6d4e..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-private/cmd_line.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-[options]
-default_library = static
-
-[properties]
-cross_file = ['cross_file.txt']
-
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/coredata.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/coredata.dat
deleted file mode 100644
index 9dbbb6bc9..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/coredata.dat and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/dav1d.pc b/ffmpeg/JNI/dav1d/builddir/meson-private/dav1d.pc
deleted file mode 100644
index 1e76bcefe..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-private/dav1d.pc
+++ /dev/null
@@ -1,9 +0,0 @@
-prefix=/usr/local
-libdir=${prefix}/lib
-includedir=${prefix}/include
-
-Name: libdav1d
-Description: AV1 decoding library
-Version: 0.4.0
-Libs: -L${libdir} -ldav1d -pthread
-Cflags: -I${includedir} -pthread
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/install.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/install.dat
deleted file mode 100644
index 9c6afee32..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/install.dat and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/meson.lock b/ffmpeg/JNI/dav1d/builddir/meson-private/meson.lock
deleted file mode 100644
index e69de29bb..000000000
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/meson_benchmark_setup.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/meson_benchmark_setup.dat
deleted file mode 100644
index 92c3c883e..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-private/meson_benchmark_setup.dat
+++ /dev/null
@@ -1 +0,0 @@
-�]�.
\ No newline at end of file
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/meson_test_setup.dat b/ffmpeg/JNI/dav1d/builddir/meson-private/meson_test_setup.dat
deleted file mode 100644
index f8cf8d627..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/meson_test_setup.dat and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.c b/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.c
deleted file mode 100644
index 0f968e8aa..000000000
--- a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.c
+++ /dev/null
@@ -1 +0,0 @@
-int main() { int class=0; return class; }
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.exe b/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.exe
deleted file mode 100755
index 2fcafd29f..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc.exe and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc_cross.exe b/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc_cross.exe
deleted file mode 100644
index e0c2aa1b4..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/meson-private/sanitycheckc_cross.exe and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/src/libdav1d.a b/ffmpeg/JNI/dav1d/builddir/src/libdav1d.a
deleted file mode 100644
index e7d8469be..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/src/libdav1d.a and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/tests/checkasm b/ffmpeg/JNI/dav1d/builddir/tests/checkasm
deleted file mode 100755
index 5a699c54a..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/tests/checkasm and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer b/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer
deleted file mode 100755
index 590c8143b..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer_mt b/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer_mt
deleted file mode 100755
index 59703d26a..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/tests/dav1d_fuzzer_mt and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/tools/cli_config.h b/ffmpeg/JNI/dav1d/builddir/tools/cli_config.h
deleted file mode 100644
index 21660b629..000000000
--- a/ffmpeg/JNI/dav1d/builddir/tools/cli_config.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * Autogenerated by the Meson build system.
- * Do not edit, your changes will be lost.
- */
-
-#pragma once
-
-#define HAVE_CLOCK_GETTIME 1
-
diff --git a/ffmpeg/JNI/dav1d/builddir/tools/dav1d b/ffmpeg/JNI/dav1d/builddir/tools/dav1d
deleted file mode 100755
index 35c051152..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/tools/dav1d and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_input.a b/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_input.a
deleted file mode 100644
index d51ef9423..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_input.a and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_output.a b/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_output.a
deleted file mode 100644
index 590677be4..000000000
Binary files a/ffmpeg/JNI/dav1d/builddir/tools/libdav1d_output.a and /dev/null differ
diff --git a/ffmpeg/JNI/dav1d/cross_file.txt b/ffmpeg/JNI/dav1d/cross_file.txt
deleted file mode 100644
index 86f1c78f4..000000000
--- a/ffmpeg/JNI/dav1d/cross_file.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-[binaries]
-c = '/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android21-clang'
-ar = '/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-ar'
-objcopy = '/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-objcopy'
-strip = '/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/bin/aarch64-linux-android-strip'
-
-[properties]
-sys_root = '/Users/zlin/workspace/android-ndk-r20/toolchains/llvm/prebuilt/darwin-x86_64/sysroot'
-c_args = ['-fstack-protector','-fstrict-aliasing','-Wno-deprecated-declarations','-Wno-unused-variable','-Wno-unused-function']
-c_link_args =['-O2','-march=armv8-a']
-
-[host_machine]
-system = 'android'
-cpu_family = 'aarch64'
-cpu = 'armv8-a'
-endian = 'little'
diff --git a/ffmpeg/JNI/dav1d/doc/meson.build b/ffmpeg/JNI/dav1d/doc/meson.build
index 4badbf6ea..0ef712344 100644
--- a/ffmpeg/JNI/dav1d/doc/meson.build
+++ b/ffmpeg/JNI/dav1d/doc/meson.build
@@ -27,8 +27,8 @@ dot = find_program('dot', required: false)
 
 if doxygen.found() and dot.found()
     conf_data = configuration_data()
-    conf_data.set('DOXYGEN_INPUT', join_paths(meson.source_root(), 'include/dav1d'))
-    conf_data.set('DOXYGEN_STRIP', join_paths(meson.source_root(), 'include'))
+    conf_data.set('DOXYGEN_INPUT', join_paths(dav1d_src_root, 'include/dav1d'))
+    conf_data.set('DOXYGEN_STRIP', join_paths(dav1d_src_root, 'include'))
     conf_data.set('DOXYGEN_OUTPUT', meson.current_build_dir())
     doxyfile = configure_file(input: 'Doxyfile.in',
                               output: 'Doxyfile',
diff --git a/ffmpeg/JNI/dav1d/examples/dav1dplay.c b/ffmpeg/JNI/dav1d/examples/dav1dplay.c
index bcd4835b3..d6bb262b5 100644
--- a/ffmpeg/JNI/dav1d/examples/dav1dplay.c
+++ b/ffmpeg/JNI/dav1d/examples/dav1dplay.c
@@ -29,687 +29,18 @@
 
 #include <getopt.h>
 #include <stdbool.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
 
 #include <SDL.h>
 
-#include "common/attributes.h"
-
 #include "dav1d/dav1d.h"
 
+#include "common/attributes.h"
 #include "tools/input/input.h"
+#include "dp_fifo.h"
+#include "dp_renderer.h"
 
-/**
- * Settings structure
- * Hold all settings available for the player,
- * this is usually filled by parsing arguments
- * from the console.
- */
-typedef struct {
-    const char *inputfile;
-    int highquality;
-    int untimed;
-    int zerocopy;
-} Dav1dPlaySettings;
-
-#define WINDOW_WIDTH  910
-#define WINDOW_HEIGHT 512
-
-#define DAV1D_EVENT_NEW_FRAME 1
-#define DAV1D_EVENT_DEC_QUIT  2
-
-/*
- * Fifo helper functions
- */
-typedef struct dp_fifo
-{
-    SDL_mutex *lock;
-    SDL_cond *cond_change;
-    size_t capacity;
-    size_t count;
-    void **entries;
-} Dav1dPlayPtrFifo;
-
-static void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo)
-{
-    assert(fifo->count == 0);
-    SDL_DestroyMutex(fifo->lock);
-    SDL_DestroyCond(fifo->cond_change);
-    free(fifo->entries);
-    free(fifo);
-}
-
-static Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity)
-{
-    Dav1dPlayPtrFifo *fifo;
-
-    assert(capacity > 0);
-    if (capacity <= 0)
-        return NULL;
-
-    fifo = malloc(sizeof(*fifo));
-    if (fifo == NULL)
-        return NULL;
-
-    fifo->capacity = capacity;
-    fifo->count = 0;
-
-    fifo->lock = SDL_CreateMutex();
-    if (fifo->lock == NULL) {
-        free(fifo);
-        return NULL;
-    }
-    fifo->cond_change = SDL_CreateCond();
-    if (fifo->cond_change == NULL) {
-        SDL_DestroyMutex(fifo->lock);
-        free(fifo);
-        return NULL;
-    }
-
-    fifo->entries = calloc(capacity, sizeof(void*));
-    if (fifo->entries == NULL) {
-        dp_fifo_destroy(fifo);
-        return NULL;
-    }
-
-    return fifo;
-}
-
-static void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element)
-{
-    SDL_LockMutex(fifo->lock);
-    while (fifo->count == fifo->capacity)
-        SDL_CondWait(fifo->cond_change, fifo->lock);
-    fifo->entries[fifo->count++] = element;
-    if (fifo->count == 1)
-        SDL_CondSignal(fifo->cond_change);
-    SDL_UnlockMutex(fifo->lock);
-}
-
-static void *dp_fifo_array_shift(void **arr, size_t len)
-{
-    void *shifted_element = arr[0];
-    for (size_t i = 1; i < len; ++i)
-        arr[i-1] = arr[i];
-    return shifted_element;
-}
-
-static void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo)
-{
-    SDL_LockMutex(fifo->lock);
-    while (fifo->count == 0)
-        SDL_CondWait(fifo->cond_change, fifo->lock);
-    void *res = dp_fifo_array_shift(fifo->entries, fifo->count--);
-    if (fifo->count == fifo->capacity - 1)
-        SDL_CondSignal(fifo->cond_change);
-    SDL_UnlockMutex(fifo->lock);
-    return res;
-}
-
-/**
- * Renderer info
- */
-typedef struct rdr_info
-{
-    // Cookie passed to the renderer implementation callbacks
-    void *cookie;
-    // Callback to create the renderer
-    void* (*create_renderer)(void *data);
-    // Callback to destroy the renderer
-    void (*destroy_renderer)(void *cookie);
-    // Callback to the render function that renders a prevously sent frame
-    void (*render)(void *cookie, const Dav1dPlaySettings *settings);
-    // Callback to the send frame function
-    int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
-                        const Dav1dPlaySettings *settings);
-    // Callback for alloc/release pictures (optional)
-    int (*alloc_pic)(Dav1dPicture *pic, void *cookie);
-    void (*release_pic)(Dav1dPicture *pic, void *cookie);
-} Dav1dPlayRenderInfo;
-
-#ifdef HAVE_PLACEBO_VULKAN
-
-#include <libplacebo/renderer.h>
-#include <libplacebo/utils/upload.h>
-#include <libplacebo/vulkan.h>
-#include <SDL_vulkan.h>
-
-
-/**
- * Renderer context for libplacebo
- */
-typedef struct renderer_priv_ctx
-{
-    // Placebo context
-    struct pl_context *ctx;
-    // Placebo renderer
-    struct pl_renderer *renderer;
-    // Placebo Vulkan handle
-    const struct pl_vulkan *vk;
-    // Placebo Vulkan instance
-    const struct pl_vk_inst *vk_inst;
-    // Vulkan surface
-    VkSurfaceKHR surf;
-    // Placebo swapchain
-    const struct pl_swapchain *swapchain;
-    // Lock protecting access to the texture
-    SDL_mutex *lock;
-    // Planes to render
-    struct pl_plane y_plane;
-    struct pl_plane u_plane;
-    struct pl_plane v_plane;
-    // Textures to render
-    const struct pl_tex *y_tex;
-    const struct pl_tex *u_tex;
-    const struct pl_tex *v_tex;
-} Dav1dPlayRendererPrivateContext;
-
-static void *placebo_renderer_create(void *data)
-{
-    // Alloc
-    Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext));
-    if (rd_priv_ctx == NULL) {
-        return NULL;
-    }
-
-    // Init libplacebo
-    rd_priv_ctx->ctx = pl_context_create(PL_API_VER, &(struct pl_context_params) {
-        .log_cb     = pl_log_color,
-#ifndef NDEBUG
-        .log_level  = PL_LOG_DEBUG,
-#else
-        .log_level  = PL_LOG_WARN,
-#endif
-    });
-    if (rd_priv_ctx->ctx == NULL) {
-        free(rd_priv_ctx);
-        return NULL;
-    }
-
-    // Create Mutex
-    rd_priv_ctx->lock = SDL_CreateMutex();
-    if (rd_priv_ctx->lock == NULL) {
-        fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
-        pl_context_destroy(&(rd_priv_ctx->ctx));
-        free(rd_priv_ctx);
-        return NULL;
-    }
-
-    // Init Vulkan
-    struct pl_vk_inst_params iparams = pl_vk_inst_default_params;
-
-    SDL_Window *sdlwin = data;
-
-    unsigned num = 0;
-    if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, NULL)) {
-        fprintf(stderr, "Failed enumerating Vulkan extensions: %s\n", SDL_GetError());
-        exit(1);
-    }
-
-    iparams.extensions = malloc(num * sizeof(const char *));
-    iparams.num_extensions = num;
-    assert(iparams.extensions);
-
-    SDL_bool ok = SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, iparams.extensions);
-    if (!ok) {
-        fprintf(stderr, "Failed getting Vk instance extensions\n");
-        exit(1);
-    }
-
-    if (num > 0) {
-        printf("Requesting %d additional Vulkan extensions:\n", num);
-        for (unsigned i = 0; i < num; i++)
-            printf("    %s\n", iparams.extensions[i]);
-    }
-
-    rd_priv_ctx->vk_inst = pl_vk_inst_create(rd_priv_ctx->ctx, &iparams);
-    if (!rd_priv_ctx->vk_inst) {
-        fprintf(stderr, "Failed creating Vulkan instance!\n");
-        exit(1);
-    }
-    free(iparams.extensions);
-
-    if (!SDL_Vulkan_CreateSurface(sdlwin, rd_priv_ctx->vk_inst->instance, &rd_priv_ctx->surf)) {
-        fprintf(stderr, "Failed creating vulkan surface: %s\n", SDL_GetError());
-        exit(1);
-    }
-
-    struct pl_vulkan_params params = pl_vulkan_default_params;
-    params.instance = rd_priv_ctx->vk_inst->instance;
-    params.surface = rd_priv_ctx->surf;
-    params.allow_software = true;
-
-    rd_priv_ctx->vk = pl_vulkan_create(rd_priv_ctx->ctx, &params);
-    if (!rd_priv_ctx->vk) {
-        fprintf(stderr, "Failed creating vulkan device!\n");
-        exit(2);
-    }
-
-    // Create swapchain
-    rd_priv_ctx->swapchain = pl_vulkan_create_swapchain(rd_priv_ctx->vk,
-        &(struct pl_vulkan_swapchain_params) {
-            .surface = rd_priv_ctx->surf,
-            .present_mode = VK_PRESENT_MODE_IMMEDIATE_KHR,
-        });
-
-    if (!rd_priv_ctx->swapchain) {
-        fprintf(stderr, "Failed creating vulkan swapchain!\n");
-        exit(2);
-    }
-
-    int w = WINDOW_WIDTH, h = WINDOW_HEIGHT;
-    if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) {
-        fprintf(stderr, "Failed resizing vulkan swapchain!\n");
-        exit(2);
-    }
-
-    if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT)
-        printf("Note: window dimensions differ (got %dx%d)\n", w, h);
-
-    rd_priv_ctx->y_tex = NULL;
-    rd_priv_ctx->u_tex = NULL;
-    rd_priv_ctx->v_tex = NULL;
-
-    rd_priv_ctx->renderer = NULL;
-
-    return rd_priv_ctx;
-}
-
-static void placebo_renderer_destroy(void *cookie)
-{
-    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
-    assert(rd_priv_ctx != NULL);
-
-    pl_renderer_destroy(&(rd_priv_ctx->renderer));
-    pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_tex));
-    pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_tex));
-    pl_tex_destroy(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->v_tex));
-    pl_swapchain_destroy(&(rd_priv_ctx->swapchain));
-    pl_vulkan_destroy(&(rd_priv_ctx->vk));
-    vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL);
-    pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst));
-    pl_context_destroy(&(rd_priv_ctx->ctx));
-}
-
-static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
-{
-    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
-    assert(rd_priv_ctx != NULL);
-
-    SDL_LockMutex(rd_priv_ctx->lock);
-    if (rd_priv_ctx->y_tex == NULL) {
-        SDL_UnlockMutex(rd_priv_ctx->lock);
-        return;
-    }
-
-    // Prepare rendering
-    if (rd_priv_ctx->renderer == NULL) {
-        rd_priv_ctx->renderer = pl_renderer_create(rd_priv_ctx->ctx, rd_priv_ctx->vk->gpu);
-    }
-
-    struct pl_swapchain_frame frame;
-    bool ok = pl_swapchain_start_frame(rd_priv_ctx->swapchain, &frame);
-    if (!ok) {
-        SDL_UnlockMutex(rd_priv_ctx->lock);
-        return;
-    }
-
-    const struct pl_tex *img = rd_priv_ctx->y_plane.texture;
-    struct pl_image image = {
-        .num_planes = 3,
-        .planes     = { rd_priv_ctx->y_plane, rd_priv_ctx->u_plane, rd_priv_ctx->v_plane },
-        .repr       = pl_color_repr_hdtv,
-        .color      = pl_color_space_unknown,
-        .width      = img->params.w,
-        .height     = img->params.h,
-    };
-
-    struct pl_render_params render_params = {0};
-    if (settings->highquality)
-        render_params = pl_render_default_params;
-
-    struct pl_render_target target;
-    pl_render_target_from_swapchain(&target, &frame);
-    target.profile = (struct pl_icc_profile) {
-        .data = NULL,
-        .len = 0,
-    };
-
-    if (!pl_render_image(rd_priv_ctx->renderer, &image, &target, &render_params)) {
-        fprintf(stderr, "Failed rendering frame!\n");
-        SDL_UnlockMutex(rd_priv_ctx->lock);
-        return;
-    }
-
-    ok = pl_swapchain_submit_frame(rd_priv_ctx->swapchain);
-    if (!ok) {
-        fprintf(stderr, "Failed submitting frame!\n");
-        SDL_UnlockMutex(rd_priv_ctx->lock);
-        return;
-    }
-
-    pl_swapchain_swap_buffers(rd_priv_ctx->swapchain);
-    SDL_UnlockMutex(rd_priv_ctx->lock);
-}
-
-static int placebo_upload_planes(void *cookie, Dav1dPicture *dav1d_pic,
-                                 const Dav1dPlaySettings *settings)
-{
-    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
-    assert(rd_priv_ctx != NULL);
-
-    SDL_LockMutex(rd_priv_ctx->lock);
-
-    if (dav1d_pic == NULL) {
-        SDL_UnlockMutex(rd_priv_ctx->lock);
-        return 0;
-    }
-
-    int width = dav1d_pic->p.w;
-    int height = dav1d_pic->p.h;
-
-    enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout;
-
-    if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) {
-        fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n");
-        exit(50);
-    }
-
-    struct pl_plane_data data_y = {
-        .type           = PL_FMT_UNORM,
-        .width          = width,
-        .height         = height,
-        .pixel_stride   = 1,
-        .row_stride     = dav1d_pic->stride[0],
-        .component_size = {8},
-        .component_map  = {0},
-    };
-
-    struct pl_plane_data data_u = {
-        .type           = PL_FMT_UNORM,
-        .width          = width/2,
-        .height         = height/2,
-        .pixel_stride   = 1,
-        .row_stride     = dav1d_pic->stride[1],
-        .component_size = {8},
-        .component_map  = {1},
-    };
-
-    struct pl_plane_data data_v = {
-        .type           = PL_FMT_UNORM,
-        .width          = width/2,
-        .height         = height/2,
-        .pixel_stride   = 1,
-        .row_stride     = dav1d_pic->stride[1],
-        .component_size = {8},
-        .component_map  = {2},
-    };
-
-    if (settings->zerocopy) {
-        const struct pl_buf *buf = dav1d_pic->allocator_data;
-        assert(buf);
-        data_y.buf = data_u.buf = data_v.buf = buf;
-        data_y.buf_offset = (uintptr_t) dav1d_pic->data[0] - (uintptr_t) buf->data;
-        data_u.buf_offset = (uintptr_t) dav1d_pic->data[1] - (uintptr_t) buf->data;
-        data_v.buf_offset = (uintptr_t) dav1d_pic->data[2] - (uintptr_t) buf->data;
-    } else {
-        data_y.pixels = dav1d_pic->data[0];
-        data_u.pixels = dav1d_pic->data[1];
-        data_v.pixels = dav1d_pic->data[2];
-    }
-
-    bool ok = true;
-    ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->y_plane), &(rd_priv_ctx->y_tex), &data_y);
-    ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->u_plane), &(rd_priv_ctx->u_tex), &data_u);
-    ok &= pl_upload_plane(rd_priv_ctx->vk->gpu, &(rd_priv_ctx->v_plane), &(rd_priv_ctx->v_tex), &data_v);
-
-    pl_chroma_location_offset(PL_CHROMA_LEFT, &rd_priv_ctx->u_plane.shift_x, &rd_priv_ctx->u_plane.shift_y);
-    pl_chroma_location_offset(PL_CHROMA_LEFT, &rd_priv_ctx->v_plane.shift_x, &rd_priv_ctx->v_plane.shift_y);
-
-    if (!ok) {
-        fprintf(stderr, "Failed uploading planes!\n");
-    }
-
-    SDL_UnlockMutex(rd_priv_ctx->lock);
-    return !ok;
-}
-
-// Align to power of 2
-#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
-
-static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
-{
-    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
-    assert(rd_priv_ctx != NULL);
-    SDL_LockMutex(rd_priv_ctx->lock);
-
-    const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
-    int ret = DAV1D_ERR(ENOMEM);
-
-    // Copied from dav1d_default_picture_alloc
-    const int hbd = p->p.bpc > 8;
-    const int aligned_w = ALIGN2(p->p.w, 128);
-    const int aligned_h = ALIGN2(p->p.h, 128);
-    const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
-    const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
-    const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
-    p->stride[0] = aligned_w << hbd;
-    p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
-
-    // Align strides up to multiples of the GPU performance hints
-    p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
-    p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
-
-    // Aligning offsets to 4 also implicity aligns to the texel size (1 or 2)
-    size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
-    const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
-    const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
-
-    // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
-    // even in the case that the driver gives us insane alignments
-    const size_t pic_size = y_sz + 2 * uv_sz;
-    const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
-
-    // Validate size limitations
-    if (total_size > gpu->limits.max_xfer_size) {
-        printf("alloc of %zu bytes exceeds limits\n", total_size);
-        goto err;
-    }
-
-    const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
-        .type = PL_BUF_TEX_TRANSFER,
-        .host_mapped = true,
-        .size = total_size,
-        .memory_type = PL_BUF_MEM_HOST,
-        .user_data = p,
-    });
-
-    if (!buf) {
-        printf("alloc of GPU mapped buffer failed\n");
-        goto err;
-    }
-
-    assert(buf->data);
-    uintptr_t base = (uintptr_t) buf->data, data[3];
-    data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
-    data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
-    data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
-
-    // Sanity check offset alignment for the sake of debugging
-    if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
-        data[1] - base != ALIGN2(data[1] - base, off_align) ||
-        data[2] - base != ALIGN2(data[2] - base, off_align))
-    {
-        printf("GPU buffer horribly misaligned, expect slowdown!\n");
-    }
-
-    p->allocator_data = (void *) buf;
-    p->data[0] = (void *) data[0];
-    p->data[1] = (void *) data[1];
-    p->data[2] = (void *) data[2];
-    ret = 0;
-
-    // fall through
-err:
-    SDL_UnlockMutex(rd_priv_ctx->lock);
-    return ret;
-}
-
-static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
-{
-    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
-    assert(rd_priv_ctx != NULL);
-    assert(pic->allocator_data);
-
-    SDL_LockMutex(rd_priv_ctx->lock);
-    const struct pl_gpu *gpu = rd_priv_ctx->vk->gpu;
-    pl_buf_destroy(gpu, (const struct pl_buf **) &pic->allocator_data);
-    SDL_UnlockMutex(rd_priv_ctx->lock);
-}
-
-static const Dav1dPlayRenderInfo renderer_info = {
-    .create_renderer = placebo_renderer_create,
-    .destroy_renderer = placebo_renderer_destroy,
-    .render = placebo_render,
-    .update_frame = placebo_upload_planes,
-    .alloc_pic = placebo_alloc_pic,
-    .release_pic = placebo_release_pic,
-};
-
-#else
-
-/**
- * Renderer context for SDL
- */
-typedef struct renderer_priv_ctx
-{
-    // SDL renderer
-    SDL_Renderer *renderer;
-    // Lock protecting access to the texture
-    SDL_mutex *lock;
-    // Texture to render
-    SDL_Texture *tex;
-} Dav1dPlayRendererPrivateContext;
-
-static void *sdl_renderer_create(void *data)
-{
-    SDL_Window *win = data;
-
-    // Alloc
-    Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(Dav1dPlayRendererPrivateContext));
-    if (rd_priv_ctx == NULL) {
-        return NULL;
-    }
-
-    // Create renderer
-    rd_priv_ctx->renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED);
-    // Set scale quality
-    SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "linear");
-
-    // Create Mutex
-    rd_priv_ctx->lock = SDL_CreateMutex();
-    if (rd_priv_ctx->lock == NULL) {
-        fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
-        free(rd_priv_ctx);
-        return NULL;
-    }
-
-    rd_priv_ctx->tex = NULL;
-
-    return rd_priv_ctx;
-}
-
-static void sdl_renderer_destroy(void *cookie)
-{
-    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
-    assert(rd_priv_ctx != NULL);
-
-    SDL_DestroyRenderer(rd_priv_ctx->renderer);
-    SDL_DestroyMutex(rd_priv_ctx->lock);
-    free(rd_priv_ctx);
-}
-
-static void sdl_render(void *cookie, const Dav1dPlaySettings *settings)
-{
-    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
-    assert(rd_priv_ctx != NULL);
-
-    SDL_LockMutex(rd_priv_ctx->lock);
-
-    if (rd_priv_ctx->tex == NULL) {
-        SDL_UnlockMutex(rd_priv_ctx->lock);
-        return;
-    }
-
-    // Display the frame
-    SDL_RenderClear(rd_priv_ctx->renderer);
-    SDL_RenderCopy(rd_priv_ctx->renderer, rd_priv_ctx->tex, NULL, NULL);
-    SDL_RenderPresent(rd_priv_ctx->renderer);
-
-    SDL_UnlockMutex(rd_priv_ctx->lock);
-}
-
-static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
-                              const Dav1dPlaySettings *settings)
-{
-    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
-    assert(rd_priv_ctx != NULL);
-
-    SDL_LockMutex(rd_priv_ctx->lock);
-
-    if (dav1d_pic == NULL) {
-        rd_priv_ctx->tex = NULL;
-        SDL_UnlockMutex(rd_priv_ctx->lock);
-        return 0;
-    }
-
-    int width = dav1d_pic->p.w;
-    int height = dav1d_pic->p.h;
-    int tex_w = width;
-    int tex_h = height;
-
-    enum Dav1dPixelLayout dav1d_layout = dav1d_pic->p.layout;
-
-    if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_layout || dav1d_pic->p.bpc != 8) {
-        fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n");
-        exit(50);
-    }
-
-    SDL_Texture *texture = rd_priv_ctx->tex;
-    if (texture != NULL) {
-        SDL_QueryTexture(texture, NULL, NULL, &tex_w, &tex_h);
-        if (tex_w != width || tex_h != height) {
-            SDL_DestroyTexture(texture);
-            texture = NULL;
-        }
-    }
-
-    if (texture == NULL) {
-        texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV,
-            SDL_TEXTUREACCESS_STREAMING, width, height);
-    }
-
-    SDL_UpdateYUVTexture(texture, NULL,
-        dav1d_pic->data[0], (int)dav1d_pic->stride[0], // Y
-        dav1d_pic->data[1], (int)dav1d_pic->stride[1], // U
-        dav1d_pic->data[2], (int)dav1d_pic->stride[1]  // V
-        );
-
-    rd_priv_ctx->tex = texture;
-    SDL_UnlockMutex(rd_priv_ctx->lock);
-    return 0;
-}
-
-static const Dav1dPlayRenderInfo renderer_info = {
-    .create_renderer = sdl_renderer_create,
-    .destroy_renderer = sdl_renderer_destroy,
-    .render = sdl_render,
-    .update_frame = sdl_update_texture
-};
-
-#endif
+// Callback table of the renderer backend in use; NULL until one is selected
+static const Dav1dPlayRenderInfo *renderer_info = NULL;
 
 /**
  * Render context structure
@@ -722,8 +53,6 @@ typedef struct render_context
     Dav1dPlaySettings settings;
     Dav1dSettings lib_settings;
 
-    // Renderer callbacks
-    Dav1dPlayRenderInfo *renderer_info;
     // Renderer private data (passed to callbacks)
     void *rd_priv;
 
@@ -768,7 +97,9 @@ static void dp_settings_print_usage(const char *const app,
             " --tilethreads $num:   number of tile threads (default: 1)\n"
             " --highquality:        enable high quality rendering\n"
             " --zerocopy/-z:        enable zero copy upload path\n"
-            " --version/-v:         print version and exit\n");
+            " --gpugrain/-g:        enable GPU grain synthesis\n"
+            " --version/-v:         print version and exit\n"
+            " --renderer/-r:        select renderer backend (default: auto)\n");
     exit(1);
 }
 
@@ -791,7 +122,7 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
     Dav1dSettings *lib_settings = &rd_ctx->lib_settings;
 
     // Short options
-    static const char short_opts[] = "i:vuz";
+    static const char short_opts[] = "i:vuzgr:";
 
     enum {
         ARG_FRAME_THREADS = 256,
@@ -808,6 +139,8 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
         { "tilethreads",    1, NULL, ARG_TILE_THREADS },
         { "highquality",    0, NULL, ARG_HIGH_QUALITY },
         { "zerocopy",       0, NULL, 'z' },
+        { "gpugrain",       0, NULL, 'g' },
+        { "renderer",       1, NULL, 'r' }, // requires a name, like "r:" in short_opts
         { NULL,             0, NULL, 0 },
     };
 
@@ -824,15 +157,15 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
                 break;
             case ARG_HIGH_QUALITY:
                 settings->highquality = true;
-#ifndef HAVE_PLACEBO_VULKAN
-                fprintf(stderr, "warning: --highquality requires libplacebo\n");
-#endif
                 break;
             case 'z':
                 settings->zerocopy = true;
-#ifndef HAVE_PLACEBO_VULKAN
-                fprintf(stderr, "warning: --zerocopy requires libplacebo\n");
-#endif
+                break;
+            case 'g':
+                settings->gpugrain = true;
+                break;
+            case 'r':
+                settings->renderer_name = optarg;
                 break;
             case ARG_FRAME_THREADS:
                 lib_settings->n_frame_threads =
@@ -852,6 +185,8 @@ static void dp_rd_ctx_parse_args(Dav1dPlayRenderContext *rd_ctx,
             "Extra/unused arguments found, e.g. '%s'\n", argv[optind]);
     if (!settings->inputfile)
         dp_settings_print_usage(argv[0], "Input file (-i/--input) is required");
+    if (settings->renderer_name && strcmp(settings->renderer_name, "auto") == 0)
+        settings->renderer_name = NULL;
 }
 
 /**
@@ -861,7 +196,7 @@ static void dp_rd_ctx_destroy(Dav1dPlayRenderContext *rd_ctx)
 {
     assert(rd_ctx != NULL);
 
-    renderer_info.destroy_renderer(rd_ctx->rd_priv);
+    renderer_info->destroy_renderer(rd_ctx->rd_priv);
     dp_fifo_destroy(rd_ctx->fifo);
     SDL_DestroyMutex(rd_ctx->lock);
     free(rd_ctx);
@@ -873,7 +208,7 @@ static void dp_rd_ctx_destroy(Dav1dPlayRenderContext *rd_ctx)
  * \note  The Dav1dPlayRenderContext must be destroyed
  *        again by using dp_rd_ctx_destroy.
  */
-static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data)
+static Dav1dPlayRenderContext *dp_rd_ctx_create(int argc, char **argv)
 {
     Dav1dPlayRenderContext *rd_ctx;
 
@@ -907,7 +242,22 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data)
         return NULL;
     }
 
-    rd_ctx->rd_priv = renderer_info.create_renderer(rd_data);
+    // Parse and validate arguments
+    dav1d_default_settings(&rd_ctx->lib_settings);
+    memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
+    dp_rd_ctx_parse_args(rd_ctx, argc, argv);
+
+    // Select the renderer backend by name (NULL name means auto-select)
+    renderer_info = dp_get_renderer(rd_ctx->settings.renderer_name);
+
+    if (renderer_info == NULL) {
+        printf("No suitable renderer matching %s found.\n",
+            (rd_ctx->settings.renderer_name) ? rd_ctx->settings.renderer_name : "auto");
+    } else {
+        printf("Using %s renderer\n", renderer_info->name);
+    }
+
+    rd_ctx->rd_priv = (renderer_info) ? renderer_info->create_renderer() : NULL;
     if (rd_ctx->rd_priv == NULL) {
         SDL_DestroyMutex(rd_ctx->lock);
         dp_fifo_destroy(rd_ctx->fifo);
@@ -915,9 +265,6 @@ static Dav1dPlayRenderContext *dp_rd_ctx_create(void *rd_data)
         return NULL;
     }
 
-    dav1d_default_settings(&rd_ctx->lib_settings);
-    memset(&rd_ctx->settings, 0, sizeof(rd_ctx->settings));
-
     rd_ctx->last_pts = 0;
     rd_ctx->last_ticks = 0;
     rd_ctx->current_pts = 0;
@@ -949,7 +296,7 @@ static void dp_rd_ctx_post_event(Dav1dPlayRenderContext *rd_ctx, uint32_t code)
 static void dp_rd_ctx_update_with_dav1d_picture(Dav1dPlayRenderContext *rd_ctx,
     Dav1dPicture *dav1d_pic)
 {
-    renderer_info.update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
+    renderer_info->update_frame(rd_ctx->rd_priv, dav1d_pic, &rd_ctx->settings);
     rd_ctx->current_pts = dav1d_pic->m.timestamp;
 }
 
@@ -1004,7 +351,7 @@ static void dp_rd_ctx_render(Dav1dPlayRenderContext *rd_ctx)
         fprintf(stderr, "Frame displayed %f seconds too late\n", wait_time/(float)1000);
     }
 
-    renderer_info.render(rd_ctx->rd_priv, &rd_ctx->settings);
+    renderer_info->render(rd_ctx->rd_priv, &rd_ctx->settings);
 
     rd_ctx->last_ticks = SDL_GetTicks();
 }
@@ -1152,7 +499,6 @@ static int decoder_thread_main(void *cookie)
 int main(int argc, char **argv)
 {
     SDL_Thread *decoder_thread;
-    SDL_Window *win = NULL;
 
     // Check for version mismatch between library and tool
     const char *version = dav1d_version();
@@ -1166,34 +512,30 @@ int main(int argc, char **argv)
     if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_TIMER) < 0)
         return 10;
 
-    // Create Window and Renderer
-    int window_flags = SDL_WINDOW_SHOWN | SDL_WINDOW_ALLOW_HIGHDPI;
-#ifdef HAVE_PLACEBO_VULKAN
-    window_flags |= SDL_WINDOW_VULKAN;
-#endif
-    win = SDL_CreateWindow("Dav1dPlay", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED,
-        WINDOW_WIDTH, WINDOW_HEIGHT, window_flags);
-    SDL_SetWindowResizable(win, SDL_TRUE);
-
     // Create render context
-    Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(win);
+    Dav1dPlayRenderContext *rd_ctx = dp_rd_ctx_create(argc, argv);
     if (rd_ctx == NULL) {
         fprintf(stderr, "Failed creating render context\n");
         return 5;
     }
 
-    // Parse and validate arguments
-    dp_rd_ctx_parse_args(rd_ctx, argc, argv);
-
     if (rd_ctx->settings.zerocopy) {
-        if (renderer_info.alloc_pic) {
+        if (renderer_info->alloc_pic) {
             rd_ctx->lib_settings.allocator = (Dav1dPicAllocator) {
                 .cookie = rd_ctx->rd_priv,
-                .alloc_picture_callback = renderer_info.alloc_pic,
-                .release_picture_callback = renderer_info.release_pic,
+                .alloc_picture_callback = renderer_info->alloc_pic,
+                .release_picture_callback = renderer_info->release_pic,
             };
         } else {
-            fprintf(stderr, "--zerocopy unsupported by compiled renderer\n");
+            fprintf(stderr, "--zerocopy unsupported by selected renderer\n");
+        }
+    }
+
+    if (rd_ctx->settings.gpugrain) {
+        if (renderer_info->supports_gpu_grain) {
+            rd_ctx->lib_settings.apply_grain = 0;
+        } else {
+            fprintf(stderr, "--gpugrain unsupported by selected renderer\n");
         }
     }
 
@@ -1207,6 +549,10 @@ int main(int argc, char **argv)
         if (SDL_WaitEvent(&e)) {
             if (e.type == SDL_QUIT) {
                 dp_rd_ctx_request_shutdown(rd_ctx);
+            } else if (e.type == SDL_WINDOWEVENT) {
+                if (e.window.event == SDL_WINDOWEVENT_SIZE_CHANGED) {
+                    // TODO: Handle window resizes
+                }
             } else if (e.type == rd_ctx->renderer_event_type) {
                 if (e.user.code == DAV1D_EVENT_NEW_FRAME) {
                     // Dequeue frame and update the render context with it
@@ -1232,7 +578,6 @@ int main(int argc, char **argv)
     SDL_WaitThread(decoder_thread, &decoder_ret);
 
     dp_rd_ctx_destroy(rd_ctx);
-    SDL_DestroyWindow(win);
 
     return decoder_ret;
 }
diff --git a/ffmpeg/JNI/dav1d/examples/dp_fifo.c b/ffmpeg/JNI/dav1d/examples/dp_fifo.c
new file mode 100644
index 000000000..243d2e933
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/examples/dp_fifo.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <SDL.h>
+#include <assert.h>
+
+#include "dp_fifo.h"
+
+// Bounded FIFO of opaque pointers with blocking push/shift
+struct dp_fifo
+{
+    SDL_mutex *lock;        // held while count/entries are read or modified
+    SDL_cond *cond_change;  // signalled when the FIFO stops being empty or full
+    size_t capacity;        // maximum number of entries
+    size_t count;           // number of entries currently stored
+    void **entries;         // array of `capacity` slots, oldest entry at [0]
+};
+
+
+Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity)
+{
+    // A FIFO without any slot is rejected (and trips the assert in debug builds)
+    assert(capacity > 0);
+    if (capacity == 0)
+        return NULL;
+
+    Dav1dPlayPtrFifo *const q = malloc(sizeof(*q));
+    if (q == NULL)
+        return NULL;
+
+    q->capacity = capacity;
+    q->count = 0;
+
+    // Synchronisation primitives come first and are unwound by hand
+    q->lock = SDL_CreateMutex();
+    if (q->lock == NULL) {
+        free(q);
+        return NULL;
+    }
+
+    q->cond_change = SDL_CreateCond();
+    if (q->cond_change == NULL) {
+        SDL_DestroyMutex(q->lock);
+        free(q);
+        return NULL;
+    }
+
+    // From here on dp_fifo_destroy() releases everything in one go
+    q->entries = calloc(capacity, sizeof(void*));
+    if (q->entries != NULL)
+        return q;
+    dp_fifo_destroy(q);
+    return NULL;
+}
+
+// Tear down an empty FIFO and release its storage and SDL primitives
+void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo)
+{
+    assert(fifo->count == 0); // only the slot array is freed, not its elements
+    free(fifo->entries);
+    SDL_DestroyCond(fifo->cond_change);
+    SDL_DestroyMutex(fifo->lock);
+    free(fifo);
+}
+
+// Append an element to the FIFO, blocking for as long as it is full
+void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element)
+{
+    SDL_LockMutex(fifo->lock);
+    while (fifo->capacity == fifo->count)
+        SDL_CondWait(fifo->cond_change, fifo->lock);
+    fifo->entries[fifo->count] = element;
+    if (++fifo->count == 1) // FIFO was empty: wake a thread waiting for data
+        SDL_CondSignal(fifo->cond_change);
+    SDL_UnlockMutex(fifo->lock);
+}
+
+// Remove and return arr[0], moving the remaining len-1 entries down by one
+static void *dp_fifo_array_shift(void **arr, size_t len)
+{
+    void *const head = arr[0];
+    for (size_t dst = 0; dst + 1 < len; ++dst)
+        arr[dst] = arr[dst + 1];
+    return head;
+}
+
+// Pop the oldest element, blocking for as long as the FIFO is empty
+void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo)
+{
+    SDL_LockMutex(fifo->lock);
+    while (!fifo->count)
+        SDL_CondWait(fifo->cond_change, fifo->lock);
+    void *const oldest = dp_fifo_array_shift(fifo->entries, fifo->count);
+    if (--fifo->count == fifo->capacity - 1) // FIFO was full: wake a waiter
+        SDL_CondSignal(fifo->cond_change);
+    SDL_UnlockMutex(fifo->lock);
+    return oldest;
+}
+
+
diff --git a/ffmpeg/JNI/dav1d/builddir/include/dav1d/version.h b/ffmpeg/JNI/dav1d/examples/dp_fifo.h
similarity index 59%
rename from ffmpeg/JNI/dav1d/builddir/include/dav1d/version.h
rename to ffmpeg/JNI/dav1d/examples/dp_fifo.h
index 3caccad31..a94b089b2 100644
--- a/ffmpeg/JNI/dav1d/builddir/include/dav1d/version.h
+++ b/ffmpeg/JNI/dav1d/examples/dp_fifo.h
@@ -24,11 +24,38 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef DAV1D_VERSION_H
-#define DAV1D_VERSION_H
+/*
+ * Dav1dPlay FIFO helper
+ */
 
-#define DAV1D_API_VERSION_MAJOR 3
-#define DAV1D_API_VERSION_MINOR 0
-#define DAV1D_API_VERSION_PATCH 0
+typedef struct dp_fifo Dav1dPlayPtrFifo;
 
-#endif /* DAV1D_VERSION_H */
+/* Create a FIFO
+ *
+ * Creates a FIFO with the given capacity.
+ * If the capacity is reached, new inserts into the FIFO
+ * will block until enough space is available again.
+ */
+Dav1dPlayPtrFifo *dp_fifo_create(size_t capacity);
+
+/* Destroy a FIFO
+ *
+ * The FIFO must be empty before it is destroyed!
+ */
+void dp_fifo_destroy(Dav1dPlayPtrFifo *fifo);
+
+/* Shift FIFO
+ *
+ * Return the first item from the FIFO, thereby removing it from
+ * the FIFO and making room for new entries.
+ */
+void *dp_fifo_shift(Dav1dPlayPtrFifo *fifo);
+
+/* Push to FIFO
+ *
+ * Add an item to the end of the FIFO.
+ * If the FIFO is full, this call will block until there is again enough
+ * space in the FIFO, so calling this from the "consumer" thread if no
+ * other thread will call dp_fifo_shift will lead to a deadlock.
+ */
+void dp_fifo_push(Dav1dPlayPtrFifo *fifo, void *element);
diff --git a/ffmpeg/JNI/dav1d/examples/dp_renderer.h b/ffmpeg/JNI/dav1d/examples/dp_renderer.h
new file mode 100644
index 000000000..4c6f2954f
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/examples/dp_renderer.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <inttypes.h>
+#include <string.h>
+
+#include "dav1d/dav1d.h"
+
+#include <SDL.h>
+#ifdef HAVE_PLACEBO
+# include <libplacebo/config.h>
+#endif
+
+// Check libplacebo Vulkan rendering
+#if defined(HAVE_VULKAN) && defined(SDL_VIDEO_VULKAN)
+# if defined(PL_HAVE_VULKAN) && PL_HAVE_VULKAN
+#  define HAVE_RENDERER_PLACEBO
+#  define HAVE_PLACEBO_VULKAN
+# endif
+#endif
+
+// Check libplacebo OpenGL rendering
+#if defined(PL_HAVE_OPENGL) && PL_HAVE_OPENGL
+# define HAVE_RENDERER_PLACEBO
+# define HAVE_PLACEBO_OPENGL
+#endif
+
+/**
+ * Settings structure
+ * Holds all settings available for the player;
+ * it is usually filled in by parsing the
+ * command-line arguments.
+ */
+typedef struct {
+    const char *inputfile;     // NOTE(review): presumably the input file path - confirm
+    const char *renderer_name; // name given to dp_get_renderer(); NULL selects automatically
+    int highquality;           // non-zero: renderer may use its higher-quality parameters
+    int untimed;               // NOTE(review): presumably disables frame pacing - confirm
+    int zerocopy;              // non-zero: decode into renderer-allocated pictures if supported
+    int gpugrain;              // non-zero: let the renderer apply film grain if supported
+} Dav1dPlaySettings;
+
+#define WINDOW_WIDTH  910
+#define WINDOW_HEIGHT 512
+
+#define DAV1D_EVENT_NEW_FRAME 1
+#define DAV1D_EVENT_DEC_QUIT  2
+
+/**
+ * Renderer info: name, capabilities and callbacks of one renderer backend
+ */
+typedef struct rdr_info
+{
+    // Renderer name; entries with a NULL name are skipped by dp_get_renderer()
+    const char *name;
+    // Cookie passed to the renderer implementation callbacks
+    void *cookie;
+    // Callback to create the renderer; returns its private context or NULL
+    void* (*create_renderer)(void);
+    // Callback to destroy the renderer
+    void (*destroy_renderer)(void *cookie);
+    // Callback to the render function that renders a previously sent frame
+    void (*render)(void *cookie, const Dav1dPlaySettings *settings);
+    // Callback to the send frame function
+    int (*update_frame)(void *cookie, Dav1dPicture *dav1d_pic,
+                        const Dav1dPlaySettings *settings);
+    // Callbacks to alloc/release pictures (optional, alloc_pic may be NULL)
+    int (*alloc_pic)(Dav1dPicture *pic, void *cookie);
+    void (*release_pic)(Dav1dPicture *pic, void *cookie);
+    // Whether or not this renderer can apply on-GPU film grain synthesis
+    int supports_gpu_grain;
+} Dav1dPlayRenderInfo;
+
+extern const Dav1dPlayRenderInfo rdr_placebo_vk;
+extern const Dav1dPlayRenderInfo rdr_placebo_gl;
+extern const Dav1dPlayRenderInfo rdr_sdl;
+
+// Available renderers ordered by priority (dp_get_renderer() picks the first match)
+static const Dav1dPlayRenderInfo* const dp_renderers[] = {
+    &rdr_placebo_vk,
+    &rdr_placebo_gl,
+    &rdr_sdl,
+};
+
+static inline const Dav1dPlayRenderInfo *dp_get_renderer(const char *name)
+{
+    // A NULL name means "auto": the first usable entry in priority order wins
+    const size_t n_renderers = sizeof(dp_renderers) / sizeof(dp_renderers[0]);
+
+    for (size_t idx = 0; idx < n_renderers; idx++) {
+        const Dav1dPlayRenderInfo *const cand = dp_renderers[idx];
+        // Entries without a name are never selected
+        if (cand->name != NULL && (name == NULL || !strcmp(name, cand->name)))
+            return cand;
+    }
+    return NULL;
+}
+
+static inline SDL_Window *dp_create_sdl_window(int window_flags)
+{
+    // Create the player window (NULL on failure): shown, high-DPI aware, resizable
+    window_flags |= SDL_WINDOW_SHOWN | SDL_WINDOW_ALLOW_HIGHDPI;
+
+    SDL_Window *win = SDL_CreateWindow("Dav1dPlay", SDL_WINDOWPOS_CENTERED,
+        SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, window_flags);
+    if (win != NULL) // never hand a failed (NULL) window to further SDL calls
+        SDL_SetWindowResizable(win, SDL_TRUE);
+    return win;
+}
diff --git a/ffmpeg/JNI/dav1d/examples/dp_renderer_placebo.c b/ffmpeg/JNI/dav1d/examples/dp_renderer_placebo.c
new file mode 100644
index 000000000..beb1d42ad
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/examples/dp_renderer_placebo.c
@@ -0,0 +1,723 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "dp_renderer.h"
+
+#ifdef HAVE_RENDERER_PLACEBO
+#include <assert.h>
+
+#include <libplacebo/renderer.h>
+#include <libplacebo/utils/upload.h>
+
+#ifdef HAVE_PLACEBO_VULKAN
+# include <libplacebo/vulkan.h>
+# include <SDL_vulkan.h>
+#endif
+#ifdef HAVE_PLACEBO_OPENGL
+# include <libplacebo/opengl.h>
+# include <SDL_opengl.h>
+#endif
+
+
+/**
+ * Private renderer context shared by the libplacebo Vulkan and OpenGL backends
+ */
+typedef struct renderer_priv_ctx
+{
+    // SDL window, destroyed again in placebo_renderer_destroy()
+    SDL_Window *win;
+    // Placebo context
+    struct pl_context *ctx;
+    // Placebo renderer, created lazily by the first placebo_render() call
+    struct pl_renderer *renderer;
+#ifdef HAVE_PLACEBO_VULKAN
+    // Placebo Vulkan handle (stays NULL unless the Vulkan backend was created)
+    const struct pl_vulkan *vk;
+    // Placebo Vulkan instance
+    const struct pl_vk_inst *vk_inst;
+    // Vulkan surface created from the SDL window
+    VkSurfaceKHR surf;
+#endif
+#ifdef HAVE_PLACEBO_OPENGL
+    // Placebo OpenGL handle (stays NULL unless the OpenGL backend was created)
+    const struct pl_opengl *gl;
+#endif
+    // Placebo GPU, taken from vk->gpu or gl->gpu by the create functions
+    const struct pl_gpu *gpu;
+    // Placebo swapchain
+    const struct pl_swapchain *swapchain;
+    // Lock serialising image upload, rendering and picture allocation
+    SDL_mutex *lock;
+    // Image to render, and planes backing them
+    struct pl_image image;
+    const struct pl_tex *plane_tex[3];
+} Dav1dPlayRendererPrivateContext;
+
+// Shared first stage of both backends: window, private context, libplacebo context, lock
+static Dav1dPlayRendererPrivateContext*
+    placebo_renderer_create_common(int window_flags)
+{
+    // Create Window
+    SDL_Window *sdlwin = dp_create_sdl_window(window_flags | SDL_WINDOW_RESIZABLE);
+    if (sdlwin == NULL)
+        return NULL;
+
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(*rd_priv_ctx));
+    if (rd_priv_ctx == NULL)
+        goto fail;
+    *rd_priv_ctx = (Dav1dPlayRendererPrivateContext) { .win = sdlwin };
+
+    // Init libplacebo
+    rd_priv_ctx->ctx = pl_context_create(PL_API_VER, &(struct pl_context_params) {
+        .log_cb     = pl_log_color,
+#ifndef NDEBUG
+        .log_level  = PL_LOG_DEBUG,
+#else
+        .log_level  = PL_LOG_WARN,
+#endif
+    });
+    if (rd_priv_ctx->ctx == NULL)
+        goto fail;
+
+    // Create Mutex
+    rd_priv_ctx->lock = SDL_CreateMutex();
+    if (rd_priv_ctx->lock == NULL) {
+        fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
+        pl_context_destroy(&(rd_priv_ctx->ctx));
+        goto fail;
+    }
+
+    return rd_priv_ctx;
+
+fail:
+    // Give back the window and the allocation; free(NULL) is a no-op
+    SDL_DestroyWindow(sdlwin);
+    free(rd_priv_ctx);
+    return NULL;
+}
+
+#ifdef HAVE_PLACEBO_OPENGL
+// Create the OpenGL-backed renderer; NULL if common init fails, exit() on GL errors
+static void *placebo_renderer_create_gl()
+{
+    SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
+
+    // Common init
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx =
+        placebo_renderer_create_common(SDL_WINDOW_OPENGL);
+    if (rd_priv_ctx == NULL)
+        return NULL;
+    SDL_Window *sdlwin = rd_priv_ctx->win;
+
+    // Init OpenGL
+    struct pl_opengl_params params = pl_opengl_default_params;
+# ifndef NDEBUG
+    params.debug = true;
+# endif
+
+    // Check context creation instead of carrying on with a NULL context
+    SDL_GLContext glcontext = SDL_GL_CreateContext(sdlwin);
+    if (glcontext == NULL || SDL_GL_MakeCurrent(sdlwin, glcontext) < 0) {
+        fprintf(stderr, "Failed creating opengl context: %s\n", SDL_GetError());
+        exit(2);
+    }
+
+    rd_priv_ctx->gl = pl_opengl_create(rd_priv_ctx->ctx, &params);
+    if (!rd_priv_ctx->gl) {
+        fprintf(stderr, "Failed creating opengl device!\n");
+        exit(2);
+    }
+
+    rd_priv_ctx->swapchain = pl_opengl_create_swapchain(rd_priv_ctx->gl,
+        &(struct pl_opengl_swapchain_params) {
+            .swap_buffers = (void (*)(void *)) SDL_GL_SwapWindow,
+            .priv = sdlwin,
+        });
+    if (!rd_priv_ctx->swapchain) {
+        fprintf(stderr, "Failed creating opengl swapchain!\n");
+        exit(2);
+    }
+
+    int w = WINDOW_WIDTH, h = WINDOW_HEIGHT;
+    SDL_GL_GetDrawableSize(sdlwin, &w, &h);
+    if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) {
+        fprintf(stderr, "Failed resizing opengl swapchain!\n");
+        exit(2);
+    }
+
+    rd_priv_ctx->gpu = rd_priv_ctx->gl->gpu;
+    if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT)
+        printf("Note: window dimensions differ (got %dx%d)\n", w, h);
+
+    return rd_priv_ctx;
+}
+#endif
+
+#ifdef HAVE_PLACEBO_VULKAN
+// Create the Vulkan-backed renderer; NULL if common init fails, exit() on Vulkan errors
+static void *placebo_renderer_create_vk()
+{
+    // Common init
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx =
+        placebo_renderer_create_common(SDL_WINDOW_VULKAN);
+
+    if (rd_priv_ctx == NULL)
+        return NULL;
+    SDL_Window *sdlwin = rd_priv_ctx->win;
+
+    // Init Vulkan
+    unsigned num = 0;
+    if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, NULL)) {
+        fprintf(stderr, "Failed enumerating Vulkan extensions: %s\n", SDL_GetError());
+        exit(1);
+    }
+
+    // Runtime check: assert() compiles out under NDEBUG and calloc(0, ..) may return NULL
+    const char **extensions = calloc(num, sizeof(*extensions));
+    if (extensions == NULL && num > 0) {
+        fprintf(stderr, "Failed allocating Vulkan extension list\n");
+        exit(1);
+    }
+
+    if (!SDL_Vulkan_GetInstanceExtensions(sdlwin, &num, extensions)) {
+        fprintf(stderr, "Failed getting Vk instance extensions\n");
+        exit(1);
+    }
+
+    if (num > 0) {
+        printf("Requesting %u additional Vulkan extensions:\n", num);
+        for (unsigned i = 0; i < num; i++)
+            printf("    %s\n", extensions[i]);
+    }
+
+    struct pl_vk_inst_params iparams = pl_vk_inst_default_params;
+    iparams.extensions = extensions;
+    iparams.num_extensions = num;
+
+    rd_priv_ctx->vk_inst = pl_vk_inst_create(rd_priv_ctx->ctx, &iparams);
+    if (!rd_priv_ctx->vk_inst) {
+        fprintf(stderr, "Failed creating Vulkan instance!\n");
+        exit(1);
+    }
+    free(extensions);
+
+    if (!SDL_Vulkan_CreateSurface(sdlwin, rd_priv_ctx->vk_inst->instance, &rd_priv_ctx->surf)) {
+        fprintf(stderr, "Failed creating vulkan surface: %s\n", SDL_GetError());
+        exit(1);
+    }
+
+    struct pl_vulkan_params params = pl_vulkan_default_params;
+    params.instance = rd_priv_ctx->vk_inst->instance;
+    params.surface = rd_priv_ctx->surf;
+    params.allow_software = true;
+
+    rd_priv_ctx->vk = pl_vulkan_create(rd_priv_ctx->ctx, &params);
+    if (!rd_priv_ctx->vk) {
+        fprintf(stderr, "Failed creating vulkan device!\n");
+        exit(2);
+    }
+
+    // Create swapchain
+    rd_priv_ctx->swapchain = pl_vulkan_create_swapchain(rd_priv_ctx->vk,
+        &(struct pl_vulkan_swapchain_params) {
+            .surface = rd_priv_ctx->surf,
+            .present_mode = VK_PRESENT_MODE_IMMEDIATE_KHR,
+        });
+    if (!rd_priv_ctx->swapchain) {
+        fprintf(stderr, "Failed creating vulkan swapchain!\n");
+        exit(2);
+    }
+
+    int w = WINDOW_WIDTH, h = WINDOW_HEIGHT;
+    if (!pl_swapchain_resize(rd_priv_ctx->swapchain, &w, &h)) {
+        fprintf(stderr, "Failed resizing vulkan swapchain!\n");
+        exit(2);
+    }
+
+    rd_priv_ctx->gpu = rd_priv_ctx->vk->gpu;
+    if (w != WINDOW_WIDTH || h != WINDOW_HEIGHT)
+        printf("Note: window dimensions differ (got %dx%d)\n", w, h);
+
+    return rd_priv_ctx;
+}
+#endif
+
+// Tear down a renderer context. NOTE(review): the struct itself is not freed here - confirm its owner
+static void placebo_renderer_destroy(void *cookie)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+
+    pl_renderer_destroy(&(rd_priv_ctx->renderer));
+    pl_swapchain_destroy(&(rd_priv_ctx->swapchain));
+    for (int i = 0; i < 3; i++)
+        pl_tex_destroy(rd_priv_ctx->gpu, &(rd_priv_ctx->plane_tex[i]));
+#ifdef HAVE_PLACEBO_VULKAN
+    if (rd_priv_ctx->vk) {
+        pl_vulkan_destroy(&(rd_priv_ctx->vk));
+        vkDestroySurfaceKHR(rd_priv_ctx->vk_inst->instance, rd_priv_ctx->surf, NULL);
+        pl_vk_inst_destroy(&(rd_priv_ctx->vk_inst));
+    }
+#endif
+#ifdef HAVE_PLACEBO_OPENGL
+    if (rd_priv_ctx->gl)
+        pl_opengl_destroy(&(rd_priv_ctx->gl));
+#endif
+    SDL_DestroyWindow(rd_priv_ctx->win);
+    pl_context_destroy(&(rd_priv_ctx->ctx));
+    // The lock was created in placebo_renderer_create_common() and is owned by this context
+    SDL_DestroyMutex(rd_priv_ctx->lock);
+}
+
+// Draw the most recently uploaded image into the next swapchain frame
+static void placebo_render(void *cookie, const Dav1dPlaySettings *settings)
+{
+    Dav1dPlayRendererPrivateContext *const priv = cookie;
+    assert(priv != NULL);
+
+    SDL_LockMutex(priv->lock);
+
+    // Nothing has been uploaded yet, or the last upload failed
+    if (priv->image.num_planes == 0)
+        goto unlock;
+
+    // The renderer is created lazily on the first frame
+    if (priv->renderer == NULL)
+        priv->renderer = pl_renderer_create(priv->ctx, priv->gpu);
+
+    struct pl_swapchain_frame sw_frame;
+    if (!pl_swapchain_start_frame(priv->swapchain, &sw_frame))
+        goto unlock;
+
+    // libplacebo's default parameters only on request, all-zero otherwise
+    struct pl_render_params rparams = {0};
+    if (settings->highquality)
+        rparams = pl_render_default_params;
+
+    struct pl_render_target target;
+    pl_render_target_from_swapchain(&target, &sw_frame);
+    target.profile = (struct pl_icc_profile) {
+        .data = NULL,
+        .len = 0,
+    };
+
+    // NOTE(review): presumably fits dst_rect to the image aspect ratio - confirm
+#if PL_API_VER >= 66
+    pl_rect2df_aspect_copy(&target.dst_rect, &priv->image.src_rect, 0.0);
+    if (pl_render_target_partial(&target))
+        pl_tex_clear(priv->gpu, target.fbo, (float[4]){ 0.0 });
+#endif
+
+    // A failed render clears the target to { 1.0, 0, 0, 0 } and is still submitted
+    if (!pl_render_image(priv->renderer, &priv->image, &target, &rparams)) {
+        fprintf(stderr, "Failed rendering frame!\n");
+        pl_tex_clear(priv->gpu, target.fbo, (float[4]){ 1.0 });
+    }
+
+    if (!pl_swapchain_submit_frame(priv->swapchain)) {
+        fprintf(stderr, "Failed submitting frame!\n");
+        goto unlock;
+    }
+
+    pl_swapchain_swap_buffers(priv->swapchain);
+
+unlock:
+    SDL_UnlockMutex(priv->lock);
+}
+
+// Describe dav1d_pic as a pl_image and upload its planes; returns 0 on success, 1 if an upload failed
+static int placebo_upload_image(void *cookie, Dav1dPicture *dav1d_pic,
+                                const Dav1dPlaySettings *settings)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+    SDL_LockMutex(rd_priv_ctx->lock);
+
+    if (dav1d_pic == NULL) { // nothing to upload; the previous image is kept
+        SDL_UnlockMutex(rd_priv_ctx->lock);
+        return 0;
+    }
+
+    int width = dav1d_pic->p.w;
+    int height = dav1d_pic->p.h;
+    int sub_x = 0, sub_y = 0;
+    int bytes = (dav1d_pic->p.bpc + 7) / 8; // bytes per sample, rounded up
+    enum pl_chroma_location chroma_loc = PL_CHROMA_UNKNOWN;
+
+    struct pl_image *image = &rd_priv_ctx->image;
+    *image = (struct pl_image) {
+        .num_planes = 3,
+        .width      = width,
+        .height     = height,
+        .src_rect   = {0, 0, width, height},
+
+        .repr = {
+            .bits = {
+                .sample_depth = bytes * 8,
+                .color_depth = dav1d_pic->p.bpc,
+            },
+        },
+    };
+
+    // Derive the plane count and the chroma subsampling shifts from the layout
+    switch (dav1d_pic->p.layout) {
+    case DAV1D_PIXEL_LAYOUT_I400:
+        image->num_planes = 1;
+        break;
+    case DAV1D_PIXEL_LAYOUT_I420:
+        sub_x = sub_y = 1;
+        break;
+    case DAV1D_PIXEL_LAYOUT_I422:
+        sub_x = 1;
+        break;
+    case DAV1D_PIXEL_LAYOUT_I444:
+        break;
+    }
+
+    // Translate the sequence header's color metadata into libplacebo's enums
+    switch (dav1d_pic->seq_hdr->pri) {
+    case DAV1D_COLOR_PRI_UNKNOWN:   image->color.primaries = PL_COLOR_PRIM_UNKNOWN; break;
+    case DAV1D_COLOR_PRI_BT709:     image->color.primaries = PL_COLOR_PRIM_BT_709; break;
+    case DAV1D_COLOR_PRI_BT470M:    image->color.primaries = PL_COLOR_PRIM_BT_470M; break;
+    case DAV1D_COLOR_PRI_BT470BG:   image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
+    case DAV1D_COLOR_PRI_BT601:     image->color.primaries = PL_COLOR_PRIM_BT_601_625; break;
+    case DAV1D_COLOR_PRI_BT2020:    image->color.primaries = PL_COLOR_PRIM_BT_2020; break;
+
+    case DAV1D_COLOR_PRI_XYZ:
+        // Handled below, in the DAV1D_MC_IDENTITY case of the matrix switch
+        assert(dav1d_pic->seq_hdr->mtrx == DAV1D_MC_IDENTITY);
+        break;
+
+    default:
+        printf("warning: unknown dav1d color primaries %d.. ignoring, picture "
+               "may be very incorrect\n", dav1d_pic->seq_hdr->pri);
+        break;
+    }
+
+    switch (dav1d_pic->seq_hdr->trc) {
+    case DAV1D_TRC_BT709:
+    case DAV1D_TRC_BT470M:
+    case DAV1D_TRC_BT470BG:
+    case DAV1D_TRC_BT601:
+    case DAV1D_TRC_SMPTE240:
+    case DAV1D_TRC_BT2020_10BIT:
+    case DAV1D_TRC_BT2020_12BIT:
+        // These all map to the effective "SDR" CRT-based EOTF, BT.1886
+        image->color.transfer = PL_COLOR_TRC_BT_1886;
+        break;
+
+    case DAV1D_TRC_UNKNOWN:     image->color.transfer = PL_COLOR_TRC_UNKNOWN; break;
+    case DAV1D_TRC_LINEAR:      image->color.transfer = PL_COLOR_TRC_LINEAR; break;
+    case DAV1D_TRC_SRGB:        image->color.transfer = PL_COLOR_TRC_SRGB; break;
+    case DAV1D_TRC_SMPTE2084:   image->color.transfer = PL_COLOR_TRC_PQ; break;
+    case DAV1D_TRC_HLG:         image->color.transfer = PL_COLOR_TRC_HLG; break;
+
+    default:
+        printf("warning: unknown dav1d color transfer %d.. ignoring, picture "
+               "may be very incorrect\n", dav1d_pic->seq_hdr->trc);
+        break;
+    }
+
+    switch (dav1d_pic->seq_hdr->mtrx) {
+    case DAV1D_MC_IDENTITY:
+        // This is going to be either RGB or XYZ
+        if (dav1d_pic->seq_hdr->pri == DAV1D_COLOR_PRI_XYZ) {
+            image->repr.sys = PL_COLOR_SYSTEM_XYZ;
+        } else {
+            image->repr.sys = PL_COLOR_SYSTEM_RGB;
+        }
+        break;
+
+    case DAV1D_MC_UNKNOWN:
+        // PL_COLOR_SYSTEM_UNKNOWN maps to RGB, so guess a YCbCr system from the size instead
+        image->repr.sys = pl_color_system_guess_ycbcr(width, height);
+        break;
+
+    case DAV1D_MC_BT709:        image->repr.sys = PL_COLOR_SYSTEM_BT_709; break;
+    case DAV1D_MC_BT601:        image->repr.sys = PL_COLOR_SYSTEM_BT_601; break;
+    case DAV1D_MC_SMPTE240:     image->repr.sys = PL_COLOR_SYSTEM_SMPTE_240M; break;
+    case DAV1D_MC_SMPTE_YCGCO:  image->repr.sys = PL_COLOR_SYSTEM_YCGCO; break;
+    case DAV1D_MC_BT2020_NCL:   image->repr.sys = PL_COLOR_SYSTEM_BT_2020_NC; break;
+    case DAV1D_MC_BT2020_CL:    image->repr.sys = PL_COLOR_SYSTEM_BT_2020_C; break;
+
+    case DAV1D_MC_ICTCP:
+        // This one is split up based on the actual HDR curve in use
+        if (dav1d_pic->seq_hdr->trc == DAV1D_TRC_HLG) {
+            image->repr.sys = PL_COLOR_SYSTEM_BT_2100_HLG;
+        } else {
+            image->repr.sys = PL_COLOR_SYSTEM_BT_2100_PQ;
+        }
+        break;
+
+    default:
+        printf("warning: unknown dav1d color matrix %d.. ignoring, picture "
+               "may be very incorrect\n", dav1d_pic->seq_hdr->mtrx);
+        break;
+    }
+
+    if (dav1d_pic->seq_hdr->color_range) {
+        image->repr.levels = PL_COLOR_LEVELS_PC;
+    } else {
+        image->repr.levels = PL_COLOR_LEVELS_TV;
+    }
+
+    switch (dav1d_pic->seq_hdr->chr) {
+    case DAV1D_CHR_UNKNOWN:     chroma_loc = PL_CHROMA_UNKNOWN; break;
+    case DAV1D_CHR_VERTICAL:    chroma_loc = PL_CHROMA_LEFT; break;
+    case DAV1D_CHR_COLOCATED:   chroma_loc = PL_CHROMA_TOP_LEFT; break;
+    }
+
+#if PL_API_VER >= 63
+    if (settings->gpugrain && dav1d_pic->frame_hdr->film_grain.present) {
+        Dav1dFilmGrainData *src = &dav1d_pic->frame_hdr->film_grain.data;
+        struct pl_av1_grain_data *dst = &image->av1_grain;
+        *dst = (struct pl_av1_grain_data) {
+            .grain_seed     = src->seed,
+            .num_points_y   = src->num_y_points,
+            .chroma_scaling_from_luma = src->chroma_scaling_from_luma,
+            .num_points_uv  = { src->num_uv_points[0], src->num_uv_points[1] },
+            .scaling_shift  = src->scaling_shift,
+            .ar_coeff_lag   = src->ar_coeff_lag,
+            .ar_coeff_shift = src->ar_coeff_shift,
+            .grain_scale_shift = src->grain_scale_shift,
+            .uv_mult        = { src->uv_mult[0], src->uv_mult[1] },
+            .uv_mult_luma   = { src->uv_luma_mult[0], src->uv_luma_mult[1] },
+            .uv_offset      = { src->uv_offset[0], src->uv_offset[1] },
+            .overlap        = src->overlap_flag,
+        };
+
+        assert(sizeof(dst->points_y) == sizeof(src->y_points));
+        assert(sizeof(dst->points_uv) == sizeof(src->uv_points));
+        assert(sizeof(dst->ar_coeffs_y) == sizeof(src->ar_coeffs_y));
+        memcpy(dst->points_y, src->y_points, sizeof(src->y_points));
+        memcpy(dst->points_uv, src->uv_points, sizeof(src->uv_points));
+        memcpy(dst->ar_coeffs_y, src->ar_coeffs_y, sizeof(src->ar_coeffs_y));
+
+        // this one has different row sizes for alignment, so copy element-wise
+        for (int c = 0; c < 2; c++) {
+            for (int i = 0; i < 25; i++)
+                dst->ar_coeffs_uv[c][i] = src->ar_coeffs_uv[c][i];
+        }
+    }
+#endif
+
+    // Describe the planes to upload; U and V both use stride[1]
+    struct pl_plane_data data[3] = {
+        {
+            // Y plane
+            .type           = PL_FMT_UNORM,
+            .width          = width,
+            .height         = height,
+            .pixel_stride   = bytes,
+            .row_stride     = dav1d_pic->stride[0],
+            .component_size = {bytes * 8},
+            .component_map  = {0},
+        }, {
+            // U plane
+            .type           = PL_FMT_UNORM,
+            .width          = width >> sub_x,
+            .height         = height >> sub_y,
+            .pixel_stride   = bytes,
+            .row_stride     = dav1d_pic->stride[1],
+            .component_size = {bytes * 8},
+            .component_map  = {1},
+        }, {
+            // V plane
+            .type           = PL_FMT_UNORM,
+            .width          = width >> sub_x,
+            .height         = height >> sub_y,
+            .pixel_stride   = bytes,
+            .row_stride     = dav1d_pic->stride[1],
+            .component_size = {bytes * 8},
+            .component_map  = {2},
+        },
+    };
+
+    bool ok = true;
+
+    for (int i = 0; i < image->num_planes; i++) {
+        if (settings->zerocopy) {
+            const struct pl_buf *buf = dav1d_pic->allocator_data;
+            assert(buf);
+            data[i].buf = buf;
+            data[i].buf_offset = (uintptr_t) dav1d_pic->data[i] - (uintptr_t) buf->data; // plane offset inside buf
+        } else {
+            data[i].pixels = dav1d_pic->data[i];
+        }
+
+        ok &= pl_upload_plane(rd_priv_ctx->gpu, &image->planes[i], &rd_priv_ctx->plane_tex[i], &data[i]);
+    }
+
+    // Apply the correct chroma plane shift. This has to be done after pl_upload_plane
+#if PL_API_VER >= 67
+    pl_image_set_chroma_location(image, chroma_loc);
+#else
+    pl_chroma_location_offset(chroma_loc, &image->planes[1].shift_x, &image->planes[1].shift_y);
+    pl_chroma_location_offset(chroma_loc, &image->planes[2].shift_x, &image->planes[2].shift_y);
+#endif
+
+    if (!ok) {
+        fprintf(stderr, "Failed uploading planes!\n");
+        *image = (struct pl_image) {0}; // num_planes == 0 makes placebo_render() skip drawing
+    }
+
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+    return !ok;
+}
+
+// Round x up to the next multiple of align, which must be a power of 2
+#define ALIGN2(x, align) (((x) + (align) - 1) & ~((align) - 1))
+
+// Picture allocator callback for dav1d: backs the picture with a host-mapped
+// libplacebo transfer buffer (kept in p->allocator_data) and fills in
+// p->stride and p->data. Returns 0 on success, DAV1D_ERR(ENOMEM) otherwise.
+static int placebo_alloc_pic(Dav1dPicture *const p, void *cookie)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+    SDL_LockMutex(rd_priv_ctx->lock);
+
+    const struct pl_gpu *gpu = rd_priv_ctx->gpu;
+    int ret = DAV1D_ERR(ENOMEM);
+
+    // Copied from dav1d_default_picture_alloc
+    const int hbd = p->p.bpc > 8;
+    const int aligned_w = ALIGN2(p->p.w, 128);
+    const int aligned_h = ALIGN2(p->p.h, 128);
+    const int has_chroma = p->p.layout != DAV1D_PIXEL_LAYOUT_I400;
+    const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
+    const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
+    p->stride[0] = aligned_w << hbd;
+    p->stride[1] = has_chroma ? (aligned_w >> ss_hor) << hbd : 0;
+
+    // Align strides up to multiples of the GPU performance hints
+    p->stride[0] = ALIGN2(p->stride[0], gpu->limits.align_tex_xfer_stride);
+    p->stride[1] = ALIGN2(p->stride[1], gpu->limits.align_tex_xfer_stride);
+
+    // Aligning offsets to 4 also implicitly aligns to the texel size (1 or 2)
+    size_t off_align = ALIGN2(gpu->limits.align_tex_xfer_offset, 4);
+    const size_t y_sz = ALIGN2(p->stride[0] * aligned_h, off_align);
+    const size_t uv_sz = ALIGN2(p->stride[1] * (aligned_h >> ss_ver), off_align);
+
+    // The extra DAV1D_PICTURE_ALIGNMENTs are to brute force plane alignment,
+    // even in the case that the driver gives us insane alignments
+    const size_t pic_size = y_sz + 2 * uv_sz;
+    const size_t total_size = pic_size + DAV1D_PICTURE_ALIGNMENT * 4;
+
+    // Validate size limitations
+    if (total_size > gpu->limits.max_xfer_size) {
+        fprintf(stderr, "alloc of %zu bytes exceeds limits\n", total_size);
+        goto err;
+    }
+
+    const struct pl_buf *buf = pl_buf_create(gpu, &(struct pl_buf_params) {
+        .type = PL_BUF_TEX_TRANSFER,
+        .host_mapped = true,
+        .size = total_size,
+        .memory_type = PL_BUF_MEM_HOST,
+        .user_data = p,
+    });
+    if (!buf) {
+        fprintf(stderr, "alloc of GPU mapped buffer failed\n");
+        goto err;
+    }
+
+    assert(buf->data);
+    uintptr_t base = (uintptr_t) buf->data, data[3];
+    data[0] = ALIGN2(base, DAV1D_PICTURE_ALIGNMENT);
+    data[1] = ALIGN2(data[0] + y_sz, DAV1D_PICTURE_ALIGNMENT);
+    data[2] = ALIGN2(data[1] + uv_sz, DAV1D_PICTURE_ALIGNMENT);
+
+    // Sanity check offset alignment for the sake of debugging
+    if (data[0] - base != ALIGN2(data[0] - base, off_align) ||
+        data[1] - base != ALIGN2(data[1] - base, off_align) ||
+        data[2] - base != ALIGN2(data[2] - base, off_align)) {
+        fprintf(stderr, "GPU buffer horribly misaligned, expect slowdown!\n");
+    }
+
+    p->allocator_data = (void *) buf;
+    for (int i = 0; i < 3; i++)
+        p->data[i] = (void *) data[i];
+    ret = 0;
+
+    // fall through: the success path also has to release the lock
+err:
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+    return ret;
+}
+
+// Picture release callback: destroys the pl_buf kept in pic->allocator_data
+// while holding the renderer lock.
+static void placebo_release_pic(Dav1dPicture *pic, void *cookie)
+{
+    Dav1dPlayRendererPrivateContext *const ctx = cookie;
+    assert(ctx != NULL);
+    assert(pic->allocator_data);
+    SDL_LockMutex(ctx->lock);
+    pl_buf_destroy(ctx->gpu, (const struct pl_buf **) &pic->allocator_data);
+    SDL_UnlockMutex(ctx->lock);
+}
+
+#ifdef HAVE_PLACEBO_VULKAN
+const Dav1dPlayRenderInfo rdr_placebo_vk = {
+    .name = "placebo-vk",
+    .create_renderer = placebo_renderer_create_vk,
+    .destroy_renderer = placebo_renderer_destroy,
+    .render = placebo_render,
+    .update_frame = placebo_upload_image,
+    .alloc_pic = placebo_alloc_pic,
+    .release_pic = placebo_release_pic,
+    // supports_gpu_grain is only set when building against PL_API_VER >= 63
+# if PL_API_VER >= 63
+    .supports_gpu_grain = 1,
+# endif
+};
+#else // Vulkan backend not built: placeholder keeps the symbol defined
+const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
+#endif
+
+#ifdef HAVE_PLACEBO_OPENGL
+const Dav1dPlayRenderInfo rdr_placebo_gl = {
+    .name = "placebo-gl",
+    .create_renderer = placebo_renderer_create_gl,
+    .destroy_renderer = placebo_renderer_destroy,
+    .render = placebo_render,
+    .update_frame = placebo_upload_image,
+    .alloc_pic = placebo_alloc_pic,
+    .release_pic = placebo_release_pic,
+    // supports_gpu_grain is only set when building against PL_API_VER >= 63
+# if PL_API_VER >= 63
+    .supports_gpu_grain = 1,
+# endif
+};
+#else // OpenGL backend not built: placeholder keeps the symbol defined
+const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL };
+#endif
+
+#else
+const Dav1dPlayRenderInfo rdr_placebo_vk = { NULL };
+const Dav1dPlayRenderInfo rdr_placebo_gl = { NULL };
+#endif
diff --git a/ffmpeg/JNI/dav1d/examples/dp_renderer_sdl.c b/ffmpeg/JNI/dav1d/examples/dp_renderer_sdl.c
new file mode 100644
index 000000000..078d61349
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/examples/dp_renderer_sdl.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright © 2020, VideoLAN and dav1d authors
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "dp_renderer.h"
+
+#include <assert.h>
+
+/**
+ * Private state of the plain SDL renderer; handed to every callback as `cookie`
+ */
+typedef struct renderer_priv_ctx
+{
+    // Window obtained from dp_create_sdl_window()
+    SDL_Window *win;
+    // SDL renderer created for win
+    SDL_Renderer *renderer;
+    // Lock protecting access to the texture
+    SDL_mutex *lock;
+    // Texture holding the frame to render; NULL while there is none
+    SDL_Texture *tex;
+} Dav1dPlayRendererPrivateContext;
+
+// Creates the context of the plain SDL backend. If any step fails, everything
+// created so far (the window included) is released and NULL is returned.
+static void *sdl_renderer_create(void)
+{
+    SDL_Window *win = dp_create_sdl_window(0);
+    if (win == NULL)
+        return NULL;
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = malloc(sizeof(*rd_priv_ctx));
+    if (rd_priv_ctx == NULL)
+        goto fail_win;
+    rd_priv_ctx->win = win;
+    rd_priv_ctx->tex = NULL;
+    rd_priv_ctx->renderer = SDL_CreateRenderer(win, -1, SDL_RENDERER_ACCELERATED);
+    if (rd_priv_ctx->renderer == NULL) {
+        fprintf(stderr, "SDL_CreateRenderer failed: %s\n", SDL_GetError());
+        goto fail_ctx;
+    }
+    SDL_SetHint(SDL_HINT_RENDER_SCALE_QUALITY, "linear");
+    rd_priv_ctx->lock = SDL_CreateMutex();
+    if (rd_priv_ctx->lock != NULL)
+        return rd_priv_ctx;
+
+    fprintf(stderr, "SDL_CreateMutex failed: %s\n", SDL_GetError());
+    SDL_DestroyRenderer(rd_priv_ctx->renderer);
+fail_ctx:
+    free(rd_priv_ctx);
+fail_win:
+    SDL_DestroyWindow(win);
+    return NULL;
+}
+
+static void sdl_renderer_destroy(void *cookie)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+    SDL_DestroyRenderer(rd_priv_ctx->renderer); // also frees its textures
+    SDL_DestroyWindow(rd_priv_ctx->win);        // the context owns the window
+    SDL_DestroyMutex(rd_priv_ctx->lock);
+    free(rd_priv_ctx);
+}
+
+// Presents the current texture, if there is one, over the whole render
+// target. The texture is only read while the context lock is held;
+// `settings` is not used by this backend.
+static void sdl_render(void *cookie, const Dav1dPlaySettings *settings)
+{
+    Dav1dPlayRendererPrivateContext *const ctx = cookie;
+    assert(ctx != NULL);
+
+    SDL_LockMutex(ctx->lock);
+    SDL_Texture *const frame = ctx->tex;
+    if (frame != NULL) {
+        // Display the frame
+        SDL_Renderer *const out = ctx->renderer;
+        SDL_RenderClear(out);
+        SDL_RenderCopy(out, frame, NULL, NULL);
+        SDL_RenderPresent(out);
+    }
+    SDL_UnlockMutex(ctx->lock);
+}
+
+// Uploads a decoded picture into the streaming texture drawn by sdl_render(),
+// recreating it when the frame size changes; a NULL picture frees the texture.
+// Returns 0 on success, -1 if the texture could not be created or updated.
+static int sdl_update_texture(void *cookie, Dav1dPicture *dav1d_pic,
+                              const Dav1dPlaySettings *settings)
+{
+    Dav1dPlayRendererPrivateContext *rd_priv_ctx = cookie;
+    assert(rd_priv_ctx != NULL);
+    int ret = 0;
+
+    SDL_LockMutex(rd_priv_ctx->lock);
+    SDL_Texture *texture = rd_priv_ctx->tex;
+    if (dav1d_pic == NULL) {
+        if (texture != NULL)
+            SDL_DestroyTexture(texture); // free it, don't just drop the pointer
+        texture = NULL;
+        goto done;
+    }
+
+    if (DAV1D_PIXEL_LAYOUT_I420 != dav1d_pic->p.layout || dav1d_pic->p.bpc != 8) {
+        fprintf(stderr, "Unsupported pixel format, only 8bit 420 supported so far.\n");
+        exit(50);
+    }
+
+    const int width = dav1d_pic->p.w, height = dav1d_pic->p.h;
+    if (texture != NULL) {
+        int tex_w = width, tex_h = height;
+        SDL_QueryTexture(texture, NULL, NULL, &tex_w, &tex_h);
+        if (tex_w != width || tex_h != height) {
+            SDL_DestroyTexture(texture);
+            texture = NULL;
+        }
+    }
+    if (texture == NULL)
+        texture = SDL_CreateTexture(rd_priv_ctx->renderer, SDL_PIXELFORMAT_IYUV,
+            SDL_TEXTUREACCESS_STREAMING, width, height);
+
+    if (texture == NULL || SDL_UpdateYUVTexture(texture, NULL,
+            dav1d_pic->data[0], (int)dav1d_pic->stride[0], // Y
+            dav1d_pic->data[1], (int)dav1d_pic->stride[1], // U
+            dav1d_pic->data[2], (int)dav1d_pic->stride[1]  // V
+            ) != 0) {
+        fprintf(stderr, "SDL texture update failed: %s\n", SDL_GetError());
+        ret = -1;
+    }
+done:
+    rd_priv_ctx->tex = texture;
+    SDL_UnlockMutex(rd_priv_ctx->lock);
+    return ret;
+}
+
+const Dav1dPlayRenderInfo rdr_sdl = { // callback table of the plain SDL renderer
+    .name = "sdl",
+    .create_renderer = sdl_renderer_create,
+    .destroy_renderer = sdl_renderer_destroy,
+    .render = sdl_render,
+    .update_frame = sdl_update_texture // alloc_pic/release_pic are left unset
+};
diff --git a/ffmpeg/JNI/dav1d/examples/meson.build b/ffmpeg/JNI/dav1d/examples/meson.build
index bad1d902e..50e097a8d 100644
--- a/ffmpeg/JNI/dav1d/examples/meson.build
+++ b/ffmpeg/JNI/dav1d/examples/meson.build
@@ -35,28 +35,40 @@ endif
 # dav1d player sources
 dav1dplay_sources = files(
     'dav1dplay.c',
+    'dp_fifo.c',
+    'dp_renderer_placebo.c',
+    'dp_renderer_sdl.c',
 )
 
 sdl2_dependency = dependency('sdl2', version: '>= 2.0.1', required: true)
 
 if sdl2_dependency.found()
+    dav1dplay_deps = [sdl2_dependency]
+    dav1dplay_cflags = []
+
     placebo_dependency = dependency('libplacebo', version: '>= 1.18.0', required: false)
-    vulkan_dependency = dependency('vulkan', required: false)
-    sdl_has_vulkan = cc.has_header('SDL_vulkan.h', dependencies: [sdl2_dependency])
-    cflag_placebo = []
-    deps_placebo = []
-    if placebo_dependency.found() and vulkan_dependency.found() and sdl_has_vulkan
-        cflag_placebo += '-DHAVE_PLACEBO_VULKAN=1'
-        deps_placebo = [vulkan_dependency, placebo_dependency]
+
+    if placebo_dependency.found()
+        dav1dplay_deps += placebo_dependency
+        dav1dplay_cflags += '-DHAVE_PLACEBO'
+
+        # If libplacebo is found, we might be able to use Vulkan
+        # with it, in which case we need the Vulkan library too.
+        vulkan_dependency = dependency('vulkan', required: false)
+        if vulkan_dependency.found()
+            dav1dplay_deps += vulkan_dependency
+            dav1dplay_cflags += '-DHAVE_VULKAN'
+        endif
     endif
+
     dav1dplay = executable('dav1dplay',
         dav1dplay_sources,
         rev_target,
 
         link_with : [libdav1d, dav1d_input_objs],
         include_directories : [dav1d_inc_dirs],
-        dependencies : [getopt_dependency, sdl2_dependency, deps_placebo],
+        dependencies : [getopt_dependency, dav1dplay_deps],
         install : true,
-        c_args : cflag_placebo,
+        c_args : dav1dplay_cflags,
     )
 endif
diff --git a/ffmpeg/JNI/dav1d/gcovr.cfg b/ffmpeg/JNI/dav1d/gcovr.cfg
new file mode 100644
index 000000000..f768de8a6
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/gcovr.cfg
@@ -0,0 +1,3 @@
+exclude = .*/tests/.*
+exclude = .*/tools/.*
+exclude = .*/include/common/dump.h
diff --git a/ffmpeg/JNI/dav1d/include/common/attributes.h b/ffmpeg/JNI/dav1d/include/common/attributes.h
index d5c4ce50b..0683b5044 100644
--- a/ffmpeg/JNI/dav1d/include/common/attributes.h
+++ b/ffmpeg/JNI/dav1d/include/common/attributes.h
@@ -159,4 +159,8 @@ static inline int clzll(const unsigned long long mask) {
 }
 #endif /* !_MSC_VER */
 
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
 #endif /* DAV1D_COMMON_ATTRIBUTES_H */
diff --git a/ffmpeg/JNI/dav1d/include/common/mem.h b/ffmpeg/JNI/dav1d/include/common/mem.h
index a633b2ae9..74cdaf23a 100644
--- a/ffmpeg/JNI/dav1d/include/common/mem.h
+++ b/ffmpeg/JNI/dav1d/include/common/mem.h
@@ -37,13 +37,13 @@
 #include "common/attributes.h"
 
 /*
- * Allocate 32-byte aligned memory. The return value can be released
- * by calling the standard free() function.
+ * Allocate align-byte aligned memory. The return value can be released
+ * by calling the dav1d_free_aligned() function.
  */
 static inline void *dav1d_alloc_aligned(size_t sz, size_t align) {
+    assert(!(align & (align - 1)));
 #ifdef HAVE_POSIX_MEMALIGN
     void *ptr;
-    assert(!(align & (align - 1)));
     if (posix_memalign(&ptr, align, sz)) return NULL;
     return ptr;
 #elif defined(HAVE_ALIGNED_MALLOC)
diff --git a/ffmpeg/JNI/dav1d/meson.build b/ffmpeg/JNI/dav1d/meson.build
index 730229bb6..d5366f9a7 100644
--- a/ffmpeg/JNI/dav1d/meson.build
+++ b/ffmpeg/JNI/dav1d/meson.build
@@ -1,4 +1,4 @@
-# Copyright © 2018-2019, VideoLAN and dav1d authors
+# Copyright © 2018-2020, VideoLAN and dav1d authors
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -23,14 +23,14 @@
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 project('dav1d', ['c'],
-    version: '0.6.0',
+    version: '0.7.1',
     default_options: ['c_std=c99',
                       'warning_level=2',
                       'buildtype=release',
                       'b_ndebug=if-release'],
     meson_version: '>= 0.47.0')
 
-dav1d_soname_version       = '4.0.0'
+dav1d_soname_version       = '4.0.2'
 dav1d_api_version_array    = dav1d_soname_version.split('.')
 dav1d_api_version_major    = dav1d_api_version_array[0]
 dav1d_api_version_minor    = dav1d_api_version_array[1]
@@ -196,10 +196,10 @@ else
     getopt_dependency = []
 endif
 
-if cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
-    cdata.set('HAVE_POSIX_MEMALIGN', 1)
-elif cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args)
+if cc.has_function('_aligned_malloc', prefix : '#include <malloc.h>', args : test_args)
     cdata.set('HAVE_ALIGNED_MALLOC', 1)
+elif cc.has_function('posix_memalign', prefix : '#include <stdlib.h>', args : test_args)
+    cdata.set('HAVE_POSIX_MEMALIGN', 1)
 elif cc.has_function('memalign', prefix : '#include <malloc.h>', args : test_args)
     cdata.set('HAVE_MEMALIGN', 1)
 endif
@@ -362,20 +362,11 @@ if cc.symbols_have_underscore_prefix()
     cdata_asm.set10('PREFIX', true)
 endif
 
-# Generate config.h
-config_h_target = configure_file(output: 'config.h', configuration: cdata)
-
-
-
 #
 # ASM specific stuff
 #
 if is_asm_enabled and host_machine.cpu_family().startswith('x86')
 
-    # Generate config.asm
-    config_asm_target = configure_file(output: 'config.asm', output_format: 'nasm', configuration: cdata_asm)
-
-
     # NASM compiler support
 
     nasm = find_program('nasm')
@@ -390,14 +381,22 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
 
         out = nasm_r.stdout().strip().split()
         if out[1].to_lower() == 'version'
-            if out[2].version_compare('<2.14')
-                error('nasm 2.14 or later is required, found nasm @0@'.format(out[2]))
+            if out[2].version_compare('<2.13.02')
+                error('nasm 2.13.02 or later is required, found nasm @0@'.format(out[2]))
+            elif out[2].version_compare('<2.14') and get_option('enable_avx512')
+                error('nasm 2.14 or later is required for AVX-512 asm.\n' +
+                       'AVX-512 asm can be disabled with \'-Denable_avx512=false\'')
             endif
+            cdata.set10('HAVE_AVX512ICL', get_option('enable_avx512'))
+            cdata_asm.set10('HAVE_AVX512ICL', get_option('enable_avx512'))
         else
             error('unexpected nasm version string: @0@'.format(nasm_r.stdout()))
         endif
     endif
 
+    # Generate config.asm
+    config_asm_target = configure_file(output: 'config.asm', output_format: 'nasm', configuration: cdata_asm)
+
     if host_machine.system() == 'windows'
         nasm_format = 'win'
     elif host_machine.system() == 'darwin'
@@ -416,7 +415,7 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
         depfile: '@BASENAME@.obj.ndep',
         arguments: [
             '-f', nasm_format,
-            '-I', '@0@/src/'.format(meson.current_source_dir()),
+            '-I', '@0@/src/'.format(dav1d_src_root),
             '-I', '@0@/'.format(meson.current_build_dir()),
             '-MQ', '@OUTPUT@', '-MF', '@DEPFILE@',
             '@EXTRA_ARGS@',
@@ -426,6 +425,10 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
 endif
 
 
+# Generate config.h
+config_h_target = configure_file(output: 'config.h', configuration: cdata)
+
+
 
 #
 # Include subdir meson.build files
diff --git a/ffmpeg/JNI/dav1d/meson_options.txt b/ffmpeg/JNI/dav1d/meson_options.txt
index cdd27c2df..37bd08433 100644
--- a/ffmpeg/JNI/dav1d/meson_options.txt
+++ b/ffmpeg/JNI/dav1d/meson_options.txt
@@ -10,6 +10,11 @@ option('enable_asm',
     value: true,
     description: 'Build asm files, if available')
 
+option('enable_avx512',
+    type: 'boolean',
+    value: true,
+    description: 'Build AVX-512 asm files, requires nasm 2.14')
+
 option('enable_tools',
     type: 'boolean',
     value: true,
diff --git a/ffmpeg/JNI/dav1d/package/crossfiles/aarch64-android.meson b/ffmpeg/JNI/dav1d/package/crossfiles/aarch64-android.meson
new file mode 100644
index 000000000..a25ea4325
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/package/crossfiles/aarch64-android.meson
@@ -0,0 +1,16 @@
+[binaries]
+c = 'aarch64-linux-android21-clang'
+cpp = 'aarch64-linux-android21-clang++'
+ar = 'aarch64-linux-android-ar'
+strip = 'aarch64-linux-android-strip'
+pkgconfig = 'pkg-config'
+windres = 'aarch64-linux-android-windres'
+
+[properties]
+needs_exe_wrapper = true
+
+[host_machine]
+system = 'android'
+cpu_family = 'aarch64'
+endian = 'little'
+cpu = 'aarch64'
diff --git a/ffmpeg/JNI/dav1d/package/crossfiles/arm-android.meson b/ffmpeg/JNI/dav1d/package/crossfiles/arm-android.meson
new file mode 100644
index 000000000..dd07d98ea
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/package/crossfiles/arm-android.meson
@@ -0,0 +1,16 @@
+[binaries]
+c = 'armv7a-linux-androideabi16-clang'
+cpp = 'armv7a-linux-androideabi16-clang++'
+ar = 'arm-linux-androideabi-ar'
+strip = 'arm-linux-androideabi-strip'
+pkgconfig = 'pkg-config'
+windres = 'arm-linux-androideabi-windres'
+
+[properties]
+needs_exe_wrapper = true
+
+[host_machine]
+system = 'android'
+cpu_family = 'arm'
+endian = 'little'
+cpu = 'arm'
diff --git a/ffmpeg/JNI/dav1d/src/arm/32/ipred.S b/ffmpeg/JNI/dav1d/src/arm/32/ipred.S
index f26e55f77..d850a0cef 100644
--- a/ffmpeg/JNI/dav1d/src/arm/32/ipred.S
+++ b/ffmpeg/JNI/dav1d/src/arm/32/ipred.S
@@ -29,11 +29,11 @@
 #include "src/arm/asm.S"
 #include "util.S"
 
-// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
-//                        const pixel *const topleft,
-//                        const int width, const int height, const int a,
-//                        const int max_width, const int max_height);
-function ipred_dc_128_neon, export=1
+// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_dc_128_8bpc_neon, export=1
         push            {r4, lr}
         ldr             r4,  [sp, #8]
         clz             r3,  r3
@@ -107,11 +107,11 @@ L(ipred_dc_128_tbl):
         pop             {r4, pc}
 endfunc
 
-// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
-//                   const pixel *const topleft,
-//                   const int width, const int height, const int a,
-//                   const int max_width, const int max_height);
-function ipred_v_neon, export=1
+// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                        const pixel *const topleft,
+//                        const int width, const int height, const int a,
+//                        const int max_width, const int max_height);
+function ipred_v_8bpc_neon, export=1
         push            {r4, lr}
         ldr             lr,  [sp, #8]
         clz             r3,  r3
@@ -189,11 +189,11 @@ L(ipred_v_tbl):
         pop             {r4, pc}
 endfunc
 
-// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
-//                   const pixel *const topleft,
-//                   const int width, const int height, const int a,
-//                   const int max_width, const int max_height);
-function ipred_h_neon, export=1
+// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                        const pixel *const topleft,
+//                        const int width, const int height, const int a,
+//                        const int max_width, const int max_height);
+function ipred_h_8bpc_neon, export=1
         push            {r4-r5, lr}
         ldr             r4,  [sp, #12]
         clz             r3,  r3
@@ -297,11 +297,11 @@ L(ipred_h_tbl):
         pop             {r4-r5, pc}
 endfunc
 
-// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
-//                        const pixel *const topleft,
-//                        const int width, const int height, const int a,
-//                        const int max_width, const int max_height);
-function ipred_dc_top_neon, export=1
+// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_dc_top_8bpc_neon, export=1
         push            {r4-r5, lr}
         ldr             r4,  [sp, #12]
         clz             r3,  r3
@@ -418,11 +418,11 @@ L(ipred_dc_top_tbl):
         pop             {r4-r5, pc}
 endfunc
 
-// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
-//                         const pixel *const topleft,
-//                         const int width, const int height, const int a,
-//                         const int max_width, const int max_height);
-function ipred_dc_left_neon, export=1
+// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height);
+function ipred_dc_left_8bpc_neon, export=1
         push            {r4-r5, lr}
         ldr             r4,  [sp, #12]
         sub             r2,  r2,  r4
@@ -556,11 +556,11 @@ L(ipred_dc_left_w64):
         pop             {r4-r5, pc}
 endfunc
 
-// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
-//                    const pixel *const topleft,
-//                    const int width, const int height, const int a,
-//                    const int max_width, const int max_height);
-function ipred_dc_neon, export=1
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const pixel *const topleft,
+//                         const int width, const int height, const int a,
+//                         const int max_width, const int max_height);
+function ipred_dc_8bpc_neon, export=1
         push            {r4-r6, lr}
         ldr             r4,  [sp, #16]
         sub             r2,  r2,  r4
@@ -765,10 +765,6 @@ L(ipred_dc_h64):
         vpadd.u16       d0,  d0
         bx              r3
 L(ipred_dc_w64):
-        vmov.8          q1,  q0
-        vmov.8          q2,  q0
-        vmov.8          q3,  q0
-2:
         add             r2,  r2,  #1
         vld1.8          {d2,  d3,  d4,  d5},  [r2]!
         vadd.s16        d0,  d0,  d30
diff --git a/ffmpeg/JNI/dav1d/src/arm/32/itx.S b/ffmpeg/JNI/dav1d/src/arm/32/itx.S
new file mode 100644
index 000000000..867eb194d
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/src/arm/32/itx.S
@@ -0,0 +1,3386 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob);
+
+// Most of the functions use the following register layout:
+// r0-r3   external parameters
+// r4      function pointer to first transform
+// r5      function pointer to second transform
+// r6      output parameter for helper function
+// r7      input parameter for helper function
+// r8      input stride for helper function
+// r9      scratch variable for helper functions
+// r10-r11 pointer to list of eob thresholds, eob threshold value,
+//         scratch variables within helper functions (backed up)
+
+// The SIMD registers most often use the following layout:
+// d0-d3   multiplication coefficients
+// d4-d7   scratch registers
+// d8-d15  unused in some transforms, used for scratch registers in others
+// d16-d31 inputs/outputs of transforms
+
+// Potential further optimizations, that are left unimplemented for now:
+// - Trying to keep multiplication coefficients in registers across multiple
+//   transform functions. (The register layout is designed to potentially
+//   allow this.)
+// - Use a simplified version of the transforms themselves for cases where
+//   we know a significant number of inputs are zero. E.g. if the eob value
+//   indicates only a quarter of input values are set, for idct16 and up,
+//   a significant amount of calculation can be skipped, at the cost of more
+//   code duplication and special casing.
+
+const idct_coeffs, align=4  // 16-bit multipliers for the inverse DCTs; products are narrowed with a rounding shift by 12
+        // idct4 (d0 once the table is loaded by the 4- and 8-point transforms)
+        .short          2896, 2896*8, 1567, 3784
+        // idct8 (d1 when the first 16 bytes are loaded as q0)
+        .short          799, 4017, 3406, 2276
+        // idct16
+        .short          401, 4076, 3166, 2598
+        .short          1931, 3612, 3920, 1189
+        // idct32
+        .short          201, 4091, 3035, 2751
+        .short          1751, 3703, 3857, 1380
+        .short          995, 3973, 3513, 2106
+        .short          2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4  // four groups of 12 values; presumably for a 64-point inverse DCT (no user is visible in this part of the file)
+        .short          101*8, 4095*8, 2967*8, -2824*8
+        .short          1660*8, 3745*8, 3822*8, -1474*8
+        .short          4076, 401, 4017, 799
+
+        .short          4036*8, -700*8, 2359*8, 3349*8
+        .short          3461*8, -2191*8, 897*8, 3996*8
+        .short          -3166, -2598, -799, -4017
+
+        .short          501*8, 4065*8, 3229*8, -2520*8
+        .short          2019*8, 3564*8, 3948*8, -1092*8
+        .short          3612, 1931, 2276, 3406
+
+        .short          4085*8, -301*8, 2675*8, 3102*8
+        .short          3659*8, -1842*8, 1285*8, 3889*8
+        .short          -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4  // multipliers for the 4-point ADST macros, loaded as d0-d1
+        // The last two 16-bit entries (3344, 0) land in d1 and are read as the 32-bit scalar d1[0] by vmul.s32
+        .short          1321, 3803, 2482, 3344, 3344, 0
+endconst
+
+const iadst8_coeffs, align=4  // multipliers for the 8-point ADST; the first 24 bytes are loaded as d0-d2
+        .short          4076, 401, 3612, 1931
+        .short          2598, 3166, 1189, 3920
+        // Values shared with idct_coeffs; the first four land in d2 (d2[0], d2[2], d2[3] are the lanes used)
+        .short          2896, 0, 1567, 3784, 0, 0, 0, 0
+endconst
+
+const iadst16_coeffs, align=4  // presumably multipliers for a 16-point ADST (no user is visible in this part of the file)
+        .short          4091, 201, 3973, 995
+        .short          3703, 1751, 3290, 2440
+        .short          2751, 3035, 2106, 3513
+        .short          1380, 3857, 601, 4052
+endconst
+
+.macro vmull_vmlal d0, s0, s1, c0, c1  // d0 (4 x s32) = s0 * c0 + s1 * c1, with s0/s1 being 4 x s16 and c0/c1 scalars
+        vmull.s16       \d0, \s0, \c0  // widening multiply starts the 32-bit accumulator
+        vmlal.s16       \d0, \s1, \c1  // widening multiply-accumulate adds the second product
+.endm
+
+.macro vmull_vmlal_8h d0, d1, s0, s1, s2, s3, c0, c1  // 8-lane variant: d0 = s0 * c0 + s2 * c1, d1 = s1 * c0 + s3 * c1
+        vmull.s16       \d0, \s0, \c0  // s0/s1 are the two halves of the first input vector
+        vmlal.s16       \d0, \s2, \c1  // s2/s3 are the two halves of the second input vector
+        vmull.s16       \d1, \s1, \c0
+        vmlal.s16       \d1, \s3, \c1
+.endm
+
+.macro vmull_vmlsl d0, s0, s1, c0, c1  // d0 (4 x s32) = s0 * c0 - s1 * c1, with s0/s1 being 4 x s16 and c0/c1 scalars
+        vmull.s16       \d0, \s0, \c0  // widening multiply starts the 32-bit accumulator
+        vmlsl.s16       \d0, \s1, \c1  // widening multiply-subtract removes the second product
+.endm
+
+.macro vmull_vmlsl_8h d0, d1, s0, s1, s2, s3, c0, c1  // 8-lane variant: d0 = s0 * c0 - s2 * c1, d1 = s1 * c0 - s3 * c1
+        vmull.s16       \d0, \s0, \c0  // s0/s1 are the two halves of the first input vector
+        vmlsl.s16       \d0, \s2, \c1  // s2/s3 are the two halves of the second input vector
+        vmull.s16       \d1, \s1, \c0
+        vmlsl.s16       \d1, \s3, \c1
+.endm
+
+.macro vrshrn_8h d0, d1, s0, s1, shift  // narrow two 4 x s32 vectors (s0, s1) to two 4 x s16 halves (d0, d1)
+        vrshrn.i32      \d0, \s0, \shift  // rounding right shift, keeping the low 16 bits of each lane
+        vrshrn.i32      \d1, \s1, \shift
+.endm
+
+.macro scale_input c, r0, r1, r2, r3, r4, r5, r6, r7  // multiply 2, 4 or 8 vectors in place by the scalar c
+        vqrdmulh.s16    \r0, \r0, \c  // saturating rounding doubling multiply, keeping the high half
+        vqrdmulh.s16    \r1, \r1, \c
+.ifnb \r2  // r2 and r3 are optional and must be supplied together
+        vqrdmulh.s16    \r2, \r2, \c
+        vqrdmulh.s16    \r3, \r3, \c
+.endif
+.ifnb \r4  // r4-r7 are optional and must be supplied together
+        vqrdmulh.s16    \r4, \r4, \c
+        vqrdmulh.s16    \r5, \r5, \c
+        vqrdmulh.s16    \r6, \r6, \c
+        vqrdmulh.s16    \r7, \r7, \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src, shiftbits=4  // one slot of an 8 pixel wide add-to-destination pipeline; each stage is skipped when its argument is blank
+.ifnb \load
+        vld1.8          {\load},  [\src, :64], r1  // fetch 8 destination pixels, advance src by the stride in r1
+.endif
+.ifnb \shift
+        vrshr.s16       \shift,  \shift,  #\shiftbits  // final rounding shift of a residual row
+.endif
+.ifnb \addsrc
+        vaddw.u8        \adddst, \adddst, \addsrc  // residual (s16) += pixels (u8, widened)
+.endif
+.ifnb \narrowsrc
+        vqmovun.s16     \narrowdst, \narrowsrc  // saturate the sum to unsigned 8 bit
+.endif
+.ifnb \store
+        vst1.8          {\store},  [\dst, :64], r1  // write 8 pixels back, advance dst by the stride in r1
+.endif
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4  // add the 8 residual rows in q8-q15 to an 8x8 pixel block at dst
+        mov             \src, \dst  // separate read pointer, since loads run ahead of the stores
+        load_add_store  d2,  q8,    ,    ,    ,    ,    , \dst, \src, \shiftbits  // stages of different rows are interleaved below
+        load_add_store  d3,  q9,    ,    ,    ,    ,    , \dst, \src, \shiftbits
+        load_add_store  d4,  q10, d2,  q8,    ,    ,    , \dst, \src, \shiftbits
+        load_add_store  d5,  q11, d3,  q9,  q8,  d2,    , \dst, \src, \shiftbits
+        load_add_store  d6,  q12, d4,  q10, q9,  d3,  d2, \dst, \src, \shiftbits
+        load_add_store  d7,  q13, d5,  q11, q10, d4,  d3, \dst, \src, \shiftbits
+        load_add_store  d2,  q14, d6,  q12, q11, d5,  d4, \dst, \src, \shiftbits  // d2 and d3 are reused once their first rows are stored
+        load_add_store  d3,  q15, d7,  q13, q12, d6,  d5, \dst, \src, \shiftbits
+        load_add_store    ,     , d2,  q14, q13, d7,  d6, \dst, \src, \shiftbits  // remaining slots drain the pipeline
+        load_add_store    ,     , d3,  q15, q14, d2,  d7, \dst, \src, \shiftbits
+        load_add_store    ,     ,   ,     , q15, d3,  d2, \dst, \src, \shiftbits
+        load_add_store    ,     ,   ,     ,    ,   ,  d3, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src  // add the 4 residual rows in q8-q11 to an 8 wide, 4 high pixel block at dst (default shift of 4)
+        mov             \src, \dst  // separate read pointer, since loads run ahead of the stores
+        load_add_store  d2,  q8,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store  d3,  q9,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store  d4,  q10, d2,  q8,    ,    ,    ,  \dst, \src
+        load_add_store  d5,  q11, d3,  q9,  q8,  d2,    ,  \dst, \src
+        load_add_store    ,     , d4,  q10, q9,  d3,  d2,  \dst, \src  // remaining slots drain the pipeline
+        load_add_store    ,     , d5,  q11, q10, d4,  d3,  \dst, \src
+        load_add_store    ,     ,   ,     , q11, d5,  d4,  \dst, \src
+        load_add_store    ,     ,   ,     ,    ,   ,  d5,  \dst, \src
+.endm
+.macro load_add_store4 load, shift, addsrc, adddst, narrowsrc, narrowdst, store, dst, src  // 4 pixel wide pipeline slot; each d register carries two rows (lane 0 and lane 1)
+.ifnb \load
+        vld1.32         {\load[0]},  [\src, :32], r1  // first of two 4 pixel rows
+.endif
+.ifnb \shift
+        vrshr.s16       \shift,  \shift,  #4  // final rounding shift, fixed at 4 bits here
+.endif
+.ifnb \load
+        vld1.32         {\load[1]},  [\src, :32], r1  // second row into the upper lane
+.endif
+.ifnb \addsrc
+        vaddw.u8        \adddst, \adddst, \addsrc  // residual (s16) += pixels (u8, widened)
+.endif
+.ifnb \store
+        vst1.32         {\store[0]},  [\dst, :32], r1  // first finished row
+.endif
+.ifnb \narrowsrc
+        vqmovun.s16     \narrowdst, \narrowsrc  // saturate the sum to unsigned 8 bit
+.endif
+.ifnb \store
+        vst1.32         {\store[1]},  [\dst, :32], r1  // second finished row
+.endif
+.endm
+.macro load_add_store_4x16 dst, src  // add the residual in q8-q15 (two 4 pixel rows per register) to a 4 wide, 16 high block at dst
+        mov             \src, \dst  // separate read pointer, since loads run ahead of the stores
+        load_add_store4 d0,    ,    ,    ,    ,    ,    ,  \dst, \src  // stages of different row pairs are interleaved below
+        load_add_store4 d1,  q8,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store4 d2,  q9,  d0,  q8,    ,    ,    ,  \dst, \src
+        load_add_store4 d3,  q10, d1,  q9,  q8,  d0,    ,  \dst, \src
+        load_add_store4 d4,  q11, d2,  q10, q9,  d1,  d0,  \dst, \src
+        load_add_store4 d5,  q12, d3,  q11, q10, d2,  d1,  \dst, \src
+        load_add_store4 d6,  q13, d4,  q12, q11, d3,  d2,  \dst, \src
+        load_add_store4 d7,  q14, d5,  q13, q12, d4,  d3,  \dst, \src
+        load_add_store4   ,  q15, d6,  q14, q13, d5,  d4,  \dst, \src  // remaining slots drain the pipeline
+        load_add_store4   ,     , d7,  q15, q14, d6,  d5,  \dst, \src
+        load_add_store4   ,     ,   ,     , q15, d7,  d6,  \dst, \src
+        load_add_store4   ,     ,   ,     ,    ,   ,  d7,  \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src  // add the residual in q8-q11 (two 4 pixel rows per register) to a 4 wide, 8 high block at dst
+        mov             \src, \dst  // separate read pointer, since loads run ahead of the stores
+        load_add_store4 d0,    ,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store4 d1,  q8,    ,    ,    ,    ,    ,  \dst, \src
+        load_add_store4 d2,  q9,  d0,  q8,    ,    ,    ,  \dst, \src
+        load_add_store4 d3,  q10, d1,  q9,  q8,  d0,    ,  \dst, \src
+        load_add_store4   ,  q11, d2,  q10, q9,  d1,  d0,  \dst, \src  // remaining slots drain the pipeline
+        load_add_store4   ,     , d3,  q11, q10, d2,  d1,  \dst, \src
+        load_add_store4   ,     ,   ,     , q11, d3,  d2,  \dst, \src
+        load_add_store4   ,     ,   ,     ,    ,   ,  d3,  \dst, \src
+.endm
+
+.macro idct_dc w, h, shift  // DC-only shortcut for a w x h block: taken when r3 (eob) is 0, otherwise falls through to label 1
+        cmp             r3,  #0
+        bne             1f  // eob != 0: continue with the full transform after this macro
+        vmov.i16        d30, #0
+        movw            r12, #2896*8  // 2896*8 with vqrdmulh gives a multiply by 2896/4096
+        vld1.16         {d16[]},  [r2, :16]  // broadcast the first coefficient to all lanes of d16
+        vdup.16         d0,  r12
+        vqrdmulh.s16    d16, d16, d0[0]  // first scaling pass
+        vst1.16         {d30[0]}, [r2, :16]  // clear the consumed coefficient in memory
+.if (\w == 2*\h) || (2*\w == \h)
+        vqrdmulh.s16    d16, d16, d0[0]  // extra scaling for blocks with a 2:1 aspect ratio
+.endif
+.if \shift > 0
+        vrshr.s16       d16, d16, #\shift  // optional intermediate rounding shift
+.endif
+        vqrdmulh.s16    d20, d16, d0[0]  // second scaling pass
+        mov             r3,  #\h  // r3 becomes the row counter for the store loop
+        vrshr.s16       d16, d20, #4  // final rounding shift; q8 = d16:d17 holds the DC value in every lane
+        vrshr.s16       d17, d20, #4
+        b               idct_dc_w\w\()_neon  // tail call; the helper returns to the original caller via lr
+1:
+.endm
+
+function idct_dc_w4_neon  // add q8 to a 4 pixel wide column at r0 (stride r1), r3 rows, 4 rows per iteration
+1:
+        vld1.32         {d0[0]}, [r0, :32], r1  // two rows per d register
+        vld1.32         {d0[1]}, [r0, :32], r1
+        vld1.32         {d1[0]}, [r0, :32], r1
+        vld1.32         {d1[1]}, [r0, :32], r1
+        subs            r3,  r3,  #4  // flags are consumed by the bgt at the end of the loop
+        sub             r0,  r0,  r1, lsl #2  // rewind r0 by 4 rows for the stores
+        vaddw.u8        q10, q8,  d0
+        vqmovun.s16     d0,  q10  // saturate to unsigned 8 bit
+        vaddw.u8        q11, q8,  d1
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vqmovun.s16     d1,  q11
+        vst1.32         {d0[1]}, [r0, :32], r1
+        vst1.32         {d1[0]}, [r0, :32], r1
+        vst1.32         {d1[1]}, [r0, :32], r1
+        bgt             1b
+        bx              lr
+endfunc
+
+function idct_dc_w8_neon  // add q8 to an 8 pixel wide column at r0 (stride r1), r3 rows, 4 rows per iteration
+1:
+        vld1.8          {d0}, [r0, :64], r1
+        vld1.8          {d1}, [r0, :64], r1
+        vld1.8          {d2}, [r0, :64], r1
+        vaddw.u8        q10, q8,  d0
+        vld1.8          {d3}, [r0, :64], r1
+        sub             r0,  r0,  r1, lsl #2  // rewind r0 by 4 rows for the stores
+        subs            r3,  r3,  #4  // flags are consumed by the bgt at the end of the loop
+        vaddw.u8        q11, q8,  d1
+        vqmovun.s16     d0,  q10  // saturate to unsigned 8 bit
+        vaddw.u8        q12, q8,  d2
+        vqmovun.s16     d1,  q11
+        vaddw.u8        q13, q8,  d3
+        vst1.8          {d0}, [r0, :64], r1
+        vqmovun.s16     d2,  q12
+        vst1.8          {d1}, [r0, :64], r1
+        vqmovun.s16     d3,  q13
+        vst1.8          {d2}, [r0, :64], r1
+        vst1.8          {d3}, [r0, :64], r1
+        bgt             1b
+        bx              lr
+endfunc
+
+function idct_dc_w16_neon  // add q8 to a 16 pixel wide column at r0 (stride r1), r3 rows, 4 rows per iteration
+1:
+        vld1.8          {q0}, [r0, :128], r1
+        vld1.8          {q1}, [r0, :128], r1
+        vld1.8          {q2}, [r0, :128], r1
+        subs            r3,  r3,  #4  // flags are consumed by the bgt at the end of the loop
+        vaddw.u8        q10, q8,  d0
+        vaddw.u8        q11, q8,  d1
+        vld1.8          {q3}, [r0, :128], r1
+        vaddw.u8        q12, q8,  d2
+        vaddw.u8        q13, q8,  d3
+        sub             r0,  r0,  r1, lsl #2  // rewind r0 by 4 rows for the stores
+        vaddw.u8        q14, q8,  d4
+        vaddw.u8        q15, q8,  d5
+        vqmovun.s16     d0,  q10  // saturate to unsigned 8 bit
+        vqmovun.s16     d1,  q11
+        vaddw.u8        q10, q8,  d6  // q10 and q11 are reused for the fourth row
+        vaddw.u8        q11, q8,  d7
+        vqmovun.s16     d2,  q12
+        vqmovun.s16     d3,  q13
+        vqmovun.s16     d4,  q14
+        vqmovun.s16     d5,  q15
+        vst1.8          {q0}, [r0, :128], r1
+        vqmovun.s16     d6,  q10
+        vqmovun.s16     d7,  q11
+        vst1.8          {q1}, [r0, :128], r1
+        vst1.8          {q2}, [r0, :128], r1
+        vst1.8          {q3}, [r0, :128], r1
+        bgt             1b
+        bx              lr
+endfunc
+
+function idct_dc_w32_neon  // add q8 to a 32 pixel wide column at r0 (stride r1), r3 rows, 2 rows per iteration
+1:
+        vld1.8          {q0, q1}, [r0, :128], r1
+        subs            r3,  r3,  #2  // flags are consumed by the bgt at the end of the loop
+        vld1.8          {q2, q3}, [r0, :128], r1
+        vaddw.u8        q10, q8,  d0
+        vaddw.u8        q11, q8,  d1
+        vaddw.u8        q12, q8,  d2
+        vaddw.u8        q13, q8,  d3
+        sub             r0,  r0,  r1, lsl #1  // rewind r0 by 2 rows for the stores
+        vaddw.u8        q14, q8,  d4
+        vaddw.u8        q15, q8,  d5
+        vqmovun.s16     d0,  q10  // saturate to unsigned 8 bit
+        vqmovun.s16     d1,  q11
+        vaddw.u8        q10, q8,  d6  // q10 and q11 are reused for the last 16 pixels
+        vaddw.u8        q11, q8,  d7
+        vqmovun.s16     d2,  q12
+        vqmovun.s16     d3,  q13
+        vqmovun.s16     d4,  q14
+        vqmovun.s16     d5,  q15
+        vst1.8          {q0, q1}, [r0, :128], r1
+        vqmovun.s16     d6,  q10
+        vqmovun.s16     d7,  q11
+        vst1.8          {q2, q3}, [r0, :128], r1
+        bgt             1b
+        bx              lr
+endfunc
+
+function idct_dc_w64_neon  // add q8 to a 64 pixel wide column at r0 (stride r1), r3 rows, 1 row per iteration
+        sub             r1,  r1,  #32  // the first 32 bytes of each row advance r0 by writeback, so the stride step is reduced by 32
+1:
+        vld1.8          {q0, q1}, [r0, :128]!
+        subs            r3,  r3,  #1  // flags are consumed by the bgt at the end of the loop
+        vld1.8          {q2, q3}, [r0, :128]
+        vaddw.u8        q10, q8,  d0
+        vaddw.u8        q11, q8,  d1
+        vaddw.u8        q12, q8,  d2
+        vaddw.u8        q13, q8,  d3
+        sub             r0,  r0,  #32  // back to the start of the row for the stores
+        vaddw.u8        q14, q8,  d4
+        vaddw.u8        q15, q8,  d5
+        vqmovun.s16     d0,  q10  // saturate to unsigned 8 bit
+        vqmovun.s16     d1,  q11
+        vaddw.u8        q10, q8,  d6  // q10 and q11 are reused for the last 16 pixels
+        vaddw.u8        q11, q8,  d7
+        vqmovun.s16     d2,  q12
+        vqmovun.s16     d3,  q13
+        vqmovun.s16     d4,  q14
+        vqmovun.s16     d5,  q15
+        vst1.8          {q0, q1}, [r0, :128]!
+        vqmovun.s16     d6,  q10
+        vqmovun.s16     d7,  q11
+        vst1.8          {q2, q3}, [r0, :128], r1  // r1 (stride - 32) moves r0 to the next row
+        bgt             1b
+        bx              lr
+endfunc
+
+.macro iwht4  // 4-point add/sub/halve butterfly used by the wht_wht 4x4 function; in place on d16-d19, scratch d20-d21
+        vadd.i16        d16, d16, d17
+        vsub.i16        d21, d18, d19
+        vsub.i16        d20, d16, d21
+        vshr.s16        d20, d20, #1  // arithmetic shift without rounding
+        vsub.i16        d18, d20, d17
+        vsub.i16        d17, d20, d19
+        vadd.i16        d19, d21, d18
+        vsub.i16        d16, d16, d17
+.endm
+
+.macro idct_4h_x4 r0, r1, r2, r3  // 4-point inverse DCT on four 4 x s16 vectors, in place; needs d0 = first row of idct_coeffs, clobbers q1-q3
+        vmull_vmlal     q3,  \r1, \r3, d0[3], d0[2]  // r1 * 3784 + r3 * 1567
+        vmull_vmlsl     q2,  \r1, \r3, d0[2], d0[3]  // r1 * 1567 - r3 * 3784
+        vmull_vmlal     q1,  \r0, \r2, d0[0], d0[0]  // (r0 + r2) * 2896
+        vrshrn.i32      d6,  q3,  #12
+        vrshrn.i32      d7,  q2,  #12
+        vmull_vmlsl     q2,  \r0, \r2, d0[0], d0[0]  // (r0 - r2) * 2896
+        vrshrn.i32      d2,  q1,  #12
+        vrshrn.i32      d3,  q2,  #12
+        vqadd.s16       \r0, d2,  d6  // final butterflies use saturating arithmetic
+        vqsub.s16       \r3, d2,  d6
+        vqadd.s16       \r1, d3,  d7
+        vqsub.s16       \r2, d3,  d7
+.endm
+
+.macro idct_8h_x4 q0, q1, q2, q3, r0, r1, r2, r3, r4, r5, r6, r7  // 4-point inverse DCT on four 8 x s16 vectors; rN are the d-register halves of the qN arguments; clobbers q2-q7
+        vmull_vmlal_8h  q6,  q7,  \r2, \r3, \r6, \r7, d0[3], d0[2]  // in1 * 3784 + in3 * 1567
+        vmull_vmlsl_8h  q4,  q5,  \r2, \r3, \r6, \r7, d0[2], d0[3]  // in1 * 1567 - in3 * 3784
+        vmull_vmlal_8h  q2,  q3,  \r0, \r1, \r4, \r5, d0[0], d0[0]  // (in0 + in2) * 2896
+        vrshrn_8h       d12, d13, q6,  q7,  #12
+        vrshrn_8h       d14, d15, q4,  q5,  #12
+        vmull_vmlsl_8h  q4,  q5,  \r0, \r1, \r4, \r5, d0[0], d0[0]  // (in0 - in2) * 2896
+        vrshrn_8h       d4,  d5,  q2,  q3,  #12
+        vrshrn_8h       d6,  d7,  q4,  q5,  #12
+        vqadd.s16       \q0, q2,  q6  // final butterflies use saturating arithmetic
+        vqsub.s16       \q3, q2,  q6
+        vqadd.s16       \q1, q3,  q7
+        vqsub.s16       \q2, q3,  q7
+.endm
+
+function inv_dct_4h_x4_neon, export=1  // 4-point inverse DCT, in place on d16-d19; clobbers r12, d0 and q1-q3
+        movrel_local    r12, idct_coeffs
+        vld1.16         {d0}, [r12, :64]  // only the idct4 row of the table is needed
+        idct_4h_x4      d16, d17, d18, d19
+        bx              lr
+endfunc
+
+function inv_dct_8h_x4_neon, export=1  // 4-point inverse DCT, in place on q8-q11 (8 lanes each); clobbers r12, d0 and q2-q7
+        movrel_local    r12, idct_coeffs
+        vld1.16         {d0}, [r12, :64]  // only the idct4 row of the table is needed
+        idct_8h_x4      q8,  q9,  q10, q11, d16, d17, d18, d19, d20, d21, d22, d23
+        bx              lr
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3  // 4-point ADST with inputs fixed in d16-d19 and outputs in o0-o3; clobbers r12, d0-d1, q1-q3, q10-q11
+        movrel_local    r12, iadst4_coeffs
+        vld1.16         {d0, d1}, [r12, :128]
+
+        vsubl.s16       q1,  d16, d18  // in0 - in2, widened to 32 bit
+        vmull.s16       q2,  d16, d0[0]
+        vmlal.s16       q2,  d18, d0[1]
+        vmlal.s16       q2,  d19, d0[2]
+        vmull.s16       q10, d17, d0[3]  // in1 * 3344; last read of d17, and d20/d21 are free to clobber
+        vaddw.s16       q1,  q1,  d19  // in0 - in2 + in3
+        vmull.s16       q3,  d16, d0[2]
+        vmlsl.s16       q3,  d18, d0[0]
+        vmlsl.s16       q3,  d19, d0[1]
+
+        vadd.s32        q11, q2,  q3
+        vmul.s32        q1,  q1,  d1[0]  // 32-bit multiply by 3344 (d1[0] read as a 32-bit scalar)
+        vadd.s32        q2,  q2,  q10
+        vadd.s32        q3,  q3,  q10
+        vsub.s32        q11, q11, q10
+
+        vrshrn.i32      \o0, q2,  #12  // outputs are written only after all inputs have been consumed
+        vrshrn.i32      \o2, q1,  #12
+        vrshrn.i32      \o1, q3,  #12
+        vrshrn.i32      \o3, q11, #12
+.endm
+
+function inv_adst_4h_x4_neon, export=1  // 4-point ADST, in place on d16-d19
+        iadst_4x4       d16, d17, d18, d19  // outputs in natural order
+        bx              lr
+endfunc
+
+function inv_flipadst_4h_x4_neon, export=1  // 4-point ADST on d16-d19 with the output order reversed
+        iadst_4x4       d19, d18, d17, d16  // out0 goes to d19 ... out3 goes to d16
+        bx              lr
+endfunc
+
+.macro iadst_8x4 o0, o1, o2, o3, o4, o5, o6, o7  // 4-point ADST on 8 lanes: inputs fixed in q8-q11, outputs as d-register pairs o0-o7; clobbers r12, d0-d1, q2-q7
+        movrel_local    r12, iadst4_coeffs
+        vld1.16         {d0, d1}, [r12, :128]
+
+        vsubl.s16       q2,  d16, d20  // in0 - in2, widened to 32 bit
+        vsubl.s16       q3,  d17, d21
+        vmull.s16       q4,  d16, d0[0]
+        vmlal.s16       q4,  d20, d0[1]
+        vmlal.s16       q4,  d22, d0[2]
+        vmull.s16       q5,  d17, d0[0]
+        vmlal.s16       q5,  d21, d0[1]
+        vmlal.s16       q5,  d23, d0[2]
+        vaddw.s16       q2,  q2,  d22  // in0 - in2 + in3
+        vaddw.s16       q3,  q3,  d23
+        vmull.s16       q6,  d16, d0[2]
+        vmlsl.s16       q6,  d20, d0[0]
+        vmlsl.s16       q6,  d22, d0[1]
+        vmull.s16       q7,  d17, d0[2]
+        vmlsl.s16       q7,  d21, d0[0]
+        vmlsl.s16       q7,  d23, d0[1]
+
+        vmul.s32        q10, q2,  d1[0]  // third output before narrowing (d1[0] read as the 32-bit scalar 3344)
+        vmul.s32        q11, q3,  d1[0]
+
+        vmull.s16       q2,  d18, d0[3]  // in1 * 3344
+        vmull.s16       q3,  d19, d0[3]
+
+        vadd.s32        q8,  q4,  q2 // out0
+        vadd.s32        q9,  q5,  q3
+
+        vadd.s32        q4,  q4,  q6 // out3, partial sum (completed by the vsub below)
+        vadd.s32        q5,  q5,  q7
+
+        vadd.s32        q6,  q6,  q2 // out1
+        vadd.s32        q7,  q7,  q3
+
+        vsub.s32        q4,  q4,  q2 // out3
+        vsub.s32        q5,  q5,  q3
+
+        vrshrn.i32      d20, q10, #12  // third output is always narrowed into q10 (d20, d21)
+        vrshrn.i32      d21, q11, #12
+
+        vrshrn.i32      \o0, q8,  #12
+        vrshrn.i32      \o1, q9,  #12
+
+.ifc \o4, d18  // flipped order: the third output belongs in q9, so copy it before q10 is overwritten below
+        vmov            q9,  q10
+.endif
+
+        vrshrn.i32      \o2, q6,  #12
+        vrshrn.i32      \o3, q7,  #12
+
+        vrshrn.i32      \o6, q4,  #12
+        vrshrn.i32      \o7, q5,  #12
+.endm
+
+function inv_adst_8h_x4_neon, export=1  // 4-point ADST on 8 lanes, in place on q8-q11
+        iadst_8x4       d16, d17, d18, d19, d20, d21, d22, d23  // outputs in natural order
+        bx              lr
+endfunc
+
+function inv_flipadst_8h_x4_neon, export=1  // 4-point ADST on 8 lanes in q8-q11 with the output order reversed
+        iadst_8x4       d22, d23, d20, d21, d18, d19, d16, d17  // out0 goes to q11 ... out3 goes to q8
+        bx              lr
+endfunc
+
+function inv_identity_4h_x4_neon, export=1  // scale d16-d19 (q8, q9) in place by about 5793/4096; clobbers r12, d0, q2, q3
+        movw            r12, #(5793-4096)*8  // fractional part only; the integer part is the vqadd below
+        vdup.16         d0,  r12
+        vqrdmulh.s16    q2,  q8,  d0[0]  // x * (5793 - 4096) / 4096, rounded
+        vqrdmulh.s16    q3,  q9,  d0[0]
+        vqadd.s16       q8,  q8,  q2  // x + fractional product, saturating
+        vqadd.s16       q9,  q9,  q3
+        bx              lr
+endfunc
+
+function inv_identity_8h_x4_neon, export=1  // scale q8-q11 in place by about 5793/4096; clobbers r12, d0, q1-q3
+        movw            r12, #(5793-4096)*8  // fractional part only; the integer part is the vqadd below
+        vdup.16         d0,  r12
+        vqrdmulh.s16    q1,  q8,  d0[0]  // x * (5793 - 4096) / 4096, rounded
+        vqrdmulh.s16    q2,  q9,  d0[0]
+        vqrdmulh.s16    q3,  q10, d0[0]
+        vqadd.s16       q8,  q8,  q1
+        vqrdmulh.s16    q1,  q11, d0[0]  // q1 is free again after the add above
+        vqadd.s16       q9,  q9,  q2
+        vqadd.s16       q10, q10, q3
+        vqadd.s16       q11, q11, q1
+        bx              lr
+endfunc
+
+.macro identity_8x4_shift1 r0, r1, r2, r3, c  // for each of the four vectors: x = (x + vqrdmulh(x, c) + 1) >> 1; clobbers q1
+.irp i, \r0, \r1, \r2, \r3
+        vqrdmulh.s16    q1,  \i,  \c
+        vrhadd.s16      \i,  \i,  q1  // rounding halving add folds the add and the shift by 1 together
+.endr
+.endm
+
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1  // 4x4 wht_wht transform of the coefficients at r2, added to the pixels at r0 (stride r1)
+        push            {r4-r5,lr}  // same frame as the other 4x4 entry points; popped at the shared itx_4x4_end tail
+        vmov.i16        q15, #0
+        vld1.16         {d16, d17, d18, d19}, [r2, :128]  // all 16 coefficients
+        vst1.16         {q15}, [r2, :128]!  // clear the first half of the coefficient buffer
+
+        vshr.s16        q8,  q8,  #2  // downscale the input by 4 without rounding
+        vshr.s16        q9,  q9,  #2
+
+        iwht4
+
+        vst1.16         {q15}, [r2, :128]!  // clear the second half of the coefficient buffer
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+
+        iwht4
+
+        vld1.32         {d0[]},  [r0, :32], r1  // load the four destination rows, two per d register
+        vld1.32         {d0[1]}, [r0, :32], r1
+        vld1.32         {d1[]},  [r0, :32], r1
+        vld1.32         {d1[1]}, [r0, :32], r1
+
+        b               L(itx_4x4_end)  // joins the shared tail after its rounding shift, so no shift is applied here
+endfunc
+
+function inv_txfm_add_4x4_neon  // shared 4x4 body: r4 = first transform, r5 = second transform; the caller has already pushed r4-r5 and lr
+        vmov.i16        q15, #0
+        vld1.16         {d16, d17, d18, d19}, [r2, :128]  // all 16 coefficients
+        vst1.16         {q15}, [r2, :128]!  // clear the first half of the coefficient buffer
+
+        blx             r4  // first pass, operating in place on d16-d19
+
+        vst1.16         {q15}, [r2, :128]!  // clear the second half of the coefficient buffer
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+
+        blx             r5  // second pass on the transposed data
+
+        vld1.32         {d0[]},  [r0, :32], r1  // load the four destination rows, two per d register
+        vld1.32         {d0[1]}, [r0, :32], r1
+        vld1.32         {d1[]},  [r0, :32], r1
+        vld1.32         {d1[1]}, [r0, :32], r1
+        vrshr.s16       q8,  q8,  #4  // final rounding shift of the residual
+        vrshr.s16       q9,  q9,  #4
+
+L(itx_4x4_end):  // shared tail: expects the residual in q8-q9, pixels in d0-d1 and r0 four rows past the block start
+        sub             r0,  r0,  r1, lsl #2  // rewind r0 by 4 rows for the stores
+        vaddw.u8        q8,  q8,  d0
+        vqmovun.s16     d0,  q8  // saturate to unsigned 8 bit
+        vaddw.u8        q9,  q9,  d1
+        vst1.32         {d0[0]}, [r0, :32], r1
+        vqmovun.s16     d1,  q9
+        vst1.32         {d0[1]}, [r0, :32], r1
+        vst1.32         {d1[0]}, [r0, :32], r1
+        vst1.32         {d1[1]}, [r0, :32], r1
+
+        pop             {r4-r5,pc}  // restores the registers pushed by the entry point and returns to its caller
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2  // emit the exported 4x4 entry point for one transform pair; it selects the helpers and jumps to inv_txfm_add_4x4_neon
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
+        push            {r4-r5,lr}  // popped by the shared tail in inv_txfm_add_4x4_neon
+
+.ifc \txfm1\()_\txfm2, dct_dct  // only dct_dct gets the DC-only fast path
+        cmp             r3,  #0
+        bne             1f  // eob != 0: run the full transform
+        vmov.i16        d30, #0
+        movw            r12, #2896*8  // 2896*8 with vqrdmulh gives a multiply by 2896/4096
+        vld1.16         {d16[]},  [r2, :16]  // broadcast the first coefficient to all lanes of d16
+        vdup.16         d4,  r12
+        vst1.16         {d30[0]}, [r2, :16]  // clear the consumed coefficient in memory
+        vqrdmulh.s16    d16, d16, d4[0]  // first scaling pass
+        vld1.32         {d0[0]},  [r0, :32], r1  // destination rows are loaded in between the arithmetic
+        vqrdmulh.s16    d20, d16, d4[0]  // second scaling pass
+        vld1.32         {d0[1]},  [r0, :32], r1
+        vrshr.s16       d16, d20, #4  // final rounding shift
+        vrshr.s16       d17, d20, #4
+        vld1.32         {d1[0]},  [r0, :32], r1
+        vmov            q9,  q8  // the same DC value for all four rows
+        vld1.32         {d1[1]}, [r0, :32], r1
+        b               L(itx_4x4_end)  // add, store and return via the shared tail
+1:
+.endif
+        movrel_local    r4,  inv_\txfm1\()_4h_x4_neon  // first transform
+        movrel_local    r5,  inv_\txfm2\()_4h_x4_neon  // second transform, run after the transpose
+        b               inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct  // instantiate all 16 pairings of dct, adst, flipadst and identity
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15  // 8-point inverse DCT on eight 8 x s16 vectors, in place; rN are the d halves of the qN arguments; needs q0 = first 8 idct_coeffs, clobbers q2-q7
+        idct_8h_x4      \q0, \q2, \q4, \q6, \r0, \r1, \r4, \r5, \r8, \r9, \r12, \r13  // even-indexed inputs go through the 4-point transform
+
+        vmull_vmlsl_8h  q2,   q3,   \r2,  \r3,  \r14, \r15, d1[0], d1[1] // -> t4a
+        vmull_vmlal_8h  q4,   q5,   \r2,  \r3,  \r14, \r15, d1[1], d1[0] // -> t7a
+        vmull_vmlsl_8h  q6,   q7,   \r10, \r11, \r6,  \r7,  d1[2], d1[3] // -> t5a
+        vrshrn_8h       \r2,  \r3,  q2,   q3,   #12         // t4a
+        vrshrn_8h       \r14, \r15, q4,   q5,   #12         // t7a
+        vmull_vmlal_8h  q2,   q3,   \r10, \r11, \r6,  \r7,  d1[3], d1[2] // -> t6a
+        vrshrn_8h       \r6,  \r7,  q6,   q7,   #12         // t5a
+        vrshrn_8h       \r10, \r11, q2,   q3,   #12         // t6a
+
+        vqadd.s16       q2,   \q1,  \q3 // t4
+        vqsub.s16       \q1,  \q1,  \q3 // t5a
+        vqadd.s16       q3,   \q7,  \q5 // t7
+        vqsub.s16       \q3,  \q7,  \q5 // t6a
+
+        vmull_vmlsl_8h  q4,   q5,   \r6,  \r7,  \r2,  \r3,  d0[0], d0[0] // -> t5
+        vmull_vmlal_8h  q6,   q7,   \r6,  \r7,  \r2,  \r3,  d0[0], d0[0] // -> t6
+        vrshrn_8h       d8,   d9,   q4,   q5,  #12 // t5
+        vrshrn_8h       d10,  d11,  q6,   q7,  #12 // t6
+
+        vqsub.s16       \q7,  \q0,  q3 // out7
+        vqadd.s16       \q0,  \q0,  q3 // out0
+        vqadd.s16       \q1,  \q2,  q5 // out1
+        vqsub.s16       q6,   \q2,  q5 // out6
+        vqadd.s16       \q2,  \q4,  q4 // out2
+        vqsub.s16       \q5,  \q4,  q4 // out5
+        vqadd.s16       \q3,  \q6,  q2 // out3
+        vqsub.s16       \q4,  \q6,  q2 // out4
+        vmov            \q6,  q6       // out6
+.endm
+
+.macro idct_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7  // 8-point inverse DCT on eight 4 x s16 vectors, in place; needs q0 = first 8 idct_coeffs, clobbers q1-q3
+        idct_4h_x4      \r0, \r2, \r4, \r6  // even-indexed inputs go through the 4-point transform
+
+        vmull_vmlsl     q1,   \r1,  \r7, d1[0], d1[1] // -> t4a
+        vmull_vmlal     q2,   \r1,  \r7, d1[1], d1[0] // -> t7a
+        vmull_vmlsl     q3,   \r5,  \r3, d1[2], d1[3] // -> t5a
+        vrshrn.i32      \r1,  q1,   #12               // t4a
+        vmull_vmlal     q1,   \r5,  \r3, d1[3], d1[2] // -> t6a
+        vrshrn.i32      \r7,  q2,   #12               // t7a
+        vrshrn.i32      \r3,  q3,   #12               // t5a
+        vrshrn.i32      \r5,  q1,   #12               // t6a
+
+        vqadd.s16       d2,   \r1,  \r3 // t4
+        vqsub.s16       \r1,  \r1,  \r3 // t5a
+        vqadd.s16       d3,   \r7,  \r5 // t7
+        vqsub.s16       \r3,  \r7,  \r5 // t6a
+
+        vmull_vmlsl     q2,   \r3,  \r1, d0[0], d0[0] // -> t5
+        vmull_vmlal     q3,   \r3,  \r1, d0[0], d0[0] // -> t6
+        vrshrn.i32      d4,   q2,   #12               // t5
+        vrshrn.i32      d5,   q3,   #12               // t6
+
+        vqsub.s16       \r7,  \r0,  d3 // out7
+        vqadd.s16       \r0,  \r0,  d3 // out0
+        vqadd.s16       \r1,  \r2,  d5 // out1
+        vqsub.s16       d6,   \r2,  d5 // out6
+        vqadd.s16       \r2,  \r4,  d4 // out2
+        vqsub.s16       \r5,  \r4,  d4 // out5
+        vqadd.s16       \r3,  \r6,  d2 // out3
+        vqsub.s16       \r4,  \r6,  d2 // out4
+        vmov            \r6,  d6       // out6
+.endm
+
+function inv_dct_8h_x8_neon, export=1  // 8-point inverse DCT, in place on q8-q15 (8 lanes each); clobbers r12 and q0-q7
+        movrel_local    r12, idct_coeffs
+        vld1.16         {q0}, [r12, :128]  // idct4 row in d0, idct8 row in d1
+        idct_8h_x8      q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        bx              lr
+endfunc
+
+function inv_dct_4h_x8_neon, export=1  // 8-point inverse DCT, in place on d16-d23 (4 lanes each); clobbers r12 and q0-q3
+        movrel_local    r12, idct_coeffs
+        vld1.16         {q0}, [r12, :128]  // idct4 row in d0, idct8 row in d1
+        idct_4h_x8      d16, d17, d18, d19, d20, d21, d22, d23
+        bx              lr
+endfunc
+
+.macro iadst_8h_x8 q0, q1, q2, q3, q4, q5, q6, q7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15  // 8-point ADST on 8 lanes: inputs are fixed in q8-q15, the arguments only choose the output registers; clobbers r12 and q0-q7
+        movrel_local    r12, iadst8_coeffs
+        vld1.16         {d0, d1, d2}, [r12, :64]  // d0-d1 = ADST multipliers, d2 = 2896, 0, 1567, 3784
+
+        vmull_vmlal_8h  q2,  q3,  d30, d31, d16, d17, d0[0], d0[1]
+        vmull_vmlsl_8h  q4,  q5,  d30, d31, d16, d17, d0[1], d0[0]
+        vmull_vmlal_8h  q6,  q7,  d26, d27, d20, d21, d0[2], d0[3]
+        vrshrn_8h       d16, d17, q2,  q3,  #12  // t0a
+        vrshrn_8h       d30, d31, q4,  q5,  #12  // t1a
+        vmull_vmlsl_8h  q2,  q3,  d26, d27, d20, d21, d0[3], d0[2]
+        vmull_vmlal_8h  q4,  q5,  d22, d23, d24, d25, d1[0], d1[1]
+        vrshrn_8h       d20, d21, q6,  q7,  #12  // t2a
+        vrshrn_8h       d26, d27, q2,  q3,  #12  // t3a
+        vmull_vmlsl_8h  q6,  q7,  d22, d23, d24, d25, d1[1], d1[0]
+        vmull_vmlal_8h  q2,  q3,  d18, d19, d28, d29, d1[2], d1[3]
+        vrshrn_8h       d24, d25, q4,  q5,  #12  // t4a
+        vrshrn_8h       d22, d23, q6,  q7,  #12  // t5a
+        vmull_vmlsl_8h  q4,  q5,  d18, d19, d28, d29, d1[3], d1[2]
+        vrshrn_8h       d28, d29, q2,  q3,  #12  // t6a
+        vrshrn_8h       d18, d19, q4,  q5,  #12  // t7a
+
+        vqadd.s16       q2,  q8,  q12 // t0
+        vqsub.s16       q3,  q8,  q12 // t4
+        vqadd.s16       q4,  q15, q11 // t1
+        vqsub.s16       q5,  q15, q11 // t5
+        vqadd.s16       q6,  q10, q14 // t2
+        vqsub.s16       q7,  q10, q14 // t6
+        vqadd.s16       q10, q13, q9  // t3
+        vqsub.s16       q11, q13, q9  // t7
+
+        vmull_vmlal_8h  q8,  q9,  d6,  d7,  d10, d11, d2[3], d2[2]
+        vmull_vmlsl_8h  q12, q13, d6,  d7,  d10, d11, d2[2], d2[3]
+        vmull_vmlsl_8h  q14, q15, d22, d23, d14, d15, d2[3], d2[2]
+
+        vrshrn_8h       d6,  d7,  q8,  q9,  #12  // t4a
+        vrshrn_8h       d10, d11, q12, q13, #12  // t5a
+
+        vmull_vmlal_8h  q8,  q9,  d22, d23, d14, d15, d2[2], d2[3]
+
+        vrshrn_8h       d14, d15, q14, q15, #12  // t6a
+        vrshrn_8h       d22, d23, q8,  q9,  #12  // t7a
+
+        vqadd.s16       \q0, q2,  q6  // out0
+        vqsub.s16       q2,  q2,  q6  // t2
+        vqadd.s16       \q7, q4,  q10 // out7
+        vqsub.s16       q4,  q4,  q10 // t3
+        vqneg.s16       \q7, \q7     // out7
+
+        vqadd.s16       \q1, q3,  q7  // out1
+        vqsub.s16       q3,  q3,  q7  // t6
+        vqadd.s16       \q6, q5,  q11 // out6
+        vqsub.s16       q5,  q5,  q11 // t7
+        vqneg.s16       \q1, \q1     // out1
+
+        vmull_vmlal_8h  q10, q11, d4,  d5,  d8,  d9,  d2[0], d2[0] // -> out3 (q11 or q12)
+        vmull_vmlsl_8h  q6,  q7,  d4,  d5,  d8,  d9,  d2[0], d2[0] // -> out4 (q12 or q11)
+        vmull_vmlsl_8h  q12, q13, d6,  d7,  d10, d11, d2[0], d2[0] // -> out5 (q13 or q10)
+        vrshrn_8h       d4,  d5,  q10, q11, #12 // out3
+        vmull_vmlal_8h  q10, q11, d6,  d7,  d10, d11, d2[0], d2[0] // -> out2 (q10 or q13)
+        vrshrn_8h       d6,  d7,  q12, q13, #12 // out5
+        vrshrn_8h       \r4, \r5, q10, q11, #12 // out2 (q10 or q13)
+        vrshrn_8h       \r8, \r9, q6,  q7,  #12 // out4 (q12 or q11)
+
+        vqneg.s16       \q3, q2     // out3
+        vqneg.s16       \q5, q3     // out5
+.endm
+
+.macro iadst_4h_x8 r0, r1, r2, r3, r4, r5, r6, r7 // 8-point inverse ADST on 4 lanes: reads d16-d23, writes out0..out7 to \r0..\r7
+        movrel_local    r12, iadst8_coeffs
+        vld1.16         {d0, d1, d2}, [r12, :64] // d0-d1 feed the first stage, d2 the later stages
+        // Stage 1: rotate the pairs d23/d16, d21/d18, d19/d20, d17/d22. Scratch: r12, d0-d2, q2-q7; q8-q11 are reused as accumulators.
+        vmull_vmlal     q2,  d23, d16, d0[0], d0[1]
+        vmull_vmlsl     q3,  d23, d16, d0[1], d0[0]
+        vmull_vmlal     q4,  d21, d18, d0[2], d0[3]
+        vrshrn.i32      d16, q2,  #12 // t0a
+        vrshrn.i32      d23, q3,  #12 // t1a
+        vmull_vmlsl     q5,  d21, d18, d0[3], d0[2]
+        vmull_vmlal     q6,  d19, d20, d1[0], d1[1]
+        vrshrn.i32      d18, q4,  #12 // t2a
+        vrshrn.i32      d21, q5,  #12 // t3a
+        vmull_vmlsl     q7,  d19, d20, d1[1], d1[0]
+        vmull_vmlal     q2,  d17, d22, d1[2], d1[3]
+        vrshrn.i32      d20, q6,  #12 // t4a
+        vrshrn.i32      d19, q7,  #12 // t5a
+        vmull_vmlsl     q3,  d17, d22, d1[3], d1[2]
+        vrshrn.i32      d22, q2,  #12 // t6a
+        vrshrn.i32      d17, q3,  #12 // t7a
+        // Stage 2: saturating 16-bit butterflies on the narrowed stage 1 results.
+        vqadd.s16       d4,  d16, d20 // t0
+        vqsub.s16       d5,  d16, d20 // t4
+        vqadd.s16       d6,  d23, d19 // t1
+        vqsub.s16       d7,  d23, d19 // t5
+        vqadd.s16       d8,  d18, d22 // t2
+        vqsub.s16       d9,  d18, d22 // t6
+        vqadd.s16       d18, d21, d17 // t3
+        vqsub.s16       d19, d21, d17 // t7
+        // Stage 3: rotate t4/t5 and t7/t6 with the d2[2]/d2[3] constants.
+        vmull_vmlal     q8,  d5,  d7,  d2[3], d2[2]
+        vmull_vmlsl     q10, d5,  d7,  d2[2], d2[3]
+        vmull_vmlsl     q11, d19, d9,  d2[3], d2[2]
+
+        vrshrn.i32      d5,  q8,  #12 // t4a
+        vrshrn.i32      d7,  q10, #12 // t5a
+
+        vmull_vmlal     q8,  d19, d9,  d2[2], d2[3]
+
+        vrshrn.i32      d9,  q11, #12 // t6a
+        vrshrn.i32      d19, q8,  #12 // t7a
+        // Stage 4: final butterflies; out7 and out1 are negated in place.
+        vqadd.s16       \r0, d4,  d8  // out0
+        vqsub.s16       d4,  d4,  d8  // t2
+        vqadd.s16       \r7, d6,  d18 // out7
+        vqsub.s16       d6,  d6,  d18 // t3
+        vqneg.s16       \r7, \r7      // out7
+
+        vqadd.s16       \r1, d5,  d9  // out1
+        vqsub.s16       d5,  d5,  d9  // t6
+        vqadd.s16       \r6, d7,  d19 // out6
+        vqsub.s16       d7,  d7,  d19 // t7
+        vqneg.s16       \r1, \r1      // out1
+        // out2..out5: sums/differences scaled by d2[0] and narrowed; out3 and out5 are negated at the end.
+        vmull_vmlal     q9,  d4,  d6,  d2[0], d2[0] // -> out3 (d19 or d20)
+        vmull_vmlsl     q4,  d4,  d6,  d2[0], d2[0] // -> out4 (d20 or d19)
+        vmull_vmlsl     q10, d5,  d7,  d2[0], d2[0] // -> out5 (d21 or d18)
+        vrshrn.i32      d4,  q9,  #12 // out3
+        vmull_vmlal     q9,  d5,  d7,  d2[0], d2[0] // -> out2 (d18 or d21)
+        vrshrn.i32      d5,  q10, #12 // out5
+        vrshrn.i32      \r2, q9,  #12 // out2 (d18 or d21)
+        vrshrn.i32      \r4, q4,  #12 // out4 (d20 or d19)
+
+        vqneg.s16       \r3, d4       // out3
+        vqneg.s16       \r5, d5       // out5
+.endm
+
+function inv_adst_8h_x8_neon, export=1 // 8-point inverse ADST on 8 lanes, in place on q8-q15 (outputs in ascending register order)
+        iadst_8h_x8     q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        bx              lr // the macro clobbers q2-q7, so callers are expected to have saved q4-q7
+endfunc
+
+function inv_flipadst_8h_x8_neon, export=1 // same transform as inv_adst_8h_x8_neon, with the output registers named in reverse order
+        iadst_8h_x8     q15, q14, q13, q12, q11, q10, q9,  q8,  d30, d31, d28, d29, d26, d27, d24, d25, d22, d23, d20, d21, d18, d19, d16, d17
+        bx              lr // return to the caller
+endfunc
+
+function inv_adst_4h_x8_neon, export=1 // 8-point inverse ADST on 4 lanes, in place on d16-d23
+        iadst_4h_x8     d16, d17, d18, d19, d20, d21, d22, d23 // out0 -> d16 ... out7 -> d23
+        bx              lr // return to the caller
+endfunc
+
+function inv_flipadst_4h_x8_neon, export=1 // 8-point inverse ADST on 4 lanes with the output order flipped
+        iadst_4h_x8     d23, d22, d21, d20, d19, d18, d17, d16 // out0 -> d23 ... out7 -> d16
+        bx              lr // return to the caller
+endfunc
+
+function inv_identity_8h_x8_neon, export=1 // 8-point identity transform on 8 lanes: doubles q8-q15 in place
+        vqshl.s16       q8,  q8,  #1 // saturating shift left by one, i.e. x * 2 clamped to the int16 range
+        vqshl.s16       q9,  q9,  #1
+        vqshl.s16       q10, q10, #1
+        vqshl.s16       q11, q11, #1
+        vqshl.s16       q12, q12, #1
+        vqshl.s16       q13, q13, #1
+        vqshl.s16       q14, q14, #1
+        vqshl.s16       q15, q15, #1
+        bx              lr // return to the caller
+endfunc
+
+function inv_identity_4h_x8_neon, export=1 // 8-point identity transform on 4 lanes: doubles d16-d23 (q8-q11) in place
+        vqshl.s16       q8,  q8,  #1 // saturating x * 2; each q register covers two of the eight d-register rows
+        vqshl.s16       q9,  q9,  #1
+        vqshl.s16       q10, q10, #1
+        vqshl.s16       q11, q11, #1
+        bx              lr // return to the caller
+endfunc
+
+.macro def_fn_8x8_base variant // emits the shared 8x8 body; \variant is either empty or identity_
+function inv_txfm_\variant\()add_8x8_neon // in: r2 = coefficients, r4/r5 = first/second pass functions, r0 and r7 go to load_add_store_8x8
+        vmov.i16        q0,  #0
+        vmov.i16        q1,  #0
+        vld1.16         {q8,  q9},  [r2, :128] // load the 64 coefficients into q8-q15 ...
+        vst1.16         {q0,  q1},  [r2, :128]! // ... and clear the coefficient buffer behind each load
+        vld1.16         {q10, q11}, [r2, :128]
+        vst1.16         {q0,  q1},  [r2, :128]!
+        vld1.16         {q12, q13}, [r2, :128]
+        vst1.16         {q0,  q1},  [r2, :128]!
+        vld1.16         {q14, q15}, [r2, :128]
+        vst1.16         {q0,  q1},  [r2, :128]
+
+.ifc \variant, identity_
+        // The identity pass (vqshl #1) and the intermediate downshift (vrshr #1) cancel out, so both are skipped
+.else
+        blx             r4 // first pass
+
+        vrshr.s16       q8,  q8,  #1 // rounding downshift between the two passes
+        vrshr.s16       q9,  q9,  #1
+        vrshr.s16       q10, q10, #1
+        vrshr.s16       q11, q11, #1
+        vrshr.s16       q12, q12, #1
+        vrshr.s16       q13, q13, #1
+        vrshr.s16       q14, q14, #1
+        vrshr.s16       q15, q15, #1
+.endif
+
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+        blx             r5 // second pass
+
+        load_add_store_8x8 r0, r7
+        vpop            {q4-q7} // undo the vpush/push done by the def_fn_8x8 entry points
+        pop             {r4-r5,r7,pc}
+endfunc
+.endm
+
+def_fn_8x8_base
+def_fn_8x8_base identity_
+
+.macro def_fn_8x8 txfm1, txfm2 // emits the exported 8x8 entry point for first pass \txfm1 and second pass \txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         8,   8,   1 // only emitted for dct_dct; presumably a DC-only shortcut (macro defined elsewhere)
+.endif
+        push            {r4-r5,r7,lr} // restored by the pop at the end of the shared body
+        vpush           {q4-q7}
+        movrel_local    r5,  inv_\txfm2\()_8h_x8_neon // r5 = second pass function
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_8x8_neon // the identity first pass needs no r4
+.else
+        movrel_local    r4,  inv_\txfm1\()_8h_x8_neon // r4 = first pass function
+        b               inv_txfm_add_8x8_neon // tail branch; the shared body returns to our caller
+.endif
+endfunc
+.endm
+
+def_fn_8x8 dct, dct
+def_fn_8x8 identity, identity
+def_fn_8x8 dct, adst
+def_fn_8x8 dct, flipadst
+def_fn_8x8 dct, identity
+def_fn_8x8 adst, dct
+def_fn_8x8 adst, adst
+def_fn_8x8 adst, flipadst
+def_fn_8x8 flipadst, dct
+def_fn_8x8 flipadst, adst
+def_fn_8x8 flipadst, flipadst
+def_fn_8x8 identity, dct
+def_fn_8x8 adst, identity
+def_fn_8x8 flipadst, identity
+def_fn_8x8 identity, adst
+def_fn_8x8 identity, flipadst
+
+function inv_txfm_add_8x4_neon // shared 8x4 body; in: r2 = coefficients, r4/r5 = first/second pass functions
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+        movw            r12, #2896*8 // scale factor handed to scale_input; 2896/4096 is roughly 1/sqrt(2)
+        vdup.16         d0,  r12
+        vld1.16         {d16, d17, d18, d19}, [r2, :128] // load the 32 coefficients into d16-d23 ...
+        vst1.16         {q14, q15}, [r2, :128]! // ... and clear the coefficient buffer
+        vld1.16         {d20, d21, d22, d23}, [r2, :128]
+        vst1.16         {q14, q15}, [r2, :128]
+
+        scale_input     d0[0], q8,  q9, q10, q11
+
+        blx             r4 // first pass
+
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        vswp            d17, d20 // reorder the d registers into the layout the second pass reads
+        vswp            d19, d21
+        vswp            d18, d20
+        vswp            d21, d22
+
+        blx             r5 // second pass
+
+        load_add_store_8x4 r0, r7
+        vpop            {q4-q7} // undo the vpush/push done by the def_fn_48 entry points
+        pop             {r4-r5,r7,pc}
+endfunc
+
+function inv_txfm_add_4x8_neon // shared 4x8 body; in: r2 = coefficients, r4/r5 = first/second pass functions
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+        movw            r12, #2896*8 // scale factor handed to scale_input; 2896/4096 is roughly 1/sqrt(2)
+        vdup.16         d0,  r12
+        vld1.16         {q8,  q9},  [r2, :128] // load the 32 coefficients into q8-q11 ...
+        vst1.16         {q14, q15}, [r2, :128]! // ... and clear the coefficient buffer
+        vld1.16         {q10, q11}, [r2, :128]
+        vst1.16         {q14, q15}, [r2, :128]
+
+        scale_input     d0[0], q8,  q9, q10, q11
+
+        blx             r4 // first pass
+
+        transpose_4x8h  q8,  q9,  q10, q11
+        vswp            d17, d20 // reorder the d registers into the layout the second pass reads
+        vswp            d19, d21
+        vswp            d17, d18
+        vswp            d19, d22
+
+        blx             r5 // second pass
+
+        load_add_store_4x8 r0, r7
+        vpop            {q4-q7} // undo the vpush/push done by the def_fn_48 entry points
+        pop             {r4-r5,r7,pc}
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2 // emits the exported \w x \h entry point (4x8 or 8x4) for \txfm1 then \txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  0 // only emitted for dct_dct; presumably a DC-only shortcut (macro defined elsewhere)
+.endif
+        push            {r4-r5,r7,lr} // restored by the pop at the end of the shared body
+        vpush           {q4-q7}
+        movrel_local    r4,  inv_\txfm1\()_\h\()h_x\w\()_neon // first pass: \w-point transform on \h lanes
+        movrel_local    r5,  inv_\txfm2\()_\w\()h_x\h\()_neon // second pass: \h-point transform on \w lanes
+        b               inv_txfm_add_\w\()x\h\()_neon // tail branch; the shared body returns to our caller
+endfunc
+.endm
+
+.macro def_fns_48 w, h // instantiates def_fn_48 for all 16 first/second pass combinations of one block size
+def_fn_48 \w, \h, dct, dct
+def_fn_48 \w, \h, identity, identity
+def_fn_48 \w, \h, dct, adst
+def_fn_48 \w, \h, dct, flipadst
+def_fn_48 \w, \h, dct, identity
+def_fn_48 \w, \h, adst, dct
+def_fn_48 \w, \h, adst, adst
+def_fn_48 \w, \h, adst, flipadst
+def_fn_48 \w, \h, flipadst, dct
+def_fn_48 \w, \h, flipadst, adst
+def_fn_48 \w, \h, flipadst, flipadst
+def_fn_48 \w, \h, identity, dct
+def_fn_48 \w, \h, adst, identity
+def_fn_48 \w, \h, flipadst, identity
+def_fn_48 \w, \h, identity, adst
+def_fn_48 \w, \h, identity, flipadst
+.endm
+
+def_fns_48 4, 8
+def_fns_48 8, 4
+
+function inv_dct_4h_x16_neon, export=1 // 16-point inverse DCT on 4 lanes, in place on d16-d31 (out0 -> d16 ... out15 -> d31)
+        movrel_local    r12, idct_coeffs
+        vld1.16         {q0, q1}, [r12, :128] // d0-d1 are used by the later stages, d2-d3 by the first odd stage
+        // Odd half, stage 1: rotate the pairs d17/d31, d25/d23, d21/d27, d29/d19. Scratch: r12, q0-q4.
+        vmull_vmlsl     q2,  d17, d31, d2[0], d2[1]  // -> t8a
+        vmull_vmlal     q3,  d17, d31, d2[1], d2[0]  // -> t15a
+        vmull_vmlsl     q4,  d25, d23, d2[2], d2[3]  // -> t9a
+        vrshrn.i32      d17, q2,  #12                // t8a
+        vrshrn.i32      d31, q3,  #12                // t15a
+        vmull_vmlal     q2,  d25, d23, d2[3], d2[2]  // -> t14a
+        vmull_vmlsl     q3,  d21, d27, d3[0], d3[1]  // -> t10a
+        vrshrn.i32      d23, q4,  #12                // t9a
+        vrshrn.i32      d25, q2,  #12                // t14a
+        vmull_vmlal     q4,  d21, d27, d3[1], d3[0]  // -> t13a
+        vmull_vmlsl     q2,  d29, d19, d3[2], d3[3]  // -> t11a
+        vrshrn.i32      d21, q3,  #12                // t10a
+        vrshrn.i32      d27, q4,  #12                // t13a
+        vmull_vmlal     q3,  d29, d19, d3[3], d3[2]  // -> t12a
+        vrshrn.i32      d19, q2,  #12                // t11a
+        vrshrn.i32      d29, q3,  #12                // t12a
+        // Even half: the even-numbered d registers go through the 8-point DCT macro.
+        idct_4h_x8      d16, d18, d20, d22, d24, d26, d28, d30
+        // Odd half, stage 2: saturating butterflies.
+        vqsub.s16       d4,  d17, d23  // t9
+        vqadd.s16       d17, d17, d23  // t8
+        vqsub.s16       d5,  d31, d25  // t14
+        vqadd.s16       d31, d31, d25  // t15
+        vqsub.s16       d23, d19, d21  // t10
+        vqadd.s16       d19, d19, d21  // t11
+        vqadd.s16       d25, d29, d27  // t12
+        vqsub.s16       d29, d29, d27  // t13
+        // Odd half, stage 3: rotations with d0[2]/d0[3]; the t10a product is negated before narrowing.
+        vmull_vmlsl     q3,  d5,  d4,  d0[2], d0[3]  // -> t9a
+        vmull_vmlal     q4,  d5,  d4,  d0[3], d0[2]  // -> t14a
+        vrshrn.i32      d21, q3,  #12                // t9a
+        vrshrn.i32      d27, q4,  #12                // t14a
+
+        vmull_vmlsl     q3,  d29, d23, d0[2], d0[3]  // -> t13a
+        vmull_vmlal     q4,  d29, d23, d0[3], d0[2]  // -> t10a
+        vrshrn.i32      d29, q3,  #12                // t13a
+        vneg.s32        q4,  q4
+        vrshrn.i32      d23, q4,  #12                // t10a
+        // Odd half, stage 4: saturating butterflies.
+        vqsub.s16       d4,  d17, d19  // t11a
+        vqadd.s16       d17, d17, d19  // t8a
+        vqsub.s16       d5,  d31, d25  // t12a
+        vqadd.s16       d31, d31, d25  // t15a
+        vqadd.s16       d19, d21, d23  // t9
+        vqsub.s16       d21, d21, d23  // t10
+        vqsub.s16       d25, d27, d29  // t13
+        vqadd.s16       d27, d27, d29  // t14
+        // Odd half, stage 5: sums/differences scaled by d0[0].
+        vmull_vmlsl     q3,  d5,  d4,  d0[0], d0[0]  // -> t11
+        vmull_vmlal     q4,  d5,  d4,  d0[0], d0[0]  // -> t12
+        vmull_vmlsl     q2,  d25, d21, d0[0], d0[0]  // -> t10a
+
+        vrshrn.i32      d6,  q3,  #12  // t11
+        vrshrn.i32      d7,  q4,  #12  // t12
+        vmull_vmlal     q4,  d25, d21, d0[0], d0[0]  // -> t13a
+        vrshrn.i32      d4,  q2,  #12  // t10a
+        vrshrn.i32      d5,  q4,  #12  // t13a
+        // Final stage: combine even and odd halves; d8, d9 and d5 hold results whose target registers are still live.
+        vqadd.s16       d8,  d16, d31  // out0
+        vqsub.s16       d31, d16, d31  // out15
+        vmov            d16, d8
+        vqadd.s16       d23, d30, d17  // out7
+        vqsub.s16       d9,  d30, d17  // out8
+        vqadd.s16       d17, d18, d27  // out1
+        vqsub.s16       d30, d18, d27  // out14
+        vqadd.s16       d18, d20, d5   // out2
+        vqsub.s16       d29, d20, d5   // out13
+        vqadd.s16       d5,  d28, d19  // out6
+        vqsub.s16       d25, d28, d19  // out9
+        vqadd.s16       d19, d22, d7   // out3
+        vqsub.s16       d28, d22, d7   // out12
+        vqadd.s16       d20, d24, d6   // out4
+        vqsub.s16       d27, d24, d6   // out11
+        vqadd.s16       d21, d26, d4   // out5
+        vqsub.s16       d26, d26, d4   // out10
+        vmov            d24, d9 // out8 into its final register
+        vmov            d22, d5 // out6 into its final register
+
+        bx              lr
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 // 16-point inverse ADST on 4 lanes: reads d16-d31, writes out0..out15 to \o0..\o15
+        movrel_local    r12, iadst16_coeffs
+        vld1.16         {q0, q1}, [r12, :128] // first-stage constants
+        movrel_local    r12, idct_coeffs // address kept in r12 until the reload of q0 after stage 1
+        // Stage 1: rotate the input pairs d31/d16, d29/d18, ..., d17/d30. Scratch: r12, q0-q4.
+        vmull_vmlal     q2,  d31, d16, d0[0], d0[1] // -> t0
+        vmull_vmlsl     q3,  d31, d16, d0[1], d0[0] // -> t1
+        vmull_vmlal     q4,  d29, d18, d0[2], d0[3] // -> t2
+        vrshrn.i32      d16, q2,  #12               // t0
+        vrshrn.i32      d31, q3,  #12               // t1
+        vmull_vmlsl     q2,  d29, d18, d0[3], d0[2] // -> t3
+        vmull_vmlal     q3,  d27, d20, d1[0], d1[1] // -> t4
+        vrshrn.i32      d18, q4,  #12               // t2
+        vrshrn.i32      d29, q2,  #12               // t3
+        vmull_vmlsl     q4,  d27, d20, d1[1], d1[0] // -> t5
+        vmull_vmlal     q2,  d25, d22, d1[2], d1[3] // -> t6
+        vrshrn.i32      d20, q3,  #12               // t4
+        vrshrn.i32      d27, q4,  #12               // t5
+        vmull_vmlsl     q3,  d25, d22, d1[3], d1[2] // -> t7
+        vmull_vmlal     q4,  d23, d24, d2[0], d2[1] // -> t8
+        vrshrn.i32      d22, q2,  #12               // t6
+        vrshrn.i32      d25, q3,  #12               // t7
+        vmull_vmlsl     q2,  d23, d24, d2[1], d2[0] // -> t9
+        vmull_vmlal     q3,  d21, d26, d2[2], d2[3] // -> t10
+        vrshrn.i32      d23, q4,  #12               // t8
+        vrshrn.i32      d24, q2,  #12               // t9
+        vmull_vmlsl     q4,  d21, d26, d2[3], d2[2] // -> t11
+        vmull_vmlal     q2,  d19, d28, d3[0], d3[1] // -> t12
+        vrshrn.i32      d21, q3,  #12               // t10
+        vrshrn.i32      d26, q4,  #12               // t11
+        vmull_vmlsl     q3,  d19, d28, d3[1], d3[0] // -> t13
+        vmull_vmlal     q4,  d17, d30, d3[2], d3[3] // -> t14
+        vrshrn.i32      d19, q2,  #12               // t12
+        vrshrn.i32      d28, q3,  #12               // t13
+        vmull_vmlsl     q2,  d17, d30, d3[3], d3[2] // -> t15
+        vrshrn.i32      d17, q4,  #12               // t14
+        vrshrn.i32      d30, q2,  #12               // t15
+        // The iadst16 constants are no longer needed; reload q0 from idct_coeffs for the remaining stages.
+        vld1.16         {q0}, [r12, :128]
+        // Stage 2: saturating butterflies; d2/d3 (q1) are free to serve as temporaries from here on.
+        vqsub.s16       d2,  d16, d23 // t8a
+        vqadd.s16       d16, d16, d23 // t0a
+        vqsub.s16       d3,  d31, d24 // t9a
+        vqadd.s16       d31, d31, d24 // t1a
+        vqadd.s16       d23, d18, d21 // t2a
+        vqsub.s16       d18, d18, d21 // t10a
+        vqadd.s16       d24, d29, d26 // t3a
+        vqsub.s16       d29, d29, d26 // t11a
+        vqadd.s16       d21, d20, d19 // t4a
+        vqsub.s16       d20, d20, d19 // t12a
+        vqadd.s16       d26, d27, d28 // t5a
+        vqsub.s16       d27, d27, d28 // t13a
+        vqadd.s16       d19, d22, d17 // t6a
+        vqsub.s16       d22, d22, d17 // t14a
+        vqadd.s16       d28, d25, d30 // t7a
+        vqsub.s16       d25, d25, d30 // t15a
+        // Stage 3: rotate t8a..t15a with the d1 constants.
+        vmull_vmlal     q2,  d2,  d3,  d1[1], d1[0] // -> t8
+        vmull_vmlsl     q3,  d2,  d3,  d1[0], d1[1] // -> t9
+        vmull_vmlal     q4,  d18, d29, d1[3], d1[2] // -> t10
+        vrshrn.i32      d17, q2,  #12               // t8
+        vrshrn.i32      d30, q3,  #12               // t9
+        vmull_vmlsl     q2,  d18, d29, d1[2], d1[3] // -> t11
+        vmull_vmlsl     q3,  d27, d20, d1[1], d1[0] // -> t12
+        vrshrn.i32      d18, q4,  #12               // t10
+        vrshrn.i32      d29, q2,  #12               // t11
+        vmull_vmlal     q4,  d27, d20, d1[0], d1[1] // -> t13
+        vmull_vmlsl     q2,  d25, d22, d1[3], d1[2] // -> t14
+        vrshrn.i32      d27, q3,  #12               // t12
+        vrshrn.i32      d20, q4,  #12               // t13
+        vmull_vmlal     q3,  d25, d22, d1[2], d1[3] // -> t15
+        vrshrn.i32      d25, q2,  #12               // t14
+        vrshrn.i32      d22, q3,  #12               // t15
+        // Stage 4: saturating butterflies.
+        vqsub.s16       d2,  d16, d21 // t4
+        vqadd.s16       d16, d16, d21 // t0
+        vqsub.s16       d3,  d31, d26 // t5
+        vqadd.s16       d31, d31, d26 // t1
+        vqadd.s16       d21, d23, d19 // t2
+        vqsub.s16       d23, d23, d19 // t6
+        vqadd.s16       d26, d24, d28 // t3
+        vqsub.s16       d24, d24, d28 // t7
+        vqadd.s16       d19, d17, d27 // t8a
+        vqsub.s16       d17, d17, d27 // t12a
+        vqadd.s16       d28, d30, d20 // t9a
+        vqsub.s16       d30, d30, d20 // t13a
+        vqadd.s16       d27, d18, d25 // t10a
+        vqsub.s16       d18, d18, d25 // t14a
+        vqadd.s16       d20, d29, d22 // t11a
+        vqsub.s16       d29, d29, d22 // t15a
+        // Stage 5: rotate with the d0[2]/d0[3] constants.
+        vmull_vmlal     q2,  d2,  d3,  d0[3], d0[2] // -> t4a
+        vmull_vmlsl     q3,  d2,  d3,  d0[2], d0[3] // -> t5a
+        vmull_vmlsl     q4,  d24, d23, d0[3], d0[2] // -> t6a
+        vrshrn.i32      d22, q2,  #12               // t4a
+        vrshrn.i32      d25, q3,  #12               // t5a
+        vmull_vmlal     q2,  d24, d23, d0[2], d0[3] // -> t7a
+        vmull_vmlal     q3,  d17, d30, d0[3], d0[2] // -> t12
+        vrshrn.i32      d24, q4,  #12               // t6a
+        vrshrn.i32      d23, q2,  #12               // t7a
+        vmull_vmlsl     q4,  d17, d30, d0[2], d0[3] // -> t13
+        vmull_vmlsl     q2,  d29, d18, d0[3], d0[2] // -> t14
+        vrshrn.i32      d17, q3,  #12               // t12
+        vmull_vmlal     q3,  d29, d18, d0[2], d0[3] // -> t15
+        vrshrn.i32      d29, q4,  #12               // t13
+        vrshrn.i32      d30, q2,  #12               // t14
+        vrshrn.i32      d18, q3,  #12               // t15
+        // Stage 6: final butterflies. When \o0 is not d16, out0 is parked in d4 until the following reads of d31/d26 are done.
+        vqsub.s16       d2,  d16, d21 // t2a
+.ifc \o0, d16
+        vqadd.s16       \o0, d16, d21 // out0
+        vqsub.s16       d21, d31, d26 // t3a
+        vqadd.s16       \o15,d31, d26 // out15
+.else
+        vqadd.s16       d4,  d16, d21 // out0
+        vqsub.s16       d21, d31, d26 // t3a
+        vqadd.s16       \o15,d31, d26 // out15
+        vmov            \o0, d4
+.endif
+        vqneg.s16       \o15, \o15    // out15
+
+        vqsub.s16       d3,  d29, d18 // t15a
+        vqadd.s16       \o13,d29, d18 // out13
+        vqadd.s16       \o2, d17, d30 // out2
+        vqsub.s16       d26, d17, d30 // t14a
+        vqneg.s16       \o13,\o13     // out13
+
+        vqadd.s16       \o1, d19, d27 // out1
+        vqsub.s16       d27, d19, d27 // t10
+        vqadd.s16       \o14,d28, d20 // out14
+        vqsub.s16       d20, d28, d20 // t11
+        vqneg.s16       \o1, \o1      // out1
+
+        vqadd.s16       \o3, d22, d24 // out3
+        vqsub.s16       d22, d22, d24 // t6
+        vqadd.s16       \o12,d25, d23 // out12
+        vqsub.s16       d23, d25, d23 // t7
+        vqneg.s16       \o3, \o3      // out3
+        // Middle outputs out4..out11: sums/differences scaled by d0[0].
+        vmull_vmlsl     q12, d2,  d21, d0[0], d0[0] // -> out8 (d24 or d23)
+        vmull_vmlal     q2,  d2,  d21, d0[0], d0[0] // -> out7 (d23 or d24)
+        vmull_vmlal     q3,  d26, d3,  d0[0], d0[0] // -> out5 (d21 or d26)
+
+        vrshrn.i32      d24, q12, #12 // out8
+        vrshrn.i32      d4,  q2,  #12 // out7
+        vrshrn.i32      d5,  q3,  #12 // out5
+        vmull_vmlsl     q4,  d26, d3,  d0[0], d0[0] // -> out10 (d26 or d21)
+        vmull_vmlal     q1,  d22, d23, d0[0], d0[0] // -> out4 (d20 or d27)
+        vrshrn.i32      d26, q4,  #12 // out10
+
+        vmull_vmlsl     q4,  d22, d23, d0[0], d0[0] // -> out11 (d27 or d20)
+        vmull_vmlal     q11, d27, d20, d0[0], d0[0] // -> out6 (d22 or d25)
+        vmull_vmlsl     q3,  d27, d20, d0[0], d0[0] // -> out9 (d25 or d22)
+
+        vrshrn.i32      \o4, q1,  #12 // out4
+        vrshrn.i32      d7,  q3,  #12 // out9
+        vrshrn.i32      d6,  q4,  #12 // out11
+        vrshrn.i32      \o6, q11, #12 // out6
+        // out8/out10 were narrowed into d24/d26; move them when the caller asked for the flipped register order.
+.ifc \o8, d23
+        vmov            \o8, d24
+        vmov            \o10,d26
+.endif
+        // The remaining odd outputs are negated while being moved into place.
+        vqneg.s16       \o7, d4  // out7
+        vqneg.s16       \o5, d5  // out5
+        vqneg.s16       \o11,d6  // out11
+        vqneg.s16       \o9, d7  // out9
+.endm
+
+function inv_adst_4h_x16_neon, export=1 // 16-point inverse ADST on 4 lanes, in place on d16-d31 (out0 -> d16 ... out15 -> d31)
+        iadst_16        d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        bx              lr // the macro uses q4 as scratch, so callers are expected to have saved it
+endfunc
+
+function inv_flipadst_4h_x16_neon, export=1 // 16-point inverse ADST on 4 lanes with flipped output order (out0 -> d31 ... out15 -> d16)
+        iadst_16        d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+        bx              lr // return to the caller
+endfunc
+
+function inv_identity_4h_x16_neon, export=1 // 16-point identity transform on 4 lanes: scales q8-q15 in place by 2*5793/4096
+        movw            r12, #2*(5793-4096)*8 // fractional part of the scale, prescaled for vqrdmulh (which divides by 2^15)
+        vdup.16         d0,  r12
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vqrdmulh.s16    q1,  \i,  d0[0] // q1 = x * 2*(5793-4096)/4096, rounded
+        vqadd.s16       \i,  \i,  \i // x = 2*x, saturating
+        vqadd.s16       \i,  \i,  q1 // x = 2*x + q1, saturating
+.endr
+        bx              lr // return to the caller
+endfunc
+
+.macro identity_4x16_shift2 c // identity scaling of q8-q15 with a 2-bit downshift folded in; \c is the vqrdmulh constant, q2 is scratch
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vqrdmulh.s16    q2,  \i,  \c // q2 = x * \c / 2^15, rounded
+        vshr.s16        q2,  q2,  #1 // halve the fractional term (truncating)
+        vrhadd.s16      \i,  \i,  q2 // x = (x + q2 + 1) >> 1
+.endr
+.endm
+
+.macro identity_4x16_shift1 c // identity scaling of q8-q15 with a 1-bit downshift folded in; \c is the vqrdmulh constant, q2 is scratch
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vqrdmulh.s16    q2,  \i,  \c // q2 = x * \c / 2^15, rounded
+        vrshr.s16       q2,  q2,  #1 // halve the fractional term (rounding)
+        vqadd.s16       \i,  \i,  q2 // x = x + q2, saturating
+.endr
+.endm
+
+.macro identity_8x8_shift1 c // alias: the same eight q registers are processed, so this simply forwards \c
+        identity_4x16_shift1 \c
+.endm
+
+.macro identity_8x8 c // identity scaling of q8-q15 without any downshift; \c is the vqrdmulh constant, q2 is scratch
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vqrdmulh.s16    q2,  \i,  \c // q2 = x * \c / 2^15, rounded
+        vqadd.s16       \i,  \i,  \i // x = 2*x, saturating
+        vqadd.s16       \i,  \i,  q2 // x = 2*x + q2, saturating
+.endr
+.endm
+
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix // emits one first-pass helper for a 16x4 slice; a negative \shift selects the identity macro that folds the shift in
+function inv_txfm_horz\suffix\()_16x4_neon // in: r7 = coefficient pointer, r8 = its stride in bytes, r6 = output pointer, r4 = transform (non-identity only)
+        push            {lr}
+        vmov.i16        d7,  #0 // zero used to clear the coefficients as they are read
+.if \identity
+        movw            r12, #2*(5793-4096)*8 // constant for the identity_4x16_shift* macros
+        vdup.16         d0,  r12
+.endif
+.if \scale
+        movw            r12, #2896*8 // scale factor handed to scale_input; 2896/4096 is roughly 1/sqrt(2)
+        vdup.16         d1,  r12
+.endif
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64] // load four coefficients ...
+        vst1.16         {d7}, [r7, :64], r8 // ... clear them and step r7 by the stride
+.endr
+.if \scale
+        scale_input     d1[0], q8,  q9, q10, q11, q12, q13, q14, q15
+.endif
+.if \identity
+.if \shift == -2
+        identity_4x16_shift2 d0[0]
+.else
+        identity_4x16_shift1 d0[0]
+.endif
+.else
+        blx             r4 // 16-point transform on d16-d31
+.endif
+.if \shift > 0
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vrshr.s16       \i,  \i,  #\shift // rounding intermediate downshift
+.endr
+.endif
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q12, q13, d24, d25, d26, d27
+        transpose_4x4h  q14, q15, d28, d29, d30, d31
+        // Store the four transposed 4x4 tiles interleaved, 8 bytes at a time, to consecutive addresses at r6.
+.irp i, d16, d20, d24, d28, d17, d21, d25, d29, d18, d22, d26, d30, d19, d23, d27, d31
+        vst1.16         {\i}, [r6, :64]!
+.endr
+
+        pop             {pc}
+endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=-2, suffix=_identity
+def_horz_16 scale=1, identity=1, shift=-1, suffix=_scale_identity
+
+function inv_txfm_add_vert_4x16_neon // second pass for a 4-wide, 16-tall strip; in: r7 = intermediate data, r8 = its stride, r5 = transform, r6 = destination
+        push            {lr}
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64], r8 // one row of four values per d register
+.endr
+        blx             r5 // 16-point transform on d16-d31
+        load_add_store_4x16 r6, r7
+        pop             {pc}
+endfunc
+
+.macro sub_sp_align space // aligns sp down to 16 bytes, saves the adjustment on the stack, then reserves \space bytes; clobbers r7 (and r12 on Windows)
+#if CONFIG_THUMB
+        mov             r7,  sp
+        and             r7,  r7,  #15
+#else
+        and             r7,  sp,  #15
+#endif
+        sub             sp,  sp,  r7
+        // Now the stack is aligned, store the amount of adjustment back
+        // on the stack, as we don't want to waste a register as frame
+        // pointer.
+        str             r7,  [sp, #-16]! // a full 16-byte slot is used so that sp stays aligned
+#ifdef _WIN32
+.if \space > 8192
+        // Here, we'd need to touch two (or more) pages while decrementing
+        // the stack pointer.
+        .error          "sub_sp_align doesn't support values over 8K at the moment"
+.elseif \space > 4096
+        sub             r7,  sp,  #4096
+        ldr             r12, [r7] // read one word 4096 bytes down before moving sp further; presumably a stack probe
+        sub             r7,  r7,  #(\space - 4096)
+        mov             sp,  r7
+.else
+        sub             sp,  sp,  #\space
+.endif
+#else
+.if \space >= 4096
+        sub             sp,  sp,  #(\space)/4096*4096 // split into a multiple of 4096 plus a remainder below
+.endif
+.if (\space % 4096) != 0
+        sub             sp,  sp,  #(\space)%4096
+.endif
+#endif
+.endm
+
+.macro add_sp_align space // inverse of sub_sp_align: releases \space bytes and undoes the alignment adjustment; clobbers r7
+.if \space >= 4096
+        add             sp,  sp,  #(\space)/4096*4096 // split into a multiple of 4096 plus a remainder below
+.endif
+.if (\space % 4096) != 0
+        add             sp,  sp,  #(\space)%4096
+.endif
+        ldr             r7,  [sp], #16 // reload the adjustment saved by sub_sp_align and drop its 16-byte slot
+        // Add back the original stack adjustment
+        add             sp,  sp,  r7
+.endm
+
+function inv_txfm_add_16x16_neon // shared 16x16 body; in: r0 = destination, r2 = coefficients, r3 = value compared against the thresholds at r10, r9 = first pass helper
+        sub_sp_align    512 // 16x16 16-bit intermediate buffer
+        ldrh            r11, [r10], #2 // first threshold from the table at r10
+.irp i, 0, 4, 8, 12
+        add             r6,  sp,  #(\i*16*2) // where this slice writes its 4 lines of 16 values
+.if \i > 0
+        mov             r8,  #(16 - \i) // lines left to zero-fill if we bail out below
+        cmp             r3,  r11
+        blt             1f // r3 is below the threshold: skip this and the remaining slices
+.if \i < 12
+        ldrh            r11, [r10], #2 // next threshold
+.endif
+.endif
+        add             r7,  r2,  #(\i*2) // coefficient pointer for this slice
+        mov             r8,  #16*2 // coefficient stride in bytes
+        blx             r9
+.endr
+        b               3f
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #4 // each iteration clears 4 lines of 32 bytes
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+3:
+.irp i, 0, 4, 8, 12
+        add             r6,  r0,  #(\i) // destination offset by \i bytes
+        add             r7,  sp,  #(\i*2) // matching offset into the intermediate buffer
+        mov             r8,  #32 // intermediate buffer stride in bytes
+        bl              inv_txfm_add_vert_4x16_neon
+.endr
+
+        add_sp_align    512
+        vpop            {q4} // undo the vpush/push done by the def_fn_16x16 entry points
+        pop             {r4-r11,pc}
+endfunc
+
+const eob_16x16 // thresholds read one at a time by inv_txfm_add_16x16_neon via r10 (three of the four are consumed there)
+        .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity // threshold table selected by def_fn_16x16 when exactly one of the two passes is identity
+        .short 4, 8, 12, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2 // emits the exported 16x16 entry point for first pass \txfm1 and second pass \txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         16,  16,  2 // only emitted for dct_dct; presumably a DC-only shortcut (macro defined elsewhere)
+.endif
+        push            {r4-r11,lr} // restored by the pop at the end of inv_txfm_add_16x16_neon
+        vpush           {q4}
+.ifc \txfm1, identity
+        movrel_local    r9,  inv_txfm_horz_identity_16x4_neon // identity first pass needs no r4
+.else
+        movrel_local    r9,  inv_txfm_horz_16x4_neon // r9 = first pass helper, which calls r4
+        movrel_local    r4,  inv_\txfm1\()_4h_x16_neon
+.endif
+        movrel_local    r5,  inv_\txfm2\()_4h_x16_neon // r5 = second pass function
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel_local    r10, eob_16x16 // both passes identity: regular table
+.else
+        movrel_local    r10, eob_16x16_identity
+.endif
+.else
+.ifc \txfm2, identity
+        movrel_local    r10, eob_16x16_identity
+.else
+        movrel_local    r10, eob_16x16 // neither pass identity: regular table
+.endif
+.endif
+        b               inv_txfm_add_16x16_neon // tail branch; the shared body returns to our caller
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+.macro def_fn_416_base variant
+function inv_txfm_\variant\()add_16x4_neon
+
+.ifc \variant, identity_
+        vmov.i16        d4,  #0
+.irp i, d16, d18, d20, d22
+        vld1.16         {\i}, [r2, :64]
+        vst1.16         {d4}, [r2, :64]!
+.endr
+.irp i, d17, d19, d21, d23
+        vld1.16         {\i}, [r2, :64]
+        vst1.16         {d4}, [r2, :64]!
+.endr
+        movw            r12, #2*(5793-4096)*8
+        vdup.16         d0,  r12
+.irp i, d24, d26, d28, d30
+        vld1.16         {\i}, [r2, :64]
+        vst1.16         {d4}, [r2, :64]!
+.endr
+.irp i, d25, d27, d29, d31
+        vld1.16         {\i}, [r2, :64]
+        vst1.16         {d4}, [r2, :64]!
+.endr
+
+        identity_4x16_shift1 d0[0]
+.else
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+        vld1.16         {d16, d17, d18, d19}, [r2, :128]
+        vst1.16         {q2,  q3}, [r2, :128]!
+        vld1.16         {d20, d21, d22, d23}, [r2, :128]
+        vst1.16         {q2,  q3}, [r2, :128]!
+        vld1.16         {d24, d25, d26, d27}, [r2, :128]
+        vst1.16         {q2,  q3}, [r2, :128]!
+        vld1.16         {d28, d29, d30, d31}, [r2, :128]
+        vst1.16         {q2,  q3}, [r2, :128]!
+
+        blx             r4
+
+        vswp            d17, d20
+        vswp            d19, d22
+        vswp            d18, d20
+        vswp            d19, d21
+.irp i, q8, q9, q10, q11
+        vrshr.s16       \i,  \i,  #1
+.endr
+.endif
+        transpose_4x8h  q8,  q9,  q10, q11
+        blx             r5
+        mov             r6,  r0
+        load_add_store_8x4 r6, r7
+
+.ifc \variant, identity_
+        vmov            q8,  q12
+        vmov            q9,  q13
+        vmov            q10, q14
+        vmov            q11, q15
+.else
+        vswp            d25, d28
+        vswp            d27, d30
+        vswp            d26, d28
+        vswp            d27, d29
+        vrshr.s16       q8,  q12, #1
+        vrshr.s16       q9,  q13, #1
+        vrshr.s16       q10, q14, #1
+        vrshr.s16       q11, q15, #1
+.endif
+        transpose_4x8h  q8,  q9,  q10, q11
+        blx             r5
+        add             r6,  r0,  #8
+        load_add_store_8x4 r6, r7
+
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_4x16_neon // Shared 4x16 body; the def_fn_416 entry points branch here with r4/r5 = first/second pass transforms and r10 = threshold
+        vmov.i16        q2,  #0 // zero vector written back over each coefficient row once loaded
+
+        mov             r11, #32 // byte step between the rows loaded from the coefficient buffer
+        cmp             r3,  r10
+        blt             1f // r3 below the threshold in r10: skip loading the second half
+
+        add             r6,  r2,  #16 // second half of the input starts 16 bytes into the buffer at r2
+.ifc \variant, identity_
+.irp i, q12, q13, q14, q15
+        vld1.16         {\i}, [r6, :128]
+        vst1.16         {q2}, [r6, :128], r11
+.endr
+        movw            r12, #(5793-4096)*8 // scale constant consumed by identity_8x4_shift1 via d0[0]
+        vdup.16         d0,  r12
+        identity_8x4_shift1 q12, q13, q14, q15, d0[0]
+.else
+.irp i, q8,  q9,  q10, q11
+        vld1.16         {\i}, [r6, :128]
+        vst1.16         {q2}, [r6, :128], r11
+.endr
+        blx             r4 // first-pass transform on q8-q11
+        vrshr.s16       q12, q8,  #1 // rounding shift right by 1, moving the results into q12-q15
+        vrshr.s16       q13, q9,  #1
+        vrshr.s16       q14, q10, #1
+        vrshr.s16       q15, q11, #1
+.endif
+        transpose_4x8h  q12, q13, q14, q15
+        vswp            d27, d29 // NOTE(review): these swaps reorder the d registers after the transpose, presumably into the layout the r5 callee expects - confirm
+        vswp            d26, d28
+        vswp            d27, d30
+        vswp            d25, d28
+
+        b               2f
+1:
+.irp i, q12, q13, q14, q15
+        vmov.i16        \i,  #0
+.endr
+2:
+        vmov.i16        q2,  #0 // set up the zero vector again (presumably the call through r4 may clobber q2)
+.irp i, q8,  q9,  q10, q11
+        vld1.16         {\i}, [r2, :128]
+        vst1.16         {q2}, [r2, :128], r11
+.endr
+.ifc \variant, identity_
+        movw            r12, #(5793-4096)*8
+        vdup.16         d0,  r12
+        identity_8x4_shift1 q8,  q9,  q10, q11, d0[0]
+.else
+        blx             r4 // first-pass transform on the first half
+.irp i, q8, q9, q10, q11
+        vrshr.s16       \i,  \i,  #1
+.endr
+.endif
+        transpose_4x8h  q8,  q9,  q10, q11
+        vswp            d19, d21 // same d register reordering as applied to q12-q15 above
+        vswp            d18, d20
+        vswp            d19, d22
+        vswp            d17, d20
+
+        blx             r5 // second-pass transform
+
+        load_add_store_4x16 r0, r6 // macro defined elsewhere; per its name, adds the result to the pixels at r0
+
+        vpop            {q4-q7} // undo the vpush/push done by the entry point and return to its caller
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+def_fn_416_base // instantiate the shared bodies with an empty variant prefix
+def_fn_416_base identity_ // instantiate them again with the identity_ prefix
+
+.macro def_fn_416 w, h, txfm1, txfm2, eob_half // Emits one exported 8bpc entry point for the given size and transform pair
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1 // only emitted for dct_dct; macro defined elsewhere (presumably a DC-only shortcut)
+.endif
+        push            {r4-r11,lr} // restored by the shared body that is branched to below
+        vpush           {q4-q7}
+.if \w == 4
+        movrel_local    r4,  inv_\txfm1\()_8h_x\w\()_neon // r4: first-pass transform
+        movrel_local    r5,  inv_\txfm2\()_4h_x\h\()_neon // r5: second-pass transform
+        mov             r10, #\eob_half // threshold compared against r3 in the shared body (only set when w == 4)
+.else
+        movrel_local    r4,  inv_\txfm1\()_4h_x\w\()_neon
+        movrel_local    r5,  inv_\txfm2\()_8h_x\h\()_neon
+.endif
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_\w\()x\h\()_neon // identity first pass uses the identity_ variant of the shared body
+.else
+        b               inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_416 w, h // Instantiates def_fn_416 for all 16 transform pairs; the last argument is eob_half
+def_fn_416 \w, \h, dct, dct, 29
+def_fn_416 \w, \h, identity, identity, 29
+def_fn_416 \w, \h, dct, adst, 29
+def_fn_416 \w, \h, dct, flipadst, 29
+def_fn_416 \w, \h, dct, identity, 8
+def_fn_416 \w, \h, adst, dct, 29
+def_fn_416 \w, \h, adst, adst, 29
+def_fn_416 \w, \h, adst, flipadst, 29
+def_fn_416 \w, \h, flipadst, dct, 29
+def_fn_416 \w, \h, flipadst, adst, 29
+def_fn_416 \w, \h, flipadst, flipadst, 29
+def_fn_416 \w, \h, identity, dct, 32
+def_fn_416 \w, \h, adst, identity, 8
+def_fn_416 \w, \h, flipadst, identity, 8
+def_fn_416 \w, \h, identity, adst, 32
+def_fn_416 \w, \h, identity, flipadst, 32
+.endm
+
+def_fns_416 4, 16 // entry points for 4x16
+def_fns_416 16, 4 // entry points for 16x4
+
+.macro def_fn_816_base variant // Emits the shared 16x8 and 8x16 bodies that the def_fn_816 entry points branch to
+function inv_txfm_\variant\()add_16x8_neon
+        sub_sp_align    256 // stack scratch buffer for the first-pass output
+
+.irp i, 0, 4
+        add             r6,  sp,  #(\i*16*2) // output position in the scratch buffer for this group
+.if \i > 0
+        cmp             r3,  r10
+        blt             1f // r3 below the threshold in r10: zero-fill this part instead
+.endif
+        add             r7,  r2,  #(\i*2) // input position in the coefficient buffer
+        mov             r8,  #8*2 // presumably the input stride in bytes for the helper in r9
+        blx             r9 // first-pass helper chosen by the entry point
+.endr
+        b               2f
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]! // 4 stores of 32 bytes clear the 128 bytes starting at r6
+.endr
+2:
+
+.irp i, 0, 8
+        add             r7,  sp,  #(\i*2) // offset of this 8-wide half in the scratch buffer
+        mov             r8,  #32 // byte step between the rows loaded from the scratch buffer
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\j}, [r7, :128], r8
+.endr
+        blx             r5 // second-pass transform on q8-q15
+
+        add             r6,  r0,  #(\i) // destination for this half, offset from r0 by the same count
+        load_add_store_8x8 r6, r7
+.endr
+
+        add_sp_align    256
+        vpop            {q4-q7} // undo the vpush/push done by the entry point and return to its caller
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_\variant\()add_8x16_neon
+        sub_sp_align    256 // stack scratch buffer for the first-pass output
+
+.irp i, 0, 8
+        add             r6,  sp,  #(\i*8*2) // output position in the scratch buffer for this 8x8 part
+.if \i > 0
+        cmp             r3,  r10
+        blt             1f // r3 below the threshold in r10: zero-fill the second part instead
+.endif
+        add             r7,  r2,  #(\i*2) // input position in the coefficient buffer
+        mov             r8,  #16*2 // byte step between the loaded coefficient rows
+
+        vmov.i16        q2,  #0 // zero vector written back over each row once loaded
+        movw            r12, #2896*8
+        vdup.16         d0,  r12 // d0[0]: constant passed to scale_input
+
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\j}, [r7, :128]
+        vst1.16         {q2}, [r7, :128], r8
+.endr
+        scale_input     d0[0], q8,  q9,  q10, q11, q12, q13, q14, q15
+.ifc \variant, identity_
+        // The identity shl #1 and downshift vrshr #1 cancel out
+.else
+        blx             r4 // first-pass transform on q8-q15
+.irp j, q8, q9, q10, q11, q12, q13, q14, q15
+        vrshr.s16       \j,  \j,  #1
+.endr
+.endif
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+        vst1.16         {q8,  q9},  [r6, :128]! // write the 128-byte transposed block to the scratch buffer
+        vst1.16         {q10, q11}, [r6, :128]!
+        vst1.16         {q12, q13}, [r6, :128]!
+        vst1.16         {q14, q15}, [r6, :128]!
+.endr
+        b               2f
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]! // clear the 128 bytes of the skipped second part
+.endr
+2:
+
+.irp i, 0, 4
+        add             r6,  r0,  #(\i) // destination for this 4-wide part
+        add             r7,  sp,  #(\i*2) // matching offset in the scratch buffer
+        mov             r8,  #16 // presumably the scratch buffer stride in bytes for the helper
+        bl              inv_txfm_add_vert_4x16_neon
+.endr
+
+        add_sp_align    256
+        vpop            {q4-q7} // undo the vpush/push done by the entry point and return to its caller
+        pop             {r4-r11,pc}
+endfunc
+.endm
+
+def_fn_816_base // instantiate the shared bodies with an empty variant prefix
+def_fn_816_base identity_ // instantiate them again with the identity_ prefix
+
+.macro def_fn_816 w, h, txfm1, txfm2, eob_8x8, eob_4x4 // Emits one exported 8bpc entry point for the given size and transform pair
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1 // only emitted for dct_dct; macro defined elsewhere (presumably a DC-only shortcut)
+.endif
+        push            {r4-r11,lr} // restored by the shared body that is branched to below
+        vpush           {q4-q7}
+.if \w == 8
+        movrel_local    r4,  inv_\txfm1\()_8h_x8_neon // r4: first-pass transform
+        movrel_local    r5,  inv_\txfm2\()_4h_x16_neon // r5: second-pass transform
+.else
+.ifc \txfm1, identity
+        movrel_local    r9,  inv_txfm_horz_scale_identity_16x4_neon // r9: first-pass helper called by the 16x8 body; r4 is not set in this case
+.else
+        movrel_local    r4,  inv_\txfm1\()_4h_x16_neon
+        movrel_local    r9,  inv_txfm_horz_scale_16x4_neon
+.endif
+        movrel_local    r5,  inv_\txfm2\()_8h_x8_neon
+.endif
+.if \w == 8
+        mov             r10, #\eob_8x8 // threshold compared against r3 in the 8x16 body
+.else
+        mov             r10, #\eob_4x4 // threshold compared against r3 in the 16x8 body
+.endif
+.ifc \txfm1, identity
+        b               inv_txfm_identity_add_\w\()x\h\()_neon // identity first pass uses the identity_ variant of the shared body
+.else
+        b               inv_txfm_add_\w\()x\h\()_neon
+.endif
+endfunc
+.endm
+
+.macro def_fns_816 w, h // Instantiates def_fn_816 for all 16 transform pairs; the last two arguments are eob_8x8 and eob_4x4
+def_fn_816 \w, \h, dct, dct, 43, 10
+def_fn_816 \w, \h, identity, identity, 43, 10
+def_fn_816 \w, \h, dct, adst, 43, 10
+def_fn_816 \w, \h, dct, flipadst, 43, 10
+def_fn_816 \w, \h, dct, identity, 8, 4
+def_fn_816 \w, \h, adst, dct, 43, 10
+def_fn_816 \w, \h, adst, adst, 43, 10
+def_fn_816 \w, \h, adst, flipadst, 43, 10
+def_fn_816 \w, \h, flipadst, dct, 43, 10
+def_fn_816 \w, \h, flipadst, adst, 43, 10
+def_fn_816 \w, \h, flipadst, flipadst, 43, 10
+def_fn_816 \w, \h, identity, dct, 64, 4
+def_fn_816 \w, \h, adst, identity, 8, 4
+def_fn_816 \w, \h, flipadst, identity, 8, 4
+def_fn_816 \w, \h, identity, adst, 64, 4
+def_fn_816 \w, \h, identity, flipadst, 64, 4
+.endm
+
+def_fns_816 8, 16 // entry points for 8x16
+def_fns_816 16, 8 // entry points for 16x8
+
+function inv_dct32_odd_4h_x16_neon, export=1 // Computes the t16..t31 terms annotated below from d16-d31 in place; uses q0-q4 and r12 as scratch
+        movrel_local    r12, idct_coeffs, 2*16 // point r12 32 bytes into idct_coeffs
+        vld1.16         {q0, q1}, [r12, :128] // 16 coefficients for the first rotation stage
+        sub             r12, r12, #2*16 // rewind r12 to the start of idct_coeffs for the reload below
+
+        vmull_vmlsl     q2,  d16, d31, d0[0], d0[1] // -> t16a
+        vmull_vmlal     q3,  d16, d31, d0[1], d0[0] // -> t31a
+        vmull_vmlsl     q4,  d24, d23, d0[2], d0[3] // -> t17a
+        vrshrn.i32      d16, q2,  #12               // t16a
+        vrshrn.i32      d31, q3,  #12               // t31a
+        vmull_vmlal     q2,  d24, d23, d0[3], d0[2] // -> t30a
+        vmull_vmlsl     q3,  d20, d27, d1[0], d1[1] // -> t18a
+        vrshrn.i32      d24, q4,  #12               // t17a
+        vrshrn.i32      d23, q2,  #12               // t30a
+        vmull_vmlal     q4,  d20, d27, d1[1], d1[0] // -> t29a
+        vmull_vmlsl     q2,  d28, d19, d1[2], d1[3] // -> t19a
+        vrshrn.i32      d20, q3,  #12               // t18a
+        vrshrn.i32      d27, q4,  #12               // t29a
+        vmull_vmlal     q3,  d28, d19, d1[3], d1[2] // -> t28a
+        vmull_vmlsl     q4,  d18, d29, d2[0], d2[1] // -> t20a
+        vrshrn.i32      d28, q2,  #12               // t19a
+        vrshrn.i32      d19, q3,  #12               // t28a
+        vmull_vmlal     q2,  d18, d29, d2[1], d2[0] // -> t27a
+        vmull_vmlsl     q3,  d26, d21, d2[2], d2[3] // -> t21a
+        vrshrn.i32      d18, q4,  #12               // t20a
+        vrshrn.i32      d29, q2,  #12               // t27a
+        vmull_vmlal     q4,  d26, d21, d2[3], d2[2] // -> t26a
+        vmull_vmlsl     q2,  d22, d25, d3[0], d3[1] // -> t22a
+        vrshrn.i32      d26, q3,  #12               // t21a
+        vrshrn.i32      d21, q4,  #12               // t26a
+        vmull_vmlal     q3,  d22, d25, d3[1], d3[0] // -> t25a
+        vmull_vmlsl     q4,  d30, d17, d3[2], d3[3] // -> t23a
+        vrshrn.i32      d22, q2,  #12               // t22a
+        vrshrn.i32      d25, q3,  #12               // t25a
+        vmull_vmlal     q2,  d30, d17, d3[3], d3[2] // -> t24a
+        vrshrn.i32      d30, q4,  #12               // t23a
+        vrshrn.i32      d17, q2,  #12               // t24a
+
+        vld1.16         {q0}, [r12, :128] // reload q0 with the first 8 entries of idct_coeffs
+
+        vqsub.s16       d2,  d16, d24 // t17
+        vqadd.s16       d16, d16, d24 // t16
+        vqsub.s16       d3,  d31, d23 // t30
+        vqadd.s16       d31, d31, d23 // t31
+        vqsub.s16       d24, d28, d20 // t18
+        vqadd.s16       d28, d28, d20 // t19
+        vqadd.s16       d23, d18, d26 // t20
+        vqsub.s16       d18, d18, d26 // t21
+        vqsub.s16       d20, d30, d22 // t22
+        vqadd.s16       d30, d30, d22 // t23
+        vqadd.s16       d26, d17, d25 // t24
+        vqsub.s16       d17, d17, d25 // t25
+        vqsub.s16       d22, d29, d21 // t26
+        vqadd.s16       d29, d29, d21 // t27
+        vqadd.s16       d25, d19, d27 // t28
+        vqsub.s16       d19, d19, d27 // t29
+
+        vmull_vmlsl     q2,  d3,  d2,  d1[0], d1[1] // -> t17a
+        vmull_vmlal     q3,  d3,  d2,  d1[1], d1[0] // -> t30a
+        vmull_vmlal     q4,  d19, d24, d1[1], d1[0] // -> t18a
+        vrshrn.i32      d21, q2,  #12               // t17a
+        vrshrn.i32      d27, q3,  #12               // t30a
+        vneg.s32        q4,  q4                     // -> t18a
+        vmull_vmlsl     q1,  d19, d24, d1[0], d1[1] // -> t29a
+        vmull_vmlsl     q2,  d22, d18, d1[2], d1[3] // -> t21a
+        vrshrn.i32      d19, q4,  #12               // t18a
+        vrshrn.i32      d24, q1,  #12               // t29a
+        vmull_vmlal     q3,  d22, d18, d1[3], d1[2] // -> t26a
+        vmull_vmlal     q4,  d17, d20, d1[3], d1[2] // -> t22a
+        vrshrn.i32      d22, q2,  #12               // t21a
+        vrshrn.i32      d18, q3,  #12               // t26a
+        vneg.s32        q4,  q4                     // -> t22a
+        vmull_vmlsl     q1,  d17, d20, d1[2], d1[3] // -> t25a
+        vrshrn.i32      d17, q4,  #12               // t22a
+        vrshrn.i32      d20, q1,  #12               // t25a
+
+        vqsub.s16       d2,  d27, d24 // t29
+        vqadd.s16       d27, d27, d24 // t30
+        vqsub.s16       d3,  d21, d19 // t18
+        vqadd.s16       d21, d21, d19 // t17
+        vqsub.s16       d24, d16, d28 // t19a
+        vqadd.s16       d16, d16, d28 // t16a
+        vqsub.s16       d19, d30, d23 // t20a
+        vqadd.s16       d30, d30, d23 // t23a
+        vqsub.s16       d28, d17, d22 // t21
+        vqadd.s16       d17, d17, d22 // t22
+        vqadd.s16       d23, d26, d29 // t24a
+        vqsub.s16       d26, d26, d29 // t27a
+        vqadd.s16       d22, d20, d18 // t25
+        vqsub.s16       d20, d20, d18 // t26
+        vqsub.s16       d29, d31, d25 // t28a
+        vqadd.s16       d31, d31, d25 // t31a
+
+        vmull_vmlsl     q2,  d2,  d3,  d0[2], d0[3] // -> t18a
+        vmull_vmlal     q3,  d2,  d3,  d0[3], d0[2] // -> t29a
+        vmull_vmlsl     q4,  d29, d24, d0[2], d0[3] // -> t19
+        vrshrn.i32      d18, q2,  #12               // t18a
+        vrshrn.i32      d25, q3,  #12               // t29a
+        vmull_vmlal     q1,  d29, d24, d0[3], d0[2] // -> t28
+        vmull_vmlal     q2,  d26, d19, d0[3], d0[2] // -> t20
+        vrshrn.i32      d29, q4,  #12               // t19
+        vrshrn.i32      d24, q1,  #12               // t28
+        vneg.s32        q2,  q2                     // -> t20
+        vmull_vmlsl     q3,  d26, d19, d0[2], d0[3] // -> t27
+        vmull_vmlal     q4,  d20, d28, d0[3], d0[2] // -> t21a
+        vrshrn.i32      d26, q2,  #12               // t20
+        vrshrn.i32      d19, q3,  #12               // t27
+        vneg.s32        q4,  q4                     // -> t21a
+        vmull_vmlsl     q1,  d20, d28, d0[2], d0[3] // -> t26a
+        vrshrn.i32      d20, q4,  #12               // t21a
+        vrshrn.i32      d28, q1,  #12               // t26a
+
+        vqsub.s16       d2,  d16, d30 // t23
+        vqadd.s16       d16, d16, d30 // t16 = out16
+        vqsub.s16       d3,  d31, d23 // t24
+        vqadd.s16       d31, d31, d23 // t31 = out31
+        vqsub.s16       d23, d21, d17 // t22a
+        vqadd.s16       d17, d21, d17 // t17a = out17
+        vqadd.s16       d30, d27, d22 // t30a = out30
+        vqsub.s16       d21, d27, d22 // t25a
+        vqsub.s16       d27, d18, d20 // t21
+        vqadd.s16       d18, d18, d20 // t18 = out18
+        vqadd.s16       d4,  d29, d26 // t19a = out19
+        vqsub.s16       d26, d29, d26 // t20a
+        vqadd.s16       d29, d25, d28 // t29 = out29
+        vqsub.s16       d25, d25, d28 // t26
+        vqadd.s16       d28, d24, d19 // t28a = out28
+        vqsub.s16       d24, d24, d19 // t27a
+        vmov            d19, d4       // out19
+
+        vmull_vmlsl     q2,  d24, d26, d0[0], d0[0] // -> t20
+        vmull_vmlal     q3,  d24, d26, d0[0], d0[0] // -> t27
+        vrshrn.i32      d20, q2,  #12   // t20
+        vrshrn.i32      d22, q3,  #12   // t27
+
+        vmull_vmlal     q2,  d25, d27, d0[0], d0[0] // -> t26a
+        vmull_vmlsl     q3,  d25, d27, d0[0], d0[0] // -> t21a
+        vmov            d27, d22        // t27
+        vrshrn.i32      d26, q2,  #12   // t26a
+
+        vmull_vmlsl     q12, d21, d23, d0[0], d0[0] // -> t22
+        vmull_vmlal     q2,  d21, d23, d0[0], d0[0] // -> t25
+        vrshrn.i32      d21, q3,  #12   // t21a
+        vrshrn.i32      d22, q12, #12   // t22
+        vrshrn.i32      d25, q2,  #12   // t25
+
+        vmull_vmlsl     q2,  d3,  d2,  d0[0], d0[0] // -> t23a
+        vmull_vmlal     q3,  d3,  d2,  d0[0], d0[0] // -> t24a
+        vrshrn.i32      d23, q2,  #12   // t23a
+        vrshrn.i32      d24, q3,  #12   // t24a
+
+        bx              lr // results are left in d16-d31
+endfunc
+
+.macro def_horz_32 scale=0, shift=2, suffix // Emits a 32x4 first-pass DCT helper; scale enables input scaling, shift is the final rounding shift
+function inv_txfm_horz\suffix\()_dct_32x4_neon
+        push            {lr}
+        vmov.i16        d7,  #0 // zero vector written back over each input once loaded
+        lsl             r8,  r8,  #1 // double the stride so each pass below reads every other input
+.if \scale
+        movw            r12, #2896*8
+        vdup.16         d0,  r12 // d0[0]: constant passed to scale_input
+.endif
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64]
+        vst1.16         {d7}, [r7, :64], r8
+.endr
+        sub             r7,  r7,  r8, lsl #4 // rewind the 16 steps just taken
+        add             r7,  r7,  r8, lsr #1 // then advance by one original stride to reach the inputs skipped above
+.if \scale
+        scale_input     d0[0], q8,  q9,  q10, q11, q12, q13, q14, q15
+.endif
+        bl              inv_dct_4h_x16_neon // transform of the first set of 16 inputs
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q12, q13, d24, d25, d26, d27
+        transpose_4x4h  q14, q15, d28, d29, d30, d31
+
+.macro store1 r0, r1, r2, r3
+        vst1.16         {\r0}, [r6, :64]!
+        vst1.16         {\r1}, [r6, :64]!
+        vst1.16         {\r2}, [r6, :64]!
+        vst1.16         {\r3}, [r6, :64]!
+        add             r6,  r6,  #32 // skip the second 32 bytes of the 64-byte output row; store2 below fills them
+.endm
+        store1          d16, d20, d24, d28
+        store1          d17, d21, d25, d29
+        store1          d18, d22, d26, d30
+        store1          d19, d23, d27, d31
+.purgem store1
+        sub             r6,  r6,  #64*4 // back to the first of the four output rows
+
+        vmov.i16        d7,  #0
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64]
+        vst1.16         {d7}, [r7, :64], r8
+.endr
+.if \scale
+        // This relies on the fact that the idct also leaves the right coeff in d0[1]
+        scale_input     d0[1], q8,  q9,  q10, q11, q12, q13, q14, q15
+.endif
+        bl              inv_dct32_odd_4h_x16_neon // transform of the second set of 16 inputs
+        transpose_4x4h  q15, q14, d31, d30, d29, d28
+        transpose_4x4h  q13, q12, d27, d26, d25, d24
+        transpose_4x4h  q11, q10, d23, d22, d21, d20
+        transpose_4x4h  q9,  q8,  d19, d18, d17, d16
+.macro store2 r0, r1, r2, r3, shift
+        vld1.16         {q0, q1}, [r6, :128] // the 32 bytes that store1 wrote for this row
+        vqsub.s16       d7,  d0,  \r0 // differences go into q2-q3 in reversed d register order
+        vqadd.s16       d0,  d0,  \r0 // sums stay in q0-q1
+        vqsub.s16       d6,  d1,  \r1
+        vqadd.s16       d1,  d1,  \r1
+        vqsub.s16       d5,  d2,  \r2
+        vqadd.s16       d2,  d2,  \r2
+        vqsub.s16       d4,  d3,  \r3
+        vqadd.s16       d3,  d3,  \r3
+        vrev64.16       q2,  q2 // reverse the 16-bit lanes within each d register of the differences
+        vrev64.16       q3,  q3
+        vrshr.s16       q0,  q0,  #\shift
+        vrshr.s16       q1,  q1,  #\shift
+        vrshr.s16       q2,  q2,  #\shift
+        vrshr.s16       q3,  q3,  #\shift
+        vst1.16         {q0, q1}, [r6, :128]! // write the full 64-byte output row
+        vst1.16         {q2, q3}, [r6, :128]!
+.endm
+
+        store2          d31, d27, d23, d19, \shift
+        store2          d30, d26, d22, d18, \shift
+        store2          d29, d25, d21, d17, \shift
+        store2          d28, d24, d20, d16, \shift
+.purgem store2
+        pop             {pc}
+endfunc
+.endm
+
+def_horz_32 scale=0, shift=2 // inv_txfm_horz_dct_32x4_neon
+def_horz_32 scale=1, shift=1, suffix=_scale // inv_txfm_horz_scale_dct_32x4_neon
+
+function inv_txfm_add_vert_dct_4x32_neon // Second-pass helper: transforms data read from r7 (stride r8) and adds it to the pixels at r6 (stride r1); clobbers r9
+        push            {r10-r11,lr}
+        lsl             r8,  r8,  #1 // double the stride so each pass below reads every other row
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64], r8
+.endr
+        sub             r7,  r7,  r8, lsl #4 // rewind the 16 steps just taken
+
+        bl              inv_dct_4h_x16_neon
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vst1.16         {\i}, [r7, :64], r8 // park the first result set in the rows it was loaded from
+.endr
+        sub             r7,  r7,  r8, lsl #4
+        add             r7,  r7,  r8, lsr #1 // advance by one original stride to reach the rows skipped above
+
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        vld1.16         {\i}, [r7, :64], r8
+.endr
+        sub             r7,  r7,  r8, lsl #4
+        sub             r7,  r7,  r8, lsr #1 // back to the first parked row
+        bl              inv_dct32_odd_4h_x16_neon
+
+        neg             r9,  r8 // negative stride for walking the parked rows backwards
+        mov             r10, r6 // r10 reads destination pixels while r6 writes them
+.macro combine r0, r1, r2, r3, op, stride
+        vld1.16         {d4},    [r7,  :64], \stride
+        vld1.32         {d2[0]}, [r10, :32], r1 // the unescaped r1 here is the CPU register (destination stride), not the macro argument
+        vld1.16         {d5},    [r7,  :64],  \stride
+        vld1.32         {d2[1]}, [r10, :32], r1
+        \op\().s16      d4,  d4,  \r0
+        vld1.16         {d6},    [r7,  :64], \stride
+        vld1.32         {d3[0]}, [r10, :32], r1
+        \op\().s16      d5,  d5,  \r1
+        vld1.32         {d3[1]}, [r10, :32], r1
+        vrshr.s16       q2,  q2,  #4 // final rounding shift by 4
+        \op\().s16      d6,  d6,  \r2
+        vld1.16         {d7},    [r7,  :64], \stride
+        vaddw.u8        q2,  q2,  d2 // add the destination pixels
+        \op\().s16      d7,  d7,  \r3
+        vqmovun.s16     d2,  q2 // saturate to unsigned 8 bit
+        vrshr.s16       q3,  q3,  #4
+        vst1.32         {d2[0]}, [r6,  :32], r1
+        vaddw.u8        q3,  q3,  d3
+        vst1.32         {d2[1]}, [r6,  :32], r1
+        vqmovun.s16     d3,  q3
+        vst1.32         {d3[0]}, [r6,  :32], r1
+        vst1.32         {d3[1]}, [r6,  :32], r1
+.endm
+        combine         d31, d30, d29, d28, vqadd, r8 // first 16 output rows: parked rows read forwards, second result set added in descending register order
+        combine         d27, d26, d25, d24, vqadd, r8
+        combine         d23, d22, d21, d20, vqadd, r8
+        combine         d19, d18, d17, d16, vqadd, r8
+        sub             r7,  r7,  r8 // step back onto the last parked row
+        combine         d16, d17, d18, d19, vqsub, r9 // last 16 output rows: parked rows read backwards, second result set subtracted
+        combine         d20, d21, d22, d23, vqsub, r9
+        combine         d24, d25, d26, d27, vqsub, r9
+        combine         d28, d29, d30, d31, vqsub, r9
+.purgem combine
+
+        pop             {r10-r11,pc}
+endfunc
+
+const eob_32x32 // thresholds read by the 32x32 functions below and compared against r3
+        .short 10, 36, 78, 136, 210, 300, 406, 1024
+endconst
+
+const eob_16x32 // thresholds read by the 16x32 and 32x16 functions below
+        .short 10, 36, 78, 151, 215, 279, 343, 512
+endconst
+
+const eob_16x32_shortside // selected through the wshort/hshort suffix arguments of def_identity_1632
+        .short 10, 36, 78, 512
+endconst
+
+const eob_8x32 // read by def_identity_832 and by the dct_dct 8x32 function
+        // Contrary to the others, this one is only ever used in increments of 8x8
+        .short 43, 107, 171, 256
+endconst
+
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1 // Processes 8x8 blocks of coefficients from r2 and adds them to the pixels at r0 (stride r1), stopping by r3 thresholds
+        push            {r4-r7,lr}
+        vmov.i16        q0,  #0 // zero vector written back over each coefficient row once loaded
+        movrel_local    r5,  eob_32x32, 2 // outer-loop threshold pointer, starting at the second table entry
+
+        mov             r6,  #2*32 // byte step between the loaded coefficient rows
+1:
+        mov             r12, #0 // counts the destination columns covered in this band of 8 rows
+        movrel_local    r4,  eob_32x32, 2 // inner-loop threshold pointer, restarted for every band
+2:
+        add             r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\i}, [r2, :128]
+        vst1.16         {q0}, [r2, :128], r6
+.endr
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+        load_add_store_8x8 r0, r7, shiftbits=2
+        ldrh            lr,  [r4], #4 // fetch a threshold and advance by two table entries
+        sub             r0,  r0,  r1, lsl #3 // destination back up 8 rows
+        cmp             r3,  lr
+        add             r0,  r0,  #8 // and 8 pixels to the right
+        bge             2b // another 8x8 block in this band while r3 >= threshold
+
+        ldrh            lr,  [r5], #4
+        cmp             r3,  lr
+        blt             9f // r3 below the outer threshold: no further bands
+
+        sub             r0,  r0,  r12 // destination back to the left edge
+        add             r0,  r0,  r1, lsl #3 // and down 8 rows
+        mls             r2,  r6,  r12, r2 // r2 -= r6 * r12: rewind the coefficient pointer over this band
+        add             r2,  r2,  #2*8 // then step 8 coefficients along
+        b               1b
+9:
+        pop             {r4-r7,pc}
+endfunc
+
+.macro shift_8_regs op, shift // Applies the shift instruction op with the given immediate to each of q8-q15 in place
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        \op             \i,  \i,  #\shift
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort // Emits the identity/identity entry point for 16x32 or 32x16; wshort/hshort pick the eob table suffix
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+        push            {r4-r7,lr}
+        movw            r6,  #2896*8
+        movw            r7,  #2*(5793-4096)*8
+        vdup.i16        d0,  r6 // d0[0]: constant passed to scale_input
+        movrel_local    r5,  eob_16x32\hshort, 2 // outer-loop threshold pointer, starting at the second table entry
+        vmov.16         d0[1], r7 // d0[1]: constant passed to the identity_8x8 macros
+
+        mov             r6,  #2*\h // byte step between the loaded coefficient rows
+1:
+        mov             r12, #0 // counts the destination columns covered in this band of 8 rows
+        movrel_local    r4,  eob_16x32\wshort, 2 // inner-loop threshold pointer, restarted for every band
+2:
+        vmov.i16        q1,  #0 // zero vector written back over each coefficient row once loaded
+        add             r12, r12, #8
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\i}, [r2, :128]
+        vst1.16         {q1}, [r2, :128], r6
+.endr
+        scale_input     d0[0], q8,  q9, q10, q11, q12, q13, q14, q15
+
+.if \w == 16
+        // 16x32
+        identity_8x8_shift1 d0[1]
+.else
+        // 32x16
+        shift_8_regs    vqshl.s16, 1
+        identity_8x8    d0[1]
+.endif
+
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+.if \w == 16
+        load_add_store_8x8 r0, r7, shiftbits=2
+.else
+        load_add_store_8x8 r0, r7, shiftbits=4
+.endif
+        ldrh            lr,  [r4], #4 // fetch a threshold and advance by two table entries
+        sub             r0,  r0,  r1, lsl #3 // destination back up 8 rows
+        cmp             r3,  lr
+        add             r0,  r0,  #8 // and 8 pixels to the right
+        bge             2b // another 8x8 block in this band while r3 >= threshold
+
+        ldrh            lr,  [r5], #4
+        cmp             r3,  lr
+        blt             9f // r3 below the outer threshold: no further bands
+
+        sub             r0,  r0,  r12 // destination back to the left edge
+        add             r0,  r0,  r1, lsl #3 // and down 8 rows
+        mls             r2,  r6,  r12, r2 // r2 -= r6 * r12: rewind the coefficient pointer over this band
+        add             r2,  r2,  #2*8 // then step 8 coefficients along
+        b               1b
+9:
+        pop             {r4-r7,pc}
+endfunc
+.endm
+
+def_identity_1632 16, 32, _shortside, // 16x32: the inner loop reads eob_16x32_shortside, the outer loop eob_16x32
+def_identity_1632 32, 16, , _shortside // 32x16: the inner loop reads eob_16x32, the outer loop eob_16x32_shortside
+
+.macro def_identity_832 w, h // Emits the identity/identity entry point for 8x32 or 32x8, processed as a single run of 8x8 blocks
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
+        push            {r4-r5,lr}
+        vmov.i16        q0,  #0 // zero vector written back over each coefficient row once loaded
+        movrel_local    r4,  eob_8x32 // threshold pointer, one table entry per 8x8 block
+
+        mov             r12, #2*\h // byte step between the loaded coefficient rows
+1:
+        ldrh            lr,  [r4], #2 // threshold deciding whether another block follows this one
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\i}, [r2, :128]
+        vst1.16         {q0}, [r2, :128], r12
+.endr
+
+.if \w == 8
+        // 8x32
+        shift_8_regs    vrshr.s16, 1
+.endif
+
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+        cmp             r3,  lr // flags are consumed by the blt after the macro below
+.if \w == 8
+        load_add_store_8x8 r0, r5, shiftbits=2
+.else
+        load_add_store_8x8 r0, r5, shiftbits=3
+.endif
+
+        blt             9f // r3 below the threshold: this was the last block
+.if \w == 8
+        sub             r2,  r2,  r12, lsl #3 // rewind the 8 coefficient rows just read
+        add             r2,  r2,  #2*8 // and step 8 coefficients along; r0 is left where the macro put it
+.else
+        sub             r0,  r0,  r1, lsl #3 // destination back up 8 rows
+        add             r0,  r0,  #8 // and 8 pixels to the right; r2 simply continues
+.endif
+        b               1b
+
+9:
+        pop             {r4-r5,pc}
+endfunc
+.endm
+
+def_identity_832 8, 32 // 8x32 entry point
+def_identity_832 32, 8 // 32x8 entry point
+
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1 // Two-pass 32x32 DCT: coefficients at r2, result added to the pixels at r0, r3 compared against eob_32x32
+        idct_dc         32,  32,  2 // macro defined elsewhere (presumably a DC-only shortcut)
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+        sub_sp_align    2048 // stack scratch buffer for the first-pass output
+        movrel_local    r10, eob_32x32
+        ldrh            r11, [r10], #2 // first threshold
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  sp,  #(\i*32*2) // output position in the scratch buffer for this slice of 4
+.if \i > 0
+        mov             r8,  #(32 - \i) // remaining count, consumed by the zero-fill loop at 2: if the branch is taken
+        cmp             r3,  r11
+        blt             1f // r3 below the threshold: zero-fill the rest of the scratch buffer
+.if \i < 28
+        ldrh            r11, [r10], #2 // next threshold
+.endif
+.endif
+        add             r7,  r2,  #(\i*2) // input position in the coefficient buffer
+        mov             r8,  #32*2 // input stride in bytes, as consumed by the helper
+        bl              inv_txfm_horz_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2 // each iteration clears 128 bytes, which is 2 units of the count in r8
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  r0,  #(\i) // destination for this 4-pixel-wide strip
+        add             r7,  sp,  #(\i*2) // matching offset in the scratch buffer
+        mov             r8,  #32*2 // scratch buffer stride in bytes
+        bl              inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+        add_sp_align    2048
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1 // Two-pass 16x32 DCT: coefficients at r2, result added to the pixels at r0, r3 compared against eob_16x32
+        idct_dc         16,  32,  1 // macro defined elsewhere (presumably a DC-only shortcut)
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+        sub_sp_align    1024 // stack scratch buffer for the first-pass output
+        movrel_local    r10, eob_16x32
+        ldrh            r11, [r10], #2 // first threshold
+        movrel_local    r4,  inv_dct_4h_x16_neon // presumably the transform called through r4 by inv_txfm_horz_scale_16x4_neon - confirm
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  sp,  #(\i*16*2) // output position in the scratch buffer for this slice of 4
+        add             r7,  r2,  #(\i*2) // input position in the coefficient buffer
+.if \i > 0
+        mov             r8,  #(32 - \i) // remaining count, consumed by the zero-fill loop at 2: if the branch is taken
+        cmp             r3,  r11
+        blt             1f // r3 below the threshold: zero-fill the rest of the scratch buffer
+.if \i < 28
+        ldrh            r11, [r10], #2 // next threshold
+.endif
+.endif
+        mov             r8,  #2*32 // presumably the input stride in bytes for the helper
+        bl              inv_txfm_horz_scale_16x4_neon
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #4 // each iteration clears 128 bytes, which is 4 units of the count in r8
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12
+        add             r6,  r0,  #(\i) // destination for this 4-pixel-wide strip
+        add             r7,  sp,  #(\i*2) // matching offset in the scratch buffer
+        mov             r8,  #16*2 // scratch buffer stride in bytes
+        bl              inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+        add_sp_align    1024
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1 // Two-pass 32x16 DCT: coefficients at r2, result added to the pixels at r0, r3 compared against eob_16x32
+        idct_dc         32,  16,  1 // macro defined elsewhere (presumably a DC-only shortcut)
+
+        push            {r4-r11,lr}
+        vpush           {q4}
+        sub_sp_align    1024 // stack scratch buffer for the first-pass output
+        movrel_local    r10, eob_16x32
+        ldrh            r11, [r10], #2 // first threshold
+        movrel_local    r5,  inv_dct_4h_x16_neon // presumably the transform called through r5 by inv_txfm_add_vert_4x16_neon - confirm
+
+.irp i, 0, 4, 8, 12
+        add             r6,  sp,  #(\i*32*2) // output position in the scratch buffer for this slice of 4
+        add             r7,  r2,  #(\i*2) // input position in the coefficient buffer
+.if \i > 0
+        mov             r8,  #(16 - \i) // remaining count, consumed by the zero-fill loop at 2: if the branch is taken
+        cmp             r3,  r11
+        blt             1f // r3 below the threshold: zero-fill the rest of the scratch buffer
+.if \i < 12
+        ldrh            r11, [r10], #2 // next threshold
+.endif
+.endif
+        mov             r8,  #2*16 // input stride in bytes, as consumed by the helper
+        bl              inv_txfm_horz_scale_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2 // each iteration clears 128 bytes, which is 2 units of the count in r8
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r6,  r0,  #(\i) // destination for this 4-pixel-wide strip
+        add             r7,  sp,  #(\i*2) // matching offset in the scratch buffer
+        mov             r8,  #32*2 // presumably the scratch buffer stride in bytes for the helper
+        bl              inv_txfm_add_vert_4x16_neon
+.endr
+
+        add_sp_align    1024
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1 // Two-pass 8x32 DCT: coefficients at r2, result added to the pixels at r0, r3 compared against eob_8x32
+        idct_dc         8,   32,  2 // macro defined elsewhere (presumably a DC-only shortcut)
+
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        sub_sp_align    512 // stack scratch buffer for the first-pass output
+
+        movrel_local    r10, eob_8x32
+
+        mov             r8,  #2*32 // byte step between the loaded coefficient rows
+        mov             r9,  #32 // remaining count, reduced by 8 per 8x8 block
+        mov             r6,  sp // write position in the scratch buffer
+1:
+        vmov.i16        q0,  #0 // zero vector written back over each coefficient row once loaded
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\i}, [r2, :128]
+        vst1.16         {q0}, [r2, :128], r8
+.endr
+        ldrh            r11, [r10], #2 // threshold deciding whether another block follows this one
+        sub             r2,  r2,  r8, lsl #3 // rewind the 8 rows just read
+        sub             r9,  r9,  #8
+        add             r2,  r2,  #2*8 // and step 8 coefficients along
+
+        bl              inv_dct_8h_x8_neon
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vrshr.s16       \i,  \i,  #2
+.endr
+
+        transpose_8x8h  q8,  q9,  q10, q11, q12, q13, q14, q15, d17, d19, d21, d23, d24, d26, d28, d30
+
+        vst1.16         {q8,  q9},  [r6, :128]!
+        cmp             r3,  r11 // flags are consumed by the bge below; the stores in between leave them unchanged
+        vst1.16         {q10, q11}, [r6, :128]!
+        vst1.16         {q12, q13}, [r6, :128]!
+        vst1.16         {q14, q15}, [r6, :128]!
+
+        bge             1b // r3 >= threshold: process another block
+        cmp             r9,  #0
+        beq             3f // nothing left to clear
+
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r9,  r9,  #8 // each iteration clears the 128 bytes of one skipped block
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4
+        add             r6,  r0,  #(\i) // destination for this 4-pixel-wide strip
+        add             r7,  sp,  #(\i*2) // matching offset in the scratch buffer
+        mov             r8,  #8*2 // scratch buffer stride in bytes
+        bl              inv_txfm_add_vert_dct_4x32_neon
+.endr
+
+        add_sp_align    512
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1 // Two-pass 32x8 DCT: coefficients at r2, result added to the pixels at r0, r3 compared against the constant 10
+        idct_dc         32,  8,   2 // macro defined elsewhere (presumably a DC-only shortcut)
+
+        push            {r4-r11,lr}
+        vpush           {q4-q7}
+        sub_sp_align    512 // stack scratch buffer for the first-pass output
+
+.irp i, 0, 4
+        add             r6,  sp,  #(\i*32*2) // output position in the scratch buffer for this slice of 4
+        add             r7,  r2,  #(\i*2) // input position in the coefficient buffer
+.if \i > 0
+        cmp             r3,  #10
+        blt             1f // r3 below 10: zero-fill the second slice instead of transforming it
+.endif
+        mov             r8,  #8*2 // input stride in bytes, as consumed by the helper
+        bl              inv_txfm_horz_dct_32x4_neon
+.endr
+        b               2f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+.rept 8
+        vst1.16         {q2, q3}, [r6, :128]! // 8 stores of 32 bytes clear the 256 bytes of the second slice
+.endr
+
+2:
+        mov             r8,  #2*32 // byte step between the rows loaded from the scratch buffer
+        mov             r9,  #0 // column offset, advanced by 8 per iteration
+1:
+        add             r6,  r0,  r9
+        add             r7,  sp,  r9, lsl #1 // #(\i*2)
+
+.irp i, q8, q9, q10, q11, q12, q13, q14, q15
+        vld1.16         {\i}, [r7, :128], r8
+.endr
+        add             r9,  r9,  #8
+
+        bl              inv_dct_8h_x8_neon
+
+        cmp             r9,  #32 // flags are consumed by the blt after the macro below
+
+        load_add_store_8x8 r6, r7
+
+        blt             1b // loop until all 32 columns are done
+
+        add_sp_align    512
+        vpop            {q4-q7}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_dct64_step1_neon // Takes four inputs in d16-d19, reads 12 coefficients from r12 and stores 8 results (64 bytes) at r6; both pointers advance
+        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+
+        vld1.16         {d0, d1, d2}, [r12, :64]! // load 12 coefficients and advance r12 past them
+
+        vqrdmulh.s16    d23, d16, d0[1]  // t63a
+        vqrdmulh.s16    d16, d16, d0[0]  // t32a
+        vqrdmulh.s16    d22, d17, d0[2]  // t62a
+        vqrdmulh.s16    d17, d17, d0[3]  // t33a
+        vqrdmulh.s16    d21, d18, d1[1]  // t61a
+        vqrdmulh.s16    d18, d18, d1[0]  // t34a
+        vqrdmulh.s16    d20, d19, d1[2]  // t60a
+        vqrdmulh.s16    d19, d19, d1[3]  // t35a
+
+        vqadd.s16       d24, d16, d17    // t32
+        vqsub.s16       d25, d16, d17    // t33
+        vqsub.s16       d26, d19, d18    // t34
+        vqadd.s16       d27, d19, d18    // t35
+        vqadd.s16       d28, d20, d21    // t60
+        vqsub.s16       d29, d20, d21    // t61
+        vqsub.s16       d30, d23, d22    // t62
+        vqadd.s16       d31, d23, d22    // t63
+
+        vmull_vmlal     q2,  d29, d26, d2[0], d2[1] // -> t34a
+        vmull_vmlsl     q3,  d29, d26, d2[1], d2[0] // -> t61a
+        vneg.s32        q2,  q2                     // t34a
+        vmull_vmlsl     q4,  d30, d25, d2[1], d2[0] // -> t33a
+        vrshrn.i32      d26, q2,  #12               // t34a
+        vmull_vmlal     q2,  d30, d25, d2[0], d2[1] // -> t62a
+        vrshrn.i32      d29, q3,  #12               // t61a
+        vrshrn.i32      d25, q4,  #12               // t33a
+        vrshrn.i32      d30, q2,  #12               // t62a
+
+        vqadd.s16       d16, d24, d27    // t32a
+        vqsub.s16       d19, d24, d27    // t35a
+        vqadd.s16       d17, d25, d26    // t33
+        vqsub.s16       d18, d25, d26    // t34
+        vqsub.s16       d20, d31, d28    // t60a
+        vqadd.s16       d23, d31, d28    // t63a
+        vqsub.s16       d21, d30, d29    // t61
+        vqadd.s16       d22, d30, d29    // t62
+
+        vmull_vmlal     q2,  d21, d18, d2[2], d2[3] // -> t61a
+        vmull_vmlsl     q3,  d21, d18, d2[3], d2[2] // -> t34a
+        vmull_vmlal     q4,  d20, d19, d2[2], d2[3] // -> t60
+        vrshrn.i32      d21, q2,  #12               // t61a
+        vrshrn.i32      d18, q3,  #12               // t34a
+        vmull_vmlsl     q2,  d20, d19, d2[3], d2[2] // -> t35
+        vrshrn.i32      d20, q4,  #12               // t60
+        vrshrn.i32      d19, q2,  #12               // t35
+
+        vst1.16         {d16, d17, d18, d19}, [r6, :128]! // store the 8 results and advance r6 by 64 bytes in total
+        vst1.16         {d20, d21, d22, d23}, [r6, :128]!
+
+        bx              lr
+endfunc
+
+function inv_dct64_step2_neon // In-place butterfly over the step1 results: r6 walks upward, r9 walks downward, until they meet
+        movrel_local    r12, idct_coeffs
+        vld1.16         {d0}, [r12, :64] // first four 16-bit entries of idct_coeffs
+1:
+        // t32a/33/34a/35/60/61a/62/63a
+        // t56a/57/58a/59/36/37a/38/39a
+        // t40a/41/42a/43/52/53a/54/55a
+        // t48a/49/50a/51/44/45a/46/47a
+        vldr            d16, [r6, #2*4*0]  // t32a
+        vldr            d17, [r9, #2*4*8]  // t39a
+        vldr            d18, [r9, #2*4*0]  // t63a
+        vldr            d19, [r6, #2*4*8]  // t56a
+        vldr            d20, [r6, #2*4*16] // t40a
+        vldr            d21, [r9, #2*4*24] // t47a
+        vldr            d22, [r9, #2*4*16] // t55a
+        vldr            d23, [r6, #2*4*24] // t48a
+
+        vqadd.s16       d24, d16, d17      // t32
+        vqsub.s16       d25, d16, d17      // t39
+        vqadd.s16       d26, d18, d19      // t63
+        vqsub.s16       d27, d18, d19      // t56
+        vqsub.s16       d28, d21, d20      // t40
+        vqadd.s16       d29, d21, d20      // t47
+        vqadd.s16       d30, d23, d22      // t48
+        vqsub.s16       d31, d23, d22      // t55
+
+        vmull_vmlal     q2,  d27, d25, d0[3], d0[2] // -> t56a
+        vmull_vmlsl     q3,  d27, d25, d0[2], d0[3] // -> t39a
+        vmull_vmlal     q4,  d31, d28, d0[3], d0[2] // -> t40a
+        vrshrn.i32      d25, q2,  #12               // t56a
+        vrshrn.i32      d27, q3,  #12               // t39a
+        vneg.s32        q4,  q4                     // t40a
+        vmull_vmlsl     q2,  d31, d28, d0[2], d0[3] // -> t55a
+        vrshrn.i32      d31, q4,  #12               // t40a
+        vrshrn.i32      d28, q2,  #12               // t55a
+
+        vqadd.s16       d16, d24, d29      // t32a
+        vqsub.s16       d19, d24, d29      // t47a
+        vqadd.s16       d17, d27, d31      // t39
+        vqsub.s16       d18, d27, d31      // t40
+        vqsub.s16       d20, d26, d30      // t48a
+        vqadd.s16       d23, d26, d30      // t63a
+        vqsub.s16       d21, d25, d28      // t55
+        vqadd.s16       d22, d25, d28      // t56
+
+        vmull_vmlsl     q2,  d21, d18, d0[0], d0[0] // -> t40a
+        vmull_vmlal     q3,  d21, d18, d0[0], d0[0] // -> t55a
+        vmull_vmlsl     q4,  d20, d19, d0[0], d0[0] // -> t47
+        vrshrn.i32      d18, q2,  #12               // t40a
+        vrshrn.i32      d21, q3,  #12               // t55a
+        vmull_vmlal     q2,  d20, d19, d0[0], d0[0] // -> t48
+        vrshrn.i32      d19, q4,  #12               // t47
+        vrshrn.i32      d20, q2,  #12               // t48
+
+        vstr            d16, [r6, #2*4*0]  // t32a
+        vstr            d17, [r9, #2*4*0]  // t39
+        vstr            d18, [r6, #2*4*8]  // t40a
+        vstr            d19, [r9, #2*4*8]  // t47
+        vstr            d20, [r6, #2*4*16] // t48
+        vstr            d21, [r9, #2*4*16] // t55a
+        vstr            d22, [r6, #2*4*24] // t56
+        vstr            d23, [r9, #2*4*24] // t63a
+
+        add             r6,  r6,  #2*4 // low pointer: next 8-byte slot
+        sub             r9,  r9,  #2*4 // high pointer: previous 8-byte slot
+        cmp             r6,  r9
+        blt             1b // keep going while the low pointer is still below the high pointer
+        bx              lr
+endfunc
+
+.macro load8 src, strd, zero, clear // Load d16-d23 from \src, one register per \strd step; optionally zero each source slot
+.irp i, d16, d17, d18, d19, d20, d21, d22, d23
+.if \clear
+        vld1.16         {\i}, [\src, :64] // read without writeback ...
+        vst1.16         {\zero}, [\src, :64], \strd // ... then overwrite the same slot with \zero and step by \strd
+.else
+        vld1.16         {\i}, [\src, :64], \strd // read and step by \strd
+.endif
+.endr
+.endm
+
+.macro store16 dst // Store q8-q15 (128 bytes) through \dst with writeback; the :128 qualifier requires 16-byte alignment
+        vst1.16         {q8,  q9},  [\dst, :128]!
+        vst1.16         {q10, q11}, [\dst, :128]!
+        vst1.16         {q12, q13}, [\dst, :128]!
+        vst1.16         {q14, q15}, [\dst, :128]!
+.endm
+
+.macro clear_upper8 // Zero q12-q15, i.e. the eight d registers above the ones load8 fills
+.irp i, q12, q13, q14, q15
+        vmov.i16        \i,  #0
+.endr
+.endm
+
+.macro vmov_if reg, val, cond // Emit vmov.i16 \reg, \val only when \cond is non-zero at assembly time
+.if \cond
+        vmov.i16        \reg, \val
+.endif
+.endm
+
+.macro movdup_if reg, gpr, val, cond // When \cond is set: put \val in \gpr and broadcast it to every 16-bit lane of \reg
+.if \cond
+        movw            \gpr, \val // \gpr is clobbered
+        vdup.16         \reg, \gpr
+.endif
+.endm
+
+.macro vst1_if regs, dst, dstalign, cond // Emit a vst1.16 store only when \cond is non-zero at assembly time
+.if \cond
+        vst1.16         \regs, \dst, \dstalign // \dst and \dstalign are the two comma-separated halves of the address operand
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7 // Forward to scale_input (defined outside this chunk) only when \cond is set
+.if \cond
+        scale_input     \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 // unused trailing register arguments are passed through empty
+.endif
+.endm
+
+.macro def_dct64_func suffix, clear=0, scale=0 // Emits one 64-point inverse DCT routine; clear=1 zeroes inputs after reading, scale=1 pre-scales them
+function inv_txfm_dct\suffix\()_4h_x64_neon, export=1
+        mov             r6,  sp // r6 = sp as it was on entry (before the push below); it is the output pointer from here on
+
+        push            {r10-r11,lr}
+
+        lsl             r8,  r8,  #2 // work with 4x the stride passed in r8
+
+        movdup_if       d0,  r12, #2896*8, \scale
+        vmov_if         d7,  #0,  \clear
+        load8           r7,  r8,  d7,  \clear // d16-d23 = 8 inputs, one per (scaled) stride
+        clear_upper8 // the remaining 8 inputs of the 16-point transform are zero
+        sub             r7,  r7,  r8, lsl #3 // rewind r7 over the 8 strides just consumed
+        add             r7,  r7,  r8, lsr #1 // then move forward by half a stride for the next set of inputs
+        scale_if        \scale, d0[0], q8, q9, q10, q11
+
+        bl              inv_dct_4h_x16_neon
+
+        store16         r6 // spill q8-q15 as 16 slots of 8 bytes; r6 ends up at slot 16
+
+        movdup_if       d0,  r12, #2896*8, \scale
+        vmov_if         d7,  #0,  \clear
+        load8           r7,  r8,  d7,  \clear // second set of 8 inputs, interleaved halfway between the first set
+        clear_upper8
+        sub             r7,  r7,  r8, lsl #3 // rewind r7 over the 8 strides just consumed
+        lsr             r8,  r8,  #1 // halve the stride: r8 is now 2x the value passed in
+        sub             r7,  r7,  r8, lsr #1 // step back by half of the new stride; r7 is the base for the offsets used below
+        scale_if        \scale, d0[0], q8, q9, q10, q11
+
+        bl              inv_dct32_odd_4h_x16_neon
+
+        add             r10, r6,  #8*15 // r10 = slot 31 (r6 is still at slot 16 after store16)
+        sub             r6,  r6,  #8*16 // r6 back to slot 0
+
+        mov             r9,  #-8 // r10 moves down one 8-byte slot per store
+
+.macro store_addsub r0, r1, r2, r3 // sums go back to [r6] ascending, differences go to [r10] descending
+        vld1.16         {d2},  [r6, :64]!
+        vld1.16         {d3},  [r6, :64]!
+        vqadd.s16       d6,  d2,  \r0
+        vqsub.s16       \r0, d2,  \r0
+        vld1.16         {d4},  [r6, :64]!
+        vqadd.s16       d7,  d3,  \r1
+        vqsub.s16       \r1, d3,  \r1
+        vld1.16         {d5},  [r6, :64]!
+        vqadd.s16       d2,  d4,  \r2
+        sub             r6,  r6,  #8*4 // back to the first of the 4 slots just loaded
+        vqsub.s16       \r2, d4,  \r2
+        vst1.16         {d6},  [r6,  :64]!
+        vst1.16         {\r0}, [r10, :64], r9
+        vqadd.s16       d3,  d5,  \r3
+        vqsub.s16       \r3, d5,  \r3
+        vst1.16         {d7},  [r6,  :64]!
+        vst1.16         {\r1}, [r10, :64], r9
+        vst1.16         {d2},  [r6,  :64]!
+        vst1.16         {\r2}, [r10, :64], r9
+        vst1.16         {d3},  [r6,  :64]!
+        vst1.16         {\r3}, [r10, :64], r9
+.endm
+        store_addsub    d31, d30, d29, d28
+        store_addsub    d27, d26, d25, d24
+        store_addsub    d23, d22, d21, d20
+        store_addsub    d19, d18, d17, d16
+.purgem store_addsub
+
+        add             r6,  r6,  #2*4*16 // r6 was at slot 16; skip slots 16-31 (written through r10) to reach slot 32
+
+        movrel_local    r12, idct64_coeffs // coefficient stream consumed by the four step1 calls
+        movdup_if       d0,  lr,  #2896*8, \scale // lr serves as scratch: it was saved by the push and is rewritten by the next bl
+        vmov_if         d7,  #0,  \clear
+        add             r9,  r7,  r8, lsl #4 // offset 16
+        add             r10, r7,  r8, lsl #3 // offset 8
+        sub             r9,  r9,  r8         // offset 15
+        sub             r11, r10, r8         // offset 7
+        vld1.16         {d16}, [r7,  :64]    // in1  (offset 0)
+        vld1.16         {d17}, [r9,  :64]    // in31 (offset 15)
+        vld1.16         {d18}, [r10, :64]    // in17 (offset 8)
+        vld1.16         {d19}, [r11, :64]    // in15 (offset 7)
+        vst1_if         {d7},  [r7,  :64], \clear
+        vst1_if         {d7},  [r9,  :64], \clear
+        vst1_if         {d7},  [r10, :64], \clear
+        vst1_if         {d7},  [r11, :64], \clear
+        scale_if        \scale, d0[0], q8, q9
+        bl              inv_dct64_step1_neon
+        movdup_if       d0,  lr,  #2896*8, \scale // reloaded because step1 overwrote d0 with coefficients
+        vmov_if         d7,  #0,  \clear
+        add             r7,  r7,  r8, lsl #2 // offset 4
+        sub             r9,  r9,  r8, lsl #2 // offset 11
+        sub             r10, r7,  r8         // offset 3
+        add             r11, r9,  r8         // offset 12
+        vld1.16         {d16}, [r10, :64]    // in7  (offset 3)
+        vld1.16         {d17}, [r11, :64]    // in25 (offset 12)
+        vld1.16         {d18}, [r9,  :64]    // in23 (offset 11)
+        vld1.16         {d19}, [r7,  :64]    // in9  (offset 4)
+        vst1_if         {d7},  [r7,  :64], \clear
+        vst1_if         {d7},  [r9,  :64], \clear
+        vst1_if         {d7},  [r10, :64], \clear
+        vst1_if         {d7},  [r11, :64], \clear
+        scale_if        \scale, d0[0], q8, q9
+        bl              inv_dct64_step1_neon
+        movdup_if       d0,  lr,  #2896*8, \scale
+        vmov_if         d7,  #0,  \clear
+        sub             r10, r10, r8, lsl #1 // offset 1
+        sub             r9,  r9,  r8, lsl #1 // offset 9
+        add             r10, r10, r8         // offset 2
+        add             r9,  r9,  r8         // offset 10
+        add             r7,  r7,  r8         // offset 5
+        add             r11, r11, r8         // offset 13
+        vld1.16         d16, [r10, :64]      // in5  (offset 2)
+        vld1.16         d17, [r11, :64]      // in27 (offset 13)
+        vld1.16         d18, [r9,  :64]      // in21 (offset 10)
+        vld1.16         d19, [r7,  :64]      // in11 (offset 5)
+        vst1_if         d7,  [r10, :64], \clear
+        vst1_if         d7,  [r11, :64], \clear
+        vst1_if         d7,  [r9,  :64], \clear
+        vst1_if         d7,  [r7,  :64], \clear
+        scale_if        \scale, d0[0], q8, q9
+        bl              inv_dct64_step1_neon
+        movdup_if       d0,  lr,  #2896*8, \scale
+        vmov_if         d7,  #0,  \clear
+        sub             r10, r10, r8         // offset 1
+        sub             r9,  r9,  r8         // offset 9
+        add             r11, r11, r8         // offset 14
+        add             r7,  r7,  r8         // offset 6
+        vld1.16         d16, [r10, :64]      // in3  (offset 1)
+        vld1.16         d17, [r11, :64]      // in29 (offset 14)
+        vld1.16         d18, [r9,  :64]      // in19 (offset 9)
+        vld1.16         d19, [r7,  :64]      // in13 (offset 6)
+        vst1_if         d7,  [r10, :64], \clear
+        vst1_if         d7,  [r11, :64], \clear
+        vst1_if         d7,  [r9,  :64], \clear
+        vst1_if         d7,  [r7,  :64], \clear
+        scale_if        \scale, d0[0], q8, q9
+        bl              inv_dct64_step1_neon
+
+        sub             r6,  r6,  #2*4*32 // back to slot 32: the four step1 calls advanced r6 by 64 bytes each
+        add             r9,  r6,  #2*4*7 // r9 = slot 39, the high end for the step2 walk
+
+        bl              inv_dct64_step2_neon
+
+        pop             {r10-r11,pc}
+endfunc
+.endm
+
+def_dct64_func // defines inv_txfm_dct_4h_x64_neon
+def_dct64_func _clear, clear=1 // defines inv_txfm_dct_clear_4h_x64_neon
+def_dct64_func _clear_scale, clear=1, scale=1 // defines inv_txfm_dct_clear_scale_4h_x64_neon
+
+function inv_txfm_horz_dct_64x4_neon // Final add/sub of the 64-point DCT for 4 lines: reads the scratch at sp, writes 64 values per line at r6
+        vdup.16         q3,  r9 // per-lane shift count for vrshl (callers in this file pass -2 or -1, i.e. a rounding right shift)
+
+        mov             r7,  sp // low end of the scratch: sp as it was on entry, before the push below
+        add             r8,  sp,  #2*4*(64 - 4) // high end of the scratch: the last 4 slots of 64
+        add             r9,  r6,  #2*56 // second output pointer, 56 elements into the line, for the mirrored half
+
+        push            {r10-r11,lr}
+
+        mov             r10, #2*64 // output line stride in bytes
+        mov             r11, #-2*4*4 // r8 steps down 4 slots per load
+
+1:
+        vld1.16         {d16, d17, d18, d19}, [r7, :128]!
+        vld1.16         {d28, d29, d30, d31}, [r8, :128], r11
+        vld1.16         {d20, d21, d22, d23}, [r7, :128]!
+        vld1.16         {d24, d25, d26, d27}, [r8, :128], r11
+        transpose_4x4h  q8,  q9,  d16, d17, d18, d19
+        transpose_4x4h  q15, q14, d31, d30, d29, d28
+        transpose_4x4h  q10, q11, d20, d21, d22, d23
+        transpose_4x4h  q13, q12, d27, d26, d25, d24
+
+.macro store_addsub src0, src1, src2, src3 // one output line: q0 = sums -> [r6], q1 = differences -> [r9]
+        vqsub.s16       d3,  \src0,  \src1
+        vqsub.s16       d2,  \src2,  \src3
+        vqadd.s16       d0,  \src0,  \src1
+        vqadd.s16       d1,  \src2,  \src3
+        vrshl.s16       q1,  q1,  q3
+        vrshl.s16       q0,  q0,  q3
+        vrev64.16       q1,  q1 // reverse the four 16-bit lanes inside each d register for the mirrored half
+        vst1.16         {q0},  [r6, :128], r10
+        vst1.16         {q1},  [r9, :128], r10
+.endm
+        store_addsub    d16, d31, d20, d27
+        store_addsub    d17, d30, d21, d26
+        store_addsub    d18, d29, d22, d25
+        store_addsub    d19, d28, d23, d24
+.purgem store_addsub
+        sub             r6,  r6,  r10, lsl #2 // rewind both output pointers over the 4 lines just written
+        sub             r9,  r9,  r10, lsl #2
+        add             r6,  r6,  #16 // left half moves 8 elements to the right
+        sub             r9,  r9,  #16 // right half moves 8 elements to the left
+
+        cmp             r7,  r8
+        blt             1b // stop once the ascending and descending read pointers have crossed
+        pop             {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_vert_dct_4x64_neon // Final add/sub of the 64-point DCT for a 4-pixel-wide column, added into the 8-bit destination at r6
+        lsl             r8,  r8,  #1 // NOTE(review): r8 is overwritten two instructions below, so this result is never read here
+
+        mov             r7,  sp // low end of the scratch: sp as it was on entry, before the push below
+        add             r8,  sp,  #2*4*(64 - 4) // high end of the scratch: the last 4 slots of 64
+        add             r9,  r6,  r1, lsl #6
+        sub             r9,  r9,  r1 // r9 = r6 + 63 * r1: the last of the 64 destination rows
+
+        push            {r10-r11,lr}
+
+        neg             r10, r1 // negative row stride for walking r9 upward
+        mov             r11, #-2*4*4 // r8 steps down 4 slots per load
+
+1:
+        vld1.16         {d16, d17, d18, d19}, [r7, :128]!
+        vld1.16         {d28, d29, d30, d31}, [r8, :128], r11
+        vld1.16         {d20, d21, d22, d23}, [r7, :128]!
+        vld1.16         {d24, d25, d26, d27}, [r8, :128], r11
+
+.macro add_dest_addsub src0, src1, src2, src3 // two rows from the top (sums) and two rows from the bottom (differences)
+        vld1.32         {d0[0]}, [r6, :32], r1
+        vld1.32         {d1[0]}, [r9, :32], r10
+        vqadd.s16       d4,  \src0,  \src1
+        vld1.32         {d0[1]}, [r6, :32]
+        vqadd.s16       d5,  \src2,  \src3
+        vld1.32         {d1[1]}, [r9, :32]
+        vqsub.s16       d6,  \src0,  \src1
+        vqsub.s16       d7,  \src2,  \src3
+        sub             r6,  r6,  r1 // rewind to the first of the two rows just loaded
+        sub             r9,  r9,  r10 // same for the bottom pointer (r10 is -r1)
+        vrshr.s16       q2,  q2,  #4 // rounding shift of the residual
+        vrshr.s16       q3,  q3,  #4
+        vaddw.u8        q2,  q2,  d0 // add the widened destination pixels
+        vaddw.u8        q3,  q3,  d1
+        vqmovun.s16     d0,  q2 // saturate back to unsigned 8 bit
+        vqmovun.s16     d1,  q3
+        vst1.32         {d0[0]}, [r6, :32], r1
+        vst1.32         {d1[0]}, [r9, :32], r10
+        vst1.32         {d0[1]}, [r6, :32], r1
+        vst1.32         {d1[1]}, [r9, :32], r10
+.endm
+        add_dest_addsub d16, d31, d17, d30
+        add_dest_addsub d18, d29, d19, d28
+        add_dest_addsub d20, d27, d21, d26
+        add_dest_addsub d22, d25, d23, d24
+.purgem add_dest_addsub
+        cmp             r7,  r8
+        blt             1b // stop once the ascending and descending read pointers have crossed
+
+        pop             {r10-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1 // 64x64 inverse DCT in two passes through a stack buffer, added into the destination at r0
+        idct_dc         64,  64,  2 // macro defined outside this chunk; NOTE(review): presumably the DC-only shortcut - confirm
+
+        push            {r4-r11,lr}
+        vpush           {q4} // q4 is written by the dct64 step helpers
+
+        sub_sp_align    64*32*2+64*4*2 // intermediate buffer plus a 64*4*2 byte scratch area at the bottom
+        add             r5,  sp,  #64*4*2 // r5 = intermediate buffer; the scratch below it is what the callees address via sp
+
+        movrel_local    r10, eob_32x32 // table of 16-bit thresholds, read one per group below
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // first pass, 4 lines per iteration
+        add             r6,  r5,  #(\i*64*2)
+.if \i > 0
+        mov             r8,  #(32 - \i) // lines left to zero-fill if we bail out here
+        cmp             r3,  r11 // NOTE(review): r3 is presumably the eob argument - confirm against the C prototype
+        blt             1f // below the threshold: the remaining groups are zero-filled instead of transformed
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_dct_clear_4h_x64_neon
+        add             r6,  r5,  #(\i*64*2) // recomputed because the callee overwrites r6
+        mov             r9,  #-2 // shift
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            r11, [r10], #2 // threshold for the next group
+.endif
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2 // each pass of this loop clears 8 * 32 = 256 bytes; the stores leave the flags alone
+.rept 8
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 // second pass, 4 pixel columns per iteration
+        add             r7,  r5,  #(\i*2)
+        mov             r8,  #64*2
+        bl              inv_txfm_dct_4h_x64_neon
+        add             r6,  r0,  #(\i) // destination column offset in pixels (bytes)
+        bl              inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+        add_sp_align    64*32*2+64*4*2
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1 // 64x32 inverse DCT in two passes through a stack buffer, added into the destination at r0
+        idct_dc         64,  32,  1 // macro defined outside this chunk; NOTE(review): presumably the DC-only shortcut - confirm
+
+        push            {r4-r11,lr}
+        vpush           {q4} // q4 is written by the dct64 step helpers
+
+        sub_sp_align    64*32*2+64*4*2 // intermediate buffer plus a 64*4*2 byte scratch area at the bottom
+        add             r5,  sp,  #64*4*2 // r5 = intermediate buffer; the scratch below it is what the callees address via sp
+
+        movrel_local    r10, eob_32x32 // table of 16-bit thresholds, read one per group below
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // first pass, 4 lines per iteration
+        add             r6,  r5,  #(\i*64*2)
+.if \i > 0
+        mov             r8,  #(32 - \i) // lines left to zero-fill if we bail out here
+        cmp             r3,  r11 // NOTE(review): r3 is presumably the eob argument - confirm against the C prototype
+        blt             1f // below the threshold: the remaining groups are zero-filled instead of transformed
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_dct_clear_scale_4h_x64_neon // the variant that also pre-scales its inputs
+        add             r6,  r5,  #(\i*64*2) // recomputed because the callee overwrites r6
+        mov             r9,  #-1 // shift
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            r11, [r10], #2 // threshold for the next group
+.endif
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2 // each pass of this loop clears 8 * 32 = 256 bytes; the stores leave the flags alone
+.rept 8
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 // second pass, 4 pixel columns per iteration
+        add             r6,  r0,  #(\i) // destination column offset in pixels (bytes)
+        add             r7,  r5,  #(\i*2)
+        mov             r8,  #64*2
+        bl              inv_txfm_add_vert_dct_4x32_neon // defined outside this chunk
+.endr
+
+        add_sp_align    64*32*2+64*4*2
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1 // 32x64 inverse DCT in two passes through a stack buffer, added into the destination at r0
+        idct_dc         32,  64,  1 // macro defined outside this chunk; NOTE(review): presumably the DC-only shortcut - confirm
+
+        push            {r4-r11,lr}
+        vpush           {q4} // q4 is written by the dct64 step helpers
+
+        sub_sp_align    32*32*2+64*4*2 // intermediate buffer plus a 64*4*2 byte scratch area at the bottom
+        add             r5,  sp,  #64*4*2 // r5 = intermediate buffer; the scratch below it is what the dct64 callees address via sp
+
+        movrel_local    r10, eob_32x32
+        ldrh            r11, [r10], #2 // first threshold, loaded up front in this variant
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // first pass, 4 lines per iteration
+        add             r6,  r5,  #(\i*32*2)
+.if \i > 0
+        mov             r8,  #(32 - \i) // lines left to zero-fill if we bail out here
+        cmp             r3,  r11 // NOTE(review): r3 is presumably the eob argument - confirm against the C prototype
+        blt             1f // below the threshold: the remaining groups are zero-filled instead of transformed
+        ldrh            r11, [r10], #2 // threshold for the next group
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_horz_scale_dct_32x4_neon // defined outside this chunk
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2 // each pass of this loop clears 4 * 32 = 128 bytes; the stores leave the flags alone
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // second pass, 4 pixel columns per iteration
+        add             r7,  r5,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_dct_4h_x64_neon
+        add             r6,  r0,  #(\i) // destination column offset in pixels (bytes)
+        bl              inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+        add_sp_align    32*32*2+64*4*2
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1 // 64x16 inverse DCT in two passes through a stack buffer, added into the destination at r0
+        idct_dc         64,  16,  2 // macro defined outside this chunk; NOTE(review): presumably the DC-only shortcut - confirm
+
+        push            {r4-r11,lr}
+        vpush           {q4} // q4 is written by the dct64 step helpers
+
+        sub_sp_align    64*16*2+64*4*2 // intermediate buffer plus a 64*4*2 byte scratch area at the bottom
+        add             r4,  sp,  #64*4*2 // r4 = intermediate buffer here (r5 is loaded with a function address further down)
+
+        movrel_local    r10, eob_16x32 // table of 16-bit thresholds
+
+.irp i, 0, 4, 8, 12 // first pass, 4 lines per iteration
+        add             r6,  r4,  #(\i*64*2)
+.if \i > 0
+        mov             r8,  #(16 - \i) // lines left to zero-fill if we bail out here
+        cmp             r3,  r11 // NOTE(review): r3 is presumably the eob argument - confirm against the C prototype
+        blt             1f // below the threshold: the remaining groups are zero-filled instead of transformed
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #16*2
+        bl              inv_txfm_dct_clear_4h_x64_neon
+        add             r6,  r4,  #(\i*64*2) // recomputed because the callee overwrites r6
+        mov             r9,  #-2 // shift
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 8
+        ldrh            r11, [r10], #2 // loaded after groups 0 and 4 only, so groups 8 and 12 compare against the same value
+.endif
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #2 // each pass of this loop clears 8 * 32 = 256 bytes; the stores leave the flags alone
+.rept 8
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+        movrel_local    r5,  inv_dct_4h_x16_neon // NOTE(review): presumably the transform the generic vert helper calls through r5 - confirm
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 // second pass, 4 pixel columns per iteration
+        add             r6,  r0,  #(\i) // destination column offset in pixels (bytes)
+        add             r7,  r4,  #(\i*2)
+        mov             r8,  #64*2
+        bl              inv_txfm_add_vert_4x16_neon // defined outside this chunk
+.endr
+
+        add_sp_align    64*16*2+64*4*2
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1 // 16x64 inverse DCT in two passes through a stack buffer, added into the destination at r0
+        idct_dc         16,  64,  2 // macro defined outside this chunk; NOTE(review): presumably the DC-only shortcut - confirm
+
+        push            {r4-r11,lr}
+        vpush           {q4} // q4 is written by the dct64 step helpers
+
+        sub_sp_align    16*32*2+64*4*2 // intermediate buffer plus a 64*4*2 byte scratch area at the bottom
+        add             r5,  sp,  #64*4*2 // r5 = intermediate buffer; the scratch below it is what the dct64 callees address via sp
+
+        movrel_local    r10, eob_16x32
+        ldrh            r11, [r10], #2 // first threshold, loaded up front in this variant
+
+        movrel_local    r4,  inv_dct_4h_x16_neon // NOTE(review): presumably the transform the generic horz helper calls through r4 - confirm
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28 // first pass, 4 lines per iteration
+        add             r6,  r5,  #(\i*16*2)
+.if \i > 0
+        mov             r8,  #(32 - \i) // lines left to zero-fill if we bail out here
+        cmp             r3,  r11 // NOTE(review): r3 is presumably the eob argument - confirm against the C prototype
+        blt             1f // below the threshold: the remaining groups are zero-filled instead of transformed
+        ldrh            r11, [r10], #2 // threshold for the next group
+.endif
+        add             r7,  r2,  #(\i*2)
+        mov             r8,  #32*2
+        bl              inv_txfm_horz_16x4_neon // defined outside this chunk
+.endr
+        b               3f
+
+1:
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+2:
+        subs            r8,  r8,  #4 // each pass of this loop clears 4 * 32 = 128 bytes; the stores leave the flags alone
+.rept 4
+        vst1.16         {q2, q3}, [r6, :128]!
+.endr
+        bgt             2b
+
+3:
+.irp i, 0, 4, 8, 12 // second pass, 4 pixel columns per iteration
+        add             r7,  r5,  #(\i*2)
+        mov             r8,  #16*2
+        bl              inv_txfm_dct_4h_x64_neon
+        add             r6,  r0,  #(\i) // destination column offset in pixels (bytes)
+        bl              inv_txfm_add_vert_dct_4x64_neon
+.endr
+
+        add_sp_align    16*32*2+64*4*2
+        vpop            {q4}
+        pop             {r4-r11,pc}
+endfunc
diff --git a/ffmpeg/JNI/dav1d/src/arm/32/mc.S b/ffmpeg/JNI/dav1d/src/arm/32/mc.S
index 36f6c2e2b..47631c071 100644
--- a/ffmpeg/JNI/dav1d/src/arm/32/mc.S
+++ b/ffmpeg/JNI/dav1d/src/arm/32/mc.S
@@ -3168,3 +3168,184 @@ endfunc
 
 warp  , 11
 warp t, 7
+
+// void dav1d_emu_edge_8bpc_neon(
+//         const intptr_t bw, const intptr_t bh,
+//         const intptr_t iw, const intptr_t ih,
+//         const intptr_t x, const intptr_t y,
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1 // Copy the in-bounds part of a block from ref to dst and replicate edge pixels into the out-of-bounds part
+        push            {r4-r11,lr}
+        ldrd            r4,  r5,  [sp, #36] // r4 = x, r5 = y: the stack arguments start above the 9 registers (36 bytes) just pushed
+        ldrd            r6,  r7,  [sp, #44] // r6 = dst, r7 = dst_stride
+        ldrd            r8,  r9,  [sp, #52] // r8 = ref, r9 = ref_stride
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             r12, r3,  #1           // ih - 1
+        cmp             r5,  r3
+        sub             lr,  r2,  #1           // iw - 1
+        it              lt
+        movlt           r12, r5                // min(y, ih - 1)
+        cmp             r4,  r2
+        bic             r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
+        it              lt
+        movlt           lr,  r4                // min(x, iw - 1)
+        bic             lr,  lr,  lr,  asr #31 // max(min(x, iw - 1), 0)
+        mla             r8,  r12, r9,  r8      // ref += iclip() * stride
+        add             r8,  r8,  lr           // ref += iclip()
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext = iclip(-y, 0, bh - 1)
+        add             r10, r5,  r1           // y + bh
+        neg             r5,  r5                // -y
+        sub             r10, r10, r3           // y + bh - ih
+        sub             r12, r1,  #1           // bh - 1
+        cmp             r10, r1
+        bic             r5,  r5,  r5,  asr #31 // max(-y, 0)
+        it              ge
+        movge           r10, r12               // min(y + bh - ih, bh-1)
+        cmp             r5,  r1
+        bic             r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
+        it              ge
+        movge           r5,  r12               // min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext = iclip(-x, 0, bw - 1)
+        add             r11, r4,  r0           // x + bw
+        neg             r4,  r4                // -x
+        sub             r11, r11, r2           // x + bw - iw
+        sub             lr,  r0,  #1           // bw - 1
+        cmp             r11, r0
+        bic             r4,  r4,  r4,  asr #31 // max(-x, 0)
+        it              ge
+        movge           r11, lr                // min(x + bw - iw, bw-1)
+        cmp             r4,  r0
+        bic             r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
+        it              ge
+        movge           r4,  lr                // min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             r1,  r1,  r5           // bh - top_ext
+        mla             r6,  r5,  r7,  r6      // dst += top_ext * dst_stride
+        sub             r2,  r0,  r4           // bw - left_ext
+        sub             r1,  r1,  r10          // center_h = bh - top_ext - bottom_ext
+        sub             r2,  r2,  r11          // center_w = bw - left_ext - right_ext
+
+        mov             r0,  r6                // backup of dst
+
+.macro v_loop need_left, need_right // one pass per centre row: optional left fill, centre copy, optional right fill
+0:
+.if \need_left
+        vld1.8          {d0[]}, [r8]           // broadcast the first ref pixel of this row
+        mov             r12, r6                // out = dst
+        mov             r3,  r4
+        vmov            d1,  d0
+1:
+        subs            r3,  r3,  #16
+        vst1.8          {q0}, [r12, :128]!     // whole 16-byte chunks, so this can run past left_ext; the centre copy starts at dst + left_ext
+        bgt             1b
+.endif
+        mov             lr,  r8
+        add             r12, r6,  r4           // out = dst + left_ext
+        mov             r3,  r2
+1:
+        vld1.8          {q0, q1}, [lr]!        // centre copy moves 32 bytes per iteration
+        subs            r3,  r3,  #32
+.if \need_left
+        vst1.8          {q0, q1}, [r12]!       // no alignment qualifier: out is offset by left_ext
+.else
+        vst1.8          {q0, q1}, [r12, :128]!
+.endif
+        bgt             1b
+.if \need_right
+        add             r3,  r8,  r2           // in + center_w
+        sub             r3,  r3,  #1           // in + center_w - 1
+        add             r12, r6,  r4           // dst + left_ext
+        vld1.8          {d0[]}, [r3]           // broadcast the last centre pixel of this row
+        add             r12, r12, r2           // out = dst + left_ext + center_w
+        mov             r3,  r11
+        vmov            d1,  d0
+1:
+        subs            r3,  r3,  #16
+        vst1.8          {q0}, [r12]!
+        bgt             1b
+.endif
+
+        subs            r1,  r1,  #1           // center_h--
+        add             r6,  r6,  r7           // dst += dst_stride
+        add             r8,  r8,  r9           // ref += ref_stride
+        bgt             0b
+.endm
+
+        cmp             r4,  #0
+        beq             2f
+        // need_left
+        cmp             r11, #0
+        beq             3f
+        // need_left + need_right
+        v_loop          1,   1
+        b               5f
+
+2:
+        // !need_left
+        cmp             r11, #0
+        beq             4f
+        // !need_left + need_right
+        v_loop          0,   1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1,   0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0,   0
+
+5:
+        cmp             r10, #0                // flags are consumed by the beq below; the two adds in between do not set flags
+        // Storing the original dst in r0 overwrote bw, recalculate it here
+        add             r2,  r2,  r4           // center_w + left_ext
+        add             r2,  r2,  r11          // bw = center_w + left_ext + right_ext
+
+        beq             3f
+        // need_bottom
+        sub             r8,  r6,  r7           // ref = dst - stride
+        mov             r4,  r2                // column counter = bw (left_ext in r4 is no longer needed)
+1:
+        vld1.8          {q0, q1}, [r8, :128]!  // 32 pixels of the last centre row
+        mov             r3,  r10
+2:
+        subs            r3,  r3,  #1
+        vst1.8          {q0, q1}, [r6, :128], r7 // replicate them down bottom_ext rows
+        bgt             2b
+        mls             r6,  r7,  r10,  r6     // dst -= bottom_ext * stride
+        subs            r4,  r4,  #32          // bw -= 32
+        add             r6,  r6,  #32          // dst += 32
+        bgt             1b
+
+3:
+        cmp             r5,  #0
+        beq             3f
+        // need_top
+        mls             r6,  r7,  r5,  r0      // dst = stored_dst - top_ext * stride
+1:
+        vld1.8          {q0, q1}, [r0, :128]!  // 32 pixels of the first centre row
+        mov             r3,  r5
+2:
+        subs            r3,  r3,  #1
+        vst1.8          {q0, q1}, [r6, :128], r7 // replicate them down top_ext rows
+        bgt             2b
+        mls             r6,  r7,  r5,  r6      // dst -= top_ext * stride
+        subs            r2,  r2,  #32          // bw -= 32
+        add             r6,  r6,  #32          // dst += 32
+        bgt             1b
+
+3:
+        pop             {r4-r11,pc}
+endfunc
diff --git a/ffmpeg/JNI/dav1d/src/arm/32/msac.S b/ffmpeg/JNI/dav1d/src/arm/32/msac.S
new file mode 100644
index 000000000..b06e109dd
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/src/arm/32/msac.S
@@ -0,0 +1,575 @@
+/*
+ * Copyright © 2019, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+#define BUF_POS 0 // byte offset from the context pointer (r0): current read pointer into the input
+#define BUF_END 4 // byte offset of the end-of-input pointer that BUF_POS is compared against
+#define DIF 8 // byte offset of the 32-bit dif window
+#define RNG 12 // byte offset of rng (stored as a word, read as a halfword by the NEON paths)
+#define CNT 16 // byte offset of cnt, the counter decremented by d on every renormalisation
+#define ALLOW_UPDATE_CDF 20 // byte offset of the flag that enables CDF adaptation when non-zero
+
+const coeffs
+        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0  // entry i = 4 * (15 - i); users start reading at byte offset 30 - 2 * n_symbols
+        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0  // zero tail covering vector loads that begin inside the first row
+endconst
+
+const bits, align=4
+        .short   0x1,   0x2,   0x4,   0x8,   0x10,   0x20,   0x40,   0x80  // lane i holds 1 << i (lanes 0-7)
+        .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000  // lanes 8-15; ANDed with compare results to build a 16-bit mask
+endconst
+
+.macro vld1_align_n d0, q0, q1, src, n
+.if \n == 4
+        vld1.16         {\d0},  [\src, :64]  // n == 4: load the register passed as d0, 64-bit alignment hint
+.elseif \n == 8
+        vld1.16         {\q0},  [\src, :128]  // n == 8: load one register (q0), 128-bit alignment hint
+.else
+        vld1.16         {\q0, \q1},  [\src, :128]  // any other n (16 in this file): load q0 and q1, 128-bit alignment hint
+.endif
+.endm
+
+.macro vld1_n d0, q0, q1, src, n
+.if \n == 4
+        vld1.16         {\d0},  [\src]  // n == 4: load the register passed as d0, no alignment hint
+.elseif \n == 8
+        vld1.16         {\q0},  [\src]  // n == 8: load one register (q0), no alignment hint
+.else
+        vld1.16         {\q0, \q1},  [\src]  // any other n (16 in this file): load q0 and q1, no alignment hint
+.endif
+.endm
+
+.macro vst1_align_n d0, q0, q1, src, n
+.if \n == 4
+        vst1.16         {\d0},  [\src, :64]  // n == 4: store the register passed as d0 to [src], 64-bit alignment hint
+.elseif \n == 8
+        vst1.16         {\q0},  [\src, :128]  // n == 8: store one register (q0), 128-bit alignment hint
+.else
+        vst1.16         {\q0, \q1},  [\src, :128]  // any other n (16 in this file): store q0 and q1, 128-bit alignment hint
+.endif
+.endm
+
+.macro vst1_n d0, q0, q1, src, n
+.if \n == 4
+        vst1.16         {\d0},  [\src]  // n == 4: store the register passed as d0 to [src], no alignment hint
+.elseif \n == 8
+        vst1.16         {\q0},  [\src]  // n == 8: store one register (q0), no alignment hint
+.else
+        vst1.16         {\q0, \q1},  [\src]  // any other n (16 in this file): store q0 and q1, no alignment hint
+.endif
+.endm
+
+.macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vshr.u16        \d0,  \s0,  \s3  // n == 4: only the first operand triple (d0, s0, s3); no invocation of this macro appears in this file
+.else
+        vshr.u16        \d1,  \s1,  \s4  // n == 8 or 16: second operand triple (d1, s1, s4)
+.if \n == 16
+        vshr.u16        \d2,  \s2,  \s5  // n == 16 only: additionally the third operand triple (d2, s2, s5)
+.endif
+.endif
+.endm
+
+.macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vadd.i16        \d0,  \s0,  \s3  // n == 4: d0 = s0 + s3 (16-bit lanes), first operand triple only
+.else
+        vadd.i16        \d1,  \s1,  \s4  // n == 8 or 16: d1 = s1 + s4
+.if \n == 16
+        vadd.i16        \d2,  \s2,  \s5  // n == 16 only: additionally d2 = s2 + s5
+.endif
+.endif
+.endm
+
+.macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vsub.i16        \d0,  \s0,  \s3  // n == 4: d0 = s0 - s3 (16-bit lanes), first operand triple only
+.else
+        vsub.i16        \d1,  \s1,  \s4  // n == 8 or 16: d1 = s1 - s4
+.if \n == 16
+        vsub.i16        \d2,  \s2,  \s5  // n == 16 only: additionally d2 = s2 - s5
+.endif
+.endif
+.endm
+
+.macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vand            \d0,  \s0,  \s3  // n == 4: d0 = s0 & s3, first operand triple only
+.else
+        vand            \d1,  \s1,  \s4  // n == 8 or 16: d1 = s1 & s4
+.if \n == 16
+        vand            \d2,  \s2,  \s5  // n == 16 only: additionally d2 = s2 & s5
+.endif
+.endif
+.endm
+
+.macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vcge.u16        \d0,  \s0,  \s3  // n == 4: per-lane unsigned s0 >= s3, all-ones where true, first operand triple only
+.else
+        vcge.u16        \d1,  \s1,  \s4  // n == 8 or 16: same compare on (d1, s1, s4)
+.if \n == 16
+        vcge.u16        \d2,  \s2,  \s5  // n == 16 only: additionally on (d2, s2, s5)
+.endif
+.endif
+.endm
+
+.macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vrhadd.u16      \d0,  \s0,  \s3  // n == 4: d0 = (s0 + s3 + 1) >> 1 per unsigned 16-bit lane, first operand triple only
+.else
+        vrhadd.u16      \d1,  \s1,  \s4  // n == 8 or 16: same rounding halving add on (d1, s1, s4)
+.if \n == 16
+        vrhadd.u16      \d2,  \s2,  \s5  // n == 16 only: additionally on (d2, s2, s5)
+.endif
+.endif
+.endm
+
+.macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vshl.s16        \d0,  \s0,  \s3  // n == 4: signed per-lane shift of s0 by s3 (the caller in this file passes -rate, giving a right shift)
+.else
+        vshl.s16        \d1,  \s1,  \s4  // n == 8 or 16: same shift on (d1, s1, s4)
+.if \n == 16
+        vshl.s16        \d2,  \s2,  \s5  // n == 16 only: additionally on (d2, s2, s5)
+.endif
+.endif
+.endm
+
+.macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
+.if \n == 4
+        vqdmulh.s16     \d0,  \s0,  \s3  // n == 4: saturating doubling multiply, high half, on the first operand triple only
+.else
+        vqdmulh.s16     \d1,  \s1,  \s4  // n == 8 or 16: same multiply on (d1, s1, s4)
+.if \n == 16
+        vqdmulh.s16     \d2,  \s2,  \s5  // n == 16 only: additionally on (d2, s2, s5)
+.endif
+.endif
+.endm
+
+// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
+//                                               size_t n_symbols);
+
+function msac_decode_symbol_adapt4_neon, export=1
+.macro decode_update n
+        push            {r4-r10,lr}                                    // decode_update: r0 = context, r1 = cdf, r2 = n_symbols; n = number of 16-bit CDF lanes handled
+        sub             sp,  sp,  #48                                  // stack scratch: u is kept at sp+14, the v[] vector starts at sp+16
+        add             r8,  r0,  #RNG                                 // r8 = address of rng in the context
+
+        vld1_align_n    d0,  q0,  q1,  r1,  \n                         // cdf
+        vld1.16         {d16[]}, [r8, :16]                             // rng
+        movrel_local    r9,  coeffs, 30                                // presumably r9 = coeffs + 30 bytes (macro is defined outside this file)
+        vmov.i16        d30, #0x7f00                                   // 0x7f00
+        sub             r9,  r9,  r2, lsl #1                           // r9 -= 2 * n_symbols
+        vmvn.i16        q14, #0x3f                                     // 0xffc0
+        add             r8,  sp,  #14                                  // halfword slot directly below v[0]
+        vand            d22, d16, d30                                  // rng & 0x7f00
+        vst1.16         {d16[0]}, [r8, :16]                            // store original u = s->rng
+        vand_n          d4,  q2,  q3,  d0,  q0,  q1, d28, q14, q14, \n // cdf & 0xffc0
+.if \n > 4
+        vmov            d23, d22                                       // widen rng & 0x7f00 to all of q11 for the 8/16-lane multiplies
+.endif
+
+        vld1_n          d16, q8,  q9,  r9,  \n                          // EC_MIN_PROB * (n_symbols - ret)
+        vqdmulh_n       d20, q10, q11, d4,  q2,  q3,  d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+        add             r8,  r0,  #DIF + 2                              // address of the upper halfword of dif (assuming little-endian layout)
+
+        vadd_n          d16, q8,  q9,  d4,  q2,  q3,  d16, q8,  q9,  \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+.if \n == 4
+        vmov.i16        d17, #0                                         // clear the upper half of q8, which is stored and compared as a whole below
+.endif
+        vadd_n          d16, q8,  q9,  d20, q10, q11, d16, q8,  q9,  \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+
+        add             r9,  sp,  #16                                   // r9 = &v[0] in the stack scratch
+        vld1.16         {d20[]}, [r8, :16]                              // dif >> (EC_WIN_SIZE - 16)
+        movrel_local    r8,  bits                                       // r8 = address of the per-lane bit table
+        vst1_n          q8,  q8,  q9,  r9,  \n                          // store v values to allow indexed access
+
+        vmov            d21, d20                                        // replicate c into the upper half of q10
+        vld1_align_n    q12, q12, q13, r8,  \n                          // per-lane bit values (1 << lane)
+.if \n == 16
+        vmov            q11, q10                                        // c for lanes 8-15
+.endif
+
+        vcge_n          q2,  q2,  q3,  q10, q10, q11, q8,  q8,  q9,  \n // c >= v
+
+        vand_n          q10, q10, q11, q2,  q2,  q3,  q12, q12, q13, \n // One bit per halfword set in the mask
+.if \n == 16
+        vadd.i16        q10, q10, q11                                   // merge the bits of lanes 8-15 into lanes 0-7
+.endif
+        vadd.i16        d20, d20, d21                                   // Aggregate mask bits
+        ldr             r4,  [r0, #ALLOW_UPDATE_CDF]                    // r4 = allow_update_cdf, tested by the cmp below
+        vpadd.i16       d20, d20, d20
+        lsl             r10, r2,  #1                                    // byte offset of cdf[n_symbols]
+        vpadd.i16       d20, d20, d20
+        vmov.u16        r3,  d20[0]                                     // r3 = 16-bit mask, bit i set where c >= v[i]
+        cmp             r4,  #0                                         // flags consumed by the beq below
+        rbit            r3,  r3
+        clz             lr,  r3                                         // ret = index of the lowest set mask bit (rbit + clz)
+
+        beq             L(renorm)                                       // CDF adaptation disabled: skip the update
+        // update_cdf
+        ldrh            r3,  [r1, r10]                                  // count = cdf[n_symbols]
+        vmov.i8         q10, #0xff                                      // all ones
+.if \n == 16
+        mov             r4,  #-5
+.else
+        mvn             r12, r2                                         // ~n_symbols
+        mov             r4,  #-4
+        cmn             r12, #3                                         // set C if n_symbols <= 2
+.endif
+        vrhadd_n        d16, q8,  q9,  d20, q10, q10, d4,  q2,  q3,  \n // i >= val ? -1 : 32768
+.if \n == 16
+        sub             r4,  r4,  r3, lsr #4                            // -((count >> 4) + 5)
+.else
+        lsr             r12, r3,  #4                                    // count >> 4
+        sbc             r4,  r4,  r12                                   // -((count >> 4) + (n_symbols > 2) + 4)
+.endif
+        vsub_n          d16, q8,  q9,  d16, q8,  q9,  d0,  q0,  q1,  \n // (32768 - cdf[i]) or (-1 - cdf[i])
+.if \n == 4
+        vdup.16         d20, r4                                         // -rate
+.else
+        vdup.16         q10, r4                                         // -rate
+.endif
+
+        sub             r3,  r3,  r3, lsr #5                            // count - (count == 32)
+        vsub_n          d0,  q0,  q1,  d0,  q0,  q1,  d4,  q2,  q3,  \n // cdf + (i >= val ? 1 : 0)
+        vshl_n          d16, q8,  q9,  d16, q8,  q9,  d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate
+        add             r3,  r3,  #1                                    // count + (count < 32)
+        vadd_n          d0,  q0,  q1,  d0,  q0,  q1,  d16, q8,  q9,  \n // cdf + (32768 - cdf[i]) >> rate
+        vst1_align_n    d0,  q0,  q1,  r1,  \n                          // write the adapted cdf back
+        strh            r3,  [r1, r10]                                  // cdf[n_symbols] = updated count
+.endm
+
+        decode_update   4                      // 4-lane variant; falls through into the shared tail below
+
+L(renorm):
+        add             r8,  sp,  #16          // &v[0]
+        add             r8,  r8,  lr, lsl #1   // &v[ret]
+        ldrh            r3,  [r8]              // v
+        ldrh            r4,  [r8, #-2]         // u = v[ret - 1], or the saved rng at sp+14 when ret == 0
+        ldr             r6,  [r0, #CNT]
+        ldr             r7,  [r0, #DIF]
+        sub             r4,  r4,  r3           // rng = u - v
+        clz             r5,  r4                // clz(rng)
+        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
+        mvn             r7,  r7                // ~dif
+        add             r7,  r7,  r3, lsl #16  // ~dif + (v << 16)
+L(renorm2): // also entered from the bool decoders with r4 = rng, r5 = d, r6 = cnt, r7 = ~dif, lr = return value
+        lsl             r4,  r4,  r5           // rng << d
+        subs            r6,  r6,  r5           // cnt -= d
+        lsl             r7,  r7,  r5           // (~dif + (v << 16)) << d
+        str             r4,  [r0, #RNG]
+        mvn             r7,  r7                // ~dif
+        bhs             9f                     // no borrow from cnt -= d: skip the refill
+
+        // refill
+        ldr             r3,  [r0, #BUF_POS]    // BUF_POS
+        ldr             r4,  [r0, #BUF_END]    // BUF_END
+        add             r5,  r3,  #4           // buf_pos + 4
+        cmp             r5,  r4
+        bgt             2f                     // fewer than 4 bytes left: byte-wise refill
+
+        ldr             r3,  [r3]              // next_bits
+        add             r8,  r6,  #23          // shift_bits = cnt + 23
+        add             r6,  r6,  #16          // cnt += 16
+        rev             r3,  r3                // next_bits = bswap(next_bits)
+        sub             r5,  r5,  r8, lsr #3   // buf_pos -= shift_bits >> 3
+        and             r8,  r8,  #24          // shift_bits &= 24
+        lsr             r3,  r3,  r8           // next_bits >>= shift_bits
+        sub             r8,  r8,  r6           // shift_bits -= 16 + cnt
+        str             r5,  [r0, #BUF_POS]
+        lsl             r3,  r3,  r8           // next_bits <<= shift_bits
+        rsb             r6,  r8,  #16          // cnt = cnt + 32 - shift_bits
+        eor             r7,  r7,  r3           // dif ^= next_bits
+        b               9f
+
+2:      // refill_eob
+        rsb             r5,  r6,  #8           // c = 8 - cnt
+3:
+        cmp             r3,  r4
+        bge             4f                     // buf_pos >= buf_end: input exhausted
+        ldrb            r8,  [r3], #1          // next byte, post-incrementing buf_pos
+        lsl             r8,  r8,  r5           // byte << c
+        eor             r7,  r7,  r8           // dif ^= byte << c
+        subs            r5,  r5,  #8           // c -= 8
+        bge             3b                     // keep going while c >= 0
+
+4:      // refill_eob_end
+        str             r3,  [r0, #BUF_POS]
+        rsb             r6,  r5,  #8           // cnt = 8 - c
+
+9:
+        str             r6,  [r0, #CNT]        // write back cnt
+        str             r7,  [r0, #DIF]        // write back dif
+
+        mov             r0,  lr                // return value
+        add             sp,  sp,  #48          // release the scratch area
+
+        pop             {r4-r10,pc}
+endfunc
+
+function msac_decode_symbol_adapt8_neon, export=1
+        decode_update   8                      // 8-lane instance of the decode_update macro (one q register of cdf)
+        b               L(renorm)              // shared renormalise/refill/return tail inside msac_decode_symbol_adapt4_neon
+endfunc
+
+function msac_decode_symbol_adapt16_neon, export=1
+        decode_update   16                     // 16-lane instance of the decode_update macro (two q registers of cdf)
+        b               L(renorm)              // shared renormalise/refill/return tail inside msac_decode_symbol_adapt4_neon
+endfunc
+
+function msac_decode_hi_tok_neon, export=1
+        push            {r4-r10,lr}            // r0 = context, r1 = cdf; decodes 3-symbol CDF steps in a loop, state kept in registers
+        vld1.16         {d0},  [r1, :64]       // cdf
+        add             r4,  r0,  #RNG
+        vmov.i16        d31, #0x7f00           // 0x7f00
+        movrel_local    r5,  coeffs, 30-2*3    // same table lookup as decode_update with n_symbols fixed to 3
+        vmvn.i16        d30, #0x3f             // 0xffc0
+        ldrh            r9,  [r1, #6]          // count = cdf[n_symbols]
+        vld1.16         {d1[]},  [r4, :16]     // rng
+        movrel_local    r4,  bits
+        vld1.16         {d29}, [r5]            // EC_MIN_PROB * (n_symbols - ret)
+        add             r5,  r0,  #DIF + 2     // address of the upper halfword of dif (assuming little-endian layout)
+        vld1.16         {q8}, [r4, :128]       // per-lane bit values (1 << lane)
+        mov             r2,  #-24              // loop accumulator; gains 2 * ret per pass (+5 below, 2 * ret - 5 after renorm)
+        vand            d20, d0, d30           // cdf & 0xffc0
+        ldr             r10, [r0, #ALLOW_UPDATE_CDF]
+        vld1.16         {d2[]}, [r5, :16]      // dif >> (EC_WIN_SIZE - 16)
+        sub             sp,  sp,  #48          // stack scratch: u at sp+14, v[] from sp+16
+        ldr             r6,  [r0, #CNT]
+        ldr             r7,  [r0, #DIF]
+        vmov            d3,  d2                // replicate c into the upper half of q1
+1:
+        vand            d23, d1,  d31          // rng & 0x7f00
+        vqdmulh.s16     d18, d20, d23          // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+        add             r12, sp,  #14          // halfword slot directly below v[0]
+        vadd.i16        d6,  d20, d29          // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+        vadd.i16        d6,  d18, d6           // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+        vmov.i16        d7,  #0                // clear the upper half of q3, which is stored and compared as a whole
+        vst1.16         {d1[0]}, [r12, :16]    // store original u = s->rng
+        add             r12, sp,  #16          // &v[0]
+        vcge.u16        q2,  q1,  q3           // c >= v
+        vst1.16         {q3},  [r12]           // store v values to allow indexed access
+        vand            q9,  q2,  q8           // One bit per halfword set in the mask
+
+        vadd.i16        d18, d18, d19          // Aggregate mask bits
+        vpadd.i16       d18, d18, d18
+        vpadd.i16       d18, d18, d18
+        vmov.u16        r3,  d18[0]            // r3 = mask, bit i set where c >= v[i]
+        cmp             r10, #0                // flags consumed by the beq below (add/rbit/clz leave them untouched)
+        add             r2,  r2,  #5           // pre-bias; 2 * ret - 5 is added after renormalisation
+        rbit            r3,  r3
+        add             r8,  sp,  #16          // &v[0]
+        clz             lr,  r3                // ret
+
+        beq             2f                     // CDF adaptation disabled: skip the update
+        // update_cdf
+        vmov.i8         d22, #0xff
+        mov             r4,  #-5
+        vrhadd.u16      d6,  d22, d4           // i >= val ? -1 : 32768
+        sub             r4,  r4,  r9, lsr #4   // -((count >> 4) + 5)
+        vsub.i16        d6,  d6,  d0           // (32768 - cdf[i]) or (-1 - cdf[i])
+        vdup.16         d18, r4                // -rate
+
+        sub             r9,  r9,  r9, lsr #5   // count - (count == 32)
+        vsub.i16        d0,  d0,  d4           // cdf + (i >= val ? 1 : 0)
+        vshl.s16        d6,  d6,  d18          // ({32768,-1} - cdf[i]) >> rate
+        add             r9,  r9,  #1           // count + (count < 32)
+        vadd.i16        d0,  d0,  d6           // cdf + (32768 - cdf[i]) >> rate
+        vst1.16         {d0},  [r1, :64]
+        vand            d20, d0,  d30          // cdf & 0xffc0
+        strh            r9,  [r1, #6]
+
+2:
+        add             r8,  r8,  lr, lsl #1   // &v[ret]
+        ldrh            r3,  [r8]              // v
+        ldrh            r4,  [r8, #-2]         // u
+        sub             r4,  r4,  r3           // rng = u - v
+        clz             r5,  r4                // clz(rng)
+        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
+        mvn             r7,  r7                // ~dif
+        add             r7,  r7,  r3, lsl #16  // ~dif + (v << 16)
+        lsl             r4,  r4,  r5           // rng << d
+        subs            r6,  r6,  r5           // cnt -= d
+        lsl             r7,  r7,  r5           // (~dif + (v << 16)) << d
+        str             r4,  [r0, #RNG]
+        vdup.16         d1,  r4                // broadcast the new rng for the next pass
+        mvn             r7,  r7                // ~dif
+        bhs             9f                     // no borrow from cnt -= d: skip the refill
+
+        // refill
+        ldr             r3,  [r0, #BUF_POS]    // BUF_POS
+        ldr             r4,  [r0, #BUF_END]    // BUF_END
+        add             r5,  r3,  #4
+        cmp             r5,  r4
+        bgt             2f                     // fewer than 4 bytes left: byte-wise refill
+
+        ldr             r3,  [r3]              // next_bits
+        add             r8,  r6,  #23          // shift_bits = cnt + 23
+        add             r6,  r6,  #16          // cnt += 16
+        rev             r3,  r3                // next_bits = bswap(next_bits)
+        sub             r5,  r5,  r8, lsr #3   // buf_pos -= shift_bits >> 3
+        and             r8,  r8,  #24          // shift_bits &= 24
+        lsr             r3,  r3,  r8           // next_bits >>= shift_bits
+        sub             r8,  r8,  r6           // shift_bits -= 16 + cnt
+        str             r5,  [r0, #BUF_POS]
+        lsl             r3,  r3,  r8           // next_bits <<= shift_bits
+        rsb             r6,  r8,  #16          // cnt = cnt + 32 - shift_bits
+        eor             r7,  r7,  r3           // dif ^= next_bits
+        b               9f
+
+2:      // refill_eob
+        rsb             r5,  r6,  #8           // c = 8 - cnt
+3:
+        cmp             r3,  r4
+        bge             4f
+        ldrb            r8,  [r3], #1
+        lsl             r8,  r8,  r5
+        eor             r7,  r7,  r8
+        subs            r5,  r5,  #8
+        bge             3b
+
+4:      // refill_eob_end
+        str             r3,  [r0, #BUF_POS]
+        rsb             r6,  r5,  #8           // cnt = 8 - c
+
+9:
+        lsl             lr,  lr,  #1           // 2 * ret
+        sub             lr,  lr,  #5           // 2 * ret - 5
+        lsr             r12, r7,  #16          // c = dif >> 16 for the next pass
+        adds            r2,  r2,  lr           // carry = tok_br < 3 || tok == 15
+        vdup.16         q1,  r12
+        bcc             1b                     // loop if !carry
+        add             r2,  r2,  #30
+        str             r6,  [r0, #CNT]
+        add             sp,  sp,  #48
+        str             r7,  [r0, #DIF]
+        lsr             r0,  r2,  #1           // return (r2 + 30) >> 1
+        pop             {r4-r10,pc}
+endfunc
+
+function msac_decode_bool_equi_neon, export=1
+        push            {r4-r10,lr}            // r0 = context; same frame as the symbol decoders so L(renorm2) can finish the call
+        ldr             r5,  [r0, #RNG]
+        ldr             r6,  [r0, #CNT]
+        sub             sp,  sp,  #48          // matches the add sp, sp, #48 in the shared tail
+        ldr             r7,  [r0, #DIF]
+        bic             r4,  r5,  #0xff        // r &= 0xff00
+        add             r4,  r4,  #8           // r4 = 2 * v
+        mov             r2,  #0                // return value defaults to 0
+        subs            r8,  r7,  r4, lsl #15  // dif - vw
+        lsr             r4,  r4,  #1           // v
+        sub             r5,  r5,  r4           // r - v
+        itee            lo                     // IT block: one lo instruction, then two hs instructions
+        movlo           r2,  #1                // return 1 when dif < vw
+        movhs           r4,  r5                // if (dif >= vw) v = r - v;
+        movhs           r7,  r8                // if (dif >= vw) dif = dif - vw;
+
+        clz             r5,  r4                // clz(rng)
+        mvn             r7,  r7                // ~dif
+        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
+        mov             lr,  r2                // the shared tail returns lr
+        b               L(renorm2)
+endfunc
+
+function msac_decode_bool_neon, export=1
+        push            {r4-r10,lr}            // r0 = context, r1 = f; same frame as the symbol decoders so L(renorm2) can finish the call
+        ldr             r5,  [r0, #RNG]
+        ldr             r6,  [r0, #CNT]
+        sub             sp,  sp,  #48          // matches the add sp, sp, #48 in the shared tail
+        ldr             r7,  [r0, #DIF]
+        lsr             r4,  r5,  #8           // r >> 8
+        bic             r1,  r1,  #0x3f        // f &= ~63
+        mul             r4,  r4,  r1           // (r >> 8) * f
+        mov             r2,  #0                // return value defaults to 0
+        lsr             r4,  r4,  #7
+        add             r4,  r4,  #4           // v
+        subs            r8,  r7,  r4, lsl #16  // dif - vw
+        sub             r5,  r5,  r4           // r - v
+        itee            lo                     // IT block: one lo instruction, then two hs instructions
+        movlo           r2,  #1                // return 1 when dif < vw
+        movhs           r4,  r5                // if (dif >= vw) v = r - v;
+        movhs           r7,  r8                // if (dif >= vw) dif = dif - vw;
+
+        clz             r5,  r4                // clz(rng)
+        mvn             r7,  r7                // ~dif
+        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
+        mov             lr,  r2                // the shared tail returns lr
+        b               L(renorm2)
+endfunc
+
+function msac_decode_bool_adapt_neon, export=1
+        push            {r4-r10,lr}            // r0 = context, r1 = cdf (two halfwords: cdf[0] and the count in cdf[1])
+        ldr             r9,  [r1]              // cdf[0-1]
+        ldr             r5,  [r0, #RNG]
+        movw            lr,  #0xffc0           // mask: keeps bits 6-15 of the low halfword, drops the upper halfword
+        ldr             r6,  [r0, #CNT]
+        sub             sp,  sp,  #48          // matches the add sp, sp, #48 in the shared tail
+        ldr             r7,  [r0, #DIF]
+        lsr             r4,  r5,  #8           // r >> 8
+        and             r2,  r9,  lr           // f = cdf[0] & ~63
+        mul             r4,  r4,  r2           // (r >> 8) * f
+        mov             r2,  #0                // bit defaults to 0
+        lsr             r4,  r4,  #7
+        add             r4,  r4,  #4           // v
+        subs            r8,  r7,  r4, lsl #16  // dif - vw
+        sub             r5,  r5,  r4           // r - v
+        ldr             r10, [r0, #ALLOW_UPDATE_CDF]
+        itee            lo                     // IT block: one lo instruction, then two hs instructions
+        movlo           r2,  #1                // bit = 1 when dif < vw
+        movhs           r4,  r5                // if (dif >= vw) v = r - v;
+        movhs           r7,  r8                // if (dif >= vw) dif = dif - vw;
+
+        cmp             r10, #0                // flags consumed by the beq below
+        clz             r5,  r4                // clz(rng)
+        mvn             r7,  r7                // ~dif
+        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
+        mov             lr,  r2                // the shared tail returns lr; also used as bit below
+
+        beq             L(renorm2)             // CDF adaptation disabled: skip the update
+
+        lsr             r2,  r9,  #16          // count = cdf[1]
+        uxth            r9,  r9                // cdf[0]
+
+        sub             r3,  r2,  r2,  lsr #5  // count - (count >= 32)
+        lsr             r2,  r2,  #4           // count >> 4
+        add             r10, r3,  #1           // count + (count < 32)
+        add             r2,  r2,  #4           // rate = (count >> 4) + 4
+
+        sub             r9,  r9,  lr           // cdf[0] -= bit
+        sub             r3,  r9,  lr,  lsl #15 // {cdf[0], cdf[0] - 32769}
+        asr             r3,  r3,  r2           // {cdf[0], cdf[0] - 32769} >> rate
+        sub             r9,  r9,  r3           // cdf[0]
+
+        strh            r9,  [r1]              // write back the adapted cdf[0]
+        strh            r10, [r1, #2]          // write back the updated count
+
+        b               L(renorm2)
+endfunc
diff --git a/ffmpeg/JNI/dav1d/src/arm/32/util.S b/ffmpeg/JNI/dav1d/src/arm/32/util.S
index ea4afc38d..6af0158e0 100644
--- a/ffmpeg/JNI/dav1d/src/arm/32/util.S
+++ b/ffmpeg/JNI/dav1d/src/arm/32/util.S
@@ -84,6 +84,23 @@
         vtrn.8          \r6,  \r7
 .endm
 
+.macro transpose_8x8h r0, r1, r2, r3, r4, r5, r6, r7, d0, d1, d2, d3, d4, d5, d6, d7
+        vswp            \d0,  \d4  // exchange the d registers pairwise; presumably halves of r0-r3 with halves of r4-r7, verify against callers
+        vswp            \d1,  \d5
+        vswp            \d2,  \d6
+        vswp            \d3,  \d7
+
+        vtrn.32         \r0,  \r2  // transpose 32-bit elements between registers two apart
+        vtrn.32         \r1,  \r3
+        vtrn.32         \r4,  \r6
+        vtrn.32         \r5,  \r7
+
+        vtrn.16         \r0,  \r1  // transpose 16-bit elements between adjacent registers
+        vtrn.16         \r2,  \r3
+        vtrn.16         \r4,  \r5
+        vtrn.16         \r6,  \r7
+.endm
+
 .macro transpose_4x8b q0, q1, r0, r1, r2, r3
         vtrn.16         \q0,  \q1
 
@@ -91,4 +108,19 @@
         vtrn.8          \r2,  \r3
 .endm
 
+.macro transpose_4x4h q0, q1, r0, r1, r2, r3
+        vtrn.32         \q0,  \q1  // transpose 32-bit elements between the two q operands
+
+        vtrn.16         \r0,  \r1  // transpose 16-bit elements between r0/r1; presumably the d halves of q0/q1, verify against callers
+        vtrn.16         \r2,  \r3  // same for r2/r3
+.endm
+
+.macro transpose_4x8h r0, r1, r2, r3
+        vtrn.32         \r0,  \r2  // transpose 32-bit elements between registers two apart
+        vtrn.32         \r1,  \r3
+
+        vtrn.16         \r0,  \r1  // transpose 16-bit elements between adjacent registers
+        vtrn.16         \r2,  \r3
+.endm
+
 #endif /* DAV1D_SRC_ARM_32_UTIL_S */
diff --git a/ffmpeg/JNI/dav1d/src/arm/64/ipred.S b/ffmpeg/JNI/dav1d/src/arm/64/ipred.S
index 9513212b3..e53665a20 100644
--- a/ffmpeg/JNI/dav1d/src/arm/64/ipred.S
+++ b/ffmpeg/JNI/dav1d/src/arm/64/ipred.S
@@ -28,11 +28,11 @@
 #include "src/arm/asm.S"
 #include "util.S"
 
-// void ipred_dc_128_neon(pixel *dst, const ptrdiff_t stride,
-//                        const pixel *const topleft,
-//                        const int width, const int height, const int a,
-//                        const int max_width, const int max_height);
-function ipred_dc_128_neon, export=1
+// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_dc_128_8bpc_neon, export=1
         clz             w3,  w3
         adr             x5,  L(ipred_dc_128_tbl)
         sub             w3,  w3,  #25
@@ -97,11 +97,11 @@ L(ipred_dc_128_tbl):
         .hword L(ipred_dc_128_tbl) -   4b
 endfunc
 
-// void ipred_v_neon(pixel *dst, const ptrdiff_t stride,
-//                   const pixel *const topleft,
-//                   const int width, const int height, const int a,
-//                   const int max_width, const int max_height);
-function ipred_v_neon, export=1
+// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                        const pixel *const topleft,
+//                        const int width, const int height, const int a,
+//                        const int max_width, const int max_height);
+function ipred_v_8bpc_neon, export=1
         clz             w3,  w3
         adr             x5,  L(ipred_v_tbl)
         sub             w3,  w3,  #25
@@ -132,7 +132,7 @@ function ipred_v_neon, export=1
         b.gt            8b
         ret
 160:
-        ld1             {v0.16b}, [x2], #16
+        ld1             {v0.16b}, [x2]
 16:
         st1             {v0.16b}, [x0], x1
         st1             {v0.16b}, [x6], x1
@@ -170,11 +170,11 @@ L(ipred_v_tbl):
         .hword L(ipred_v_tbl) -  40b
 endfunc
 
-// void ipred_h_neon(pixel *dst, const ptrdiff_t stride,
-//                   const pixel *const topleft,
-//                   const int width, const int height, const int a,
-//                   const int max_width, const int max_height);
-function ipred_h_neon, export=1
+// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                        const pixel *const topleft,
+//                        const int width, const int height, const int a,
+//                        const int max_width, const int max_height);
+function ipred_h_8bpc_neon, export=1
         clz             w3,  w3
         adr             x5,  L(ipred_h_tbl)
         sub             w3,  w3,  #25
@@ -251,11 +251,11 @@ L(ipred_h_tbl):
         .hword L(ipred_h_tbl) -  4b
 endfunc
 
-// void ipred_dc_top_neon(pixel *dst, const ptrdiff_t stride,
-//                        const pixel *const topleft,
-//                        const int width, const int height, const int a,
-//                        const int max_width, const int max_height);
-function ipred_dc_top_neon, export=1
+// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_dc_top_8bpc_neon, export=1
         clz             w3,  w3
         adr             x5,  L(ipred_dc_top_tbl)
         sub             w3,  w3,  #25
@@ -351,11 +351,11 @@ L(ipred_dc_top_tbl):
         .hword L(ipred_dc_top_tbl) -  40b
 endfunc
 
-// void ipred_dc_left_neon(pixel *dst, const ptrdiff_t stride,
-//                         const pixel *const topleft,
-//                         const int width, const int height, const int a,
-//                         const int max_width, const int max_height);
-function ipred_dc_left_neon, export=1
+// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height);
+function ipred_dc_left_8bpc_neon, export=1
         sub             x2,  x2,  w4, uxtw
         clz             w3,  w3
         clz             w7,  w4
@@ -472,11 +472,11 @@ L(ipred_dc_left_tbl):
         .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
 endfunc
 
-// void ipred_dc_neon(pixel *dst, const ptrdiff_t stride,
-//                    const pixel *const topleft,
-//                    const int width, const int height, const int a,
-//                    const int max_width, const int max_height);
-function ipred_dc_neon, export=1
+// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const pixel *const topleft,
+//                         const int width, const int height, const int a,
+//                         const int max_width, const int max_height);
+function ipred_dc_8bpc_neon, export=1
         sub             x2,  x2,  w4, uxtw
         add             w7,  w3,  w4             // width + height
         clz             w3,  w3
@@ -608,7 +608,7 @@ L(ipred_dc_w32):
         cmp             w4,  #32
         add             v0.4h,   v0.4h,   v1.4h
         add             v0.4h,   v0.4h,   v2.4h
-        ushl            v0.4h,   v0.4h,   v17.4h
+        ushl            v4.4h,   v0.4h,   v17.4h
         b.eq            1f
         // h = 8/16/64
         cmp             w4,  #8
@@ -616,10 +616,10 @@ L(ipred_dc_w32):
         mov             w17, #(0x5556/2)
         csel            w16, w16, w17, eq
         dup             v16.4h,  w16
-        sqdmulh         v0.4h,   v0.4h,   v16.4h
+        sqdmulh         v4.4h,   v4.4h,   v16.4h
 1:
-        dup             v0.16b,  v0.b[0]
-        dup             v1.16b,  v0.b[0]
+        dup             v0.16b,  v4.b[0]
+        dup             v1.16b,  v4.b[0]
 2:
         st1             {v0.16b, v1.16b}, [x0], x1
         st1             {v0.16b, v1.16b}, [x6], x1
@@ -640,10 +640,6 @@ L(ipred_dc_h64):
         add             v0.4h,   v0.4h,   v2.4h
         br              x3
 L(ipred_dc_w64):
-        mov             v1.16b,  v0.16b
-        mov             v2.16b,  v0.16b
-        mov             v3.16b,  v0.16b
-2:
         add             x2,  x2,  #1
         ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
         add             v0.4h,   v0.4h,   v16.4h
@@ -656,19 +652,19 @@ L(ipred_dc_w64):
         cmp             w4,  #64
         add             v0.4h,   v0.4h,   v1.4h
         add             v0.4h,   v0.4h,   v3.4h
-        ushl            v0.4h,   v0.4h,   v17.4h
+        ushl            v4.4h,   v0.4h,   v17.4h
         b.eq            1f
         // h = 16/32
         mov             w16, #(0x5556/2)
         movk            w16, #(0x3334/2), lsl #16
         lsr             w16, w16, w4
         dup             v16.4h,  w16
-        sqdmulh         v0.4h,   v0.4h,   v16.4h
+        sqdmulh         v4.4h,   v4.4h,   v16.4h
 1:
-        dup             v0.16b,  v0.b[0]
-        dup             v1.16b,  v0.b[0]
-        dup             v2.16b,  v0.b[0]
-        dup             v3.16b,  v0.b[0]
+        dup             v0.16b,  v4.b[0]
+        dup             v1.16b,  v4.b[0]
+        dup             v2.16b,  v4.b[0]
+        dup             v3.16b,  v4.b[0]
 2:
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
         st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
@@ -691,11 +687,11 @@ L(ipred_dc_tbl):
         .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
 endfunc
 
-// void ipred_paeth_neon(pixel *dst, const ptrdiff_t stride,
-//                       const pixel *const topleft,
-//                       const int width, const int height, const int a,
-//                       const int max_width, const int max_height);
-function ipred_paeth_neon, export=1
+// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                            const pixel *const topleft,
+//                            const int width, const int height, const int a,
+//                            const int max_width, const int max_height);
+function ipred_paeth_8bpc_neon, export=1
         clz             w9,  w3
         adr             x5,  L(ipred_paeth_tbl)
         sub             w9,  w9,  #25
@@ -868,11 +864,11 @@ L(ipred_paeth_tbl):
         .hword L(ipred_paeth_tbl) -  40b
 endfunc
 
-// void ipred_smooth_neon(pixel *dst, const ptrdiff_t stride,
-//                        const pixel *const topleft,
-//                        const int width, const int height, const int a,
-//                        const int max_width, const int max_height);
-function ipred_smooth_neon, export=1
+// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_smooth_8bpc_neon, export=1
         movrel          x10, X(sm_weights)
         add             x11, x10, w4, uxtw
         add             x10, x10, w3, uxtw
@@ -1046,11 +1042,11 @@ L(ipred_smooth_tbl):
         .hword L(ipred_smooth_tbl) -  40b
 endfunc
 
-// void ipred_smooth_v_neon(pixel *dst, const ptrdiff_t stride,
-//                          const pixel *const topleft,
-//                          const int width, const int height, const int a,
-//                          const int max_width, const int max_height);
-function ipred_smooth_v_neon, export=1
+// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
+function ipred_smooth_v_8bpc_neon, export=1
         movrel          x7,  X(sm_weights)
         add             x7,  x7,  w4, uxtw
         clz             w9,  w3
@@ -1184,11 +1180,11 @@ L(ipred_smooth_v_tbl):
         .hword L(ipred_smooth_v_tbl) -  40b
 endfunc
 
-// void ipred_smooth_h_neon(pixel *dst, const ptrdiff_t stride,
-//                          const pixel *const topleft,
-//                          const int width, const int height, const int a,
-//                          const int max_width, const int max_height);
-function ipred_smooth_h_neon, export=1
+// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
+function ipred_smooth_h_8bpc_neon, export=1
         movrel          x8,  X(sm_weights)
         add             x8,  x8,  w3, uxtw
         clz             w9,  w3
@@ -1327,11 +1323,11 @@ L(ipred_smooth_h_tbl):
         .hword L(ipred_smooth_h_tbl) -  40b
 endfunc
 
-// void ipred_filter_neon(pixel *dst, const ptrdiff_t stride,
-//                        const pixel *const topleft,
-//                        const int width, const int height, const int filt_idx,
-//                        const int max_width, const int max_height);
-function ipred_filter_neon, export=1
+// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int filt_idx,
+//                             const int max_width, const int max_height);
+function ipred_filter_8bpc_neon, export=1
         and             w5,  w5,  #511
         movrel          x6,  X(filter_intra_taps)
         lsl             w5,  w5,  #6
@@ -1487,10 +1483,10 @@ L(ipred_filter_tbl):
         .hword L(ipred_filter_tbl) -  40b
 endfunc
 
-// void pal_pred_neon(pixel *dst, const ptrdiff_t stride,
-//                    const uint16_t *const pal, const uint8_t *idx,
-//                    const int w, const int h);
-function pal_pred_neon, export=1
+// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const uint16_t *const pal, const uint8_t *idx,
+//                         const int w, const int h);
+function pal_pred_8bpc_neon, export=1
         ld1             {v0.8h}, [x2]
         clz             w9,  w4
         adr             x6,  L(pal_pred_tbl)
@@ -1578,11 +1574,11 @@ L(pal_pred_tbl):
         .hword L(pal_pred_tbl) -  4b
 endfunc
 
-// void ipred_cfl_128_neon(pixel *dst, const ptrdiff_t stride,
-//                         const pixel *const topleft,
-//                         const int width, const int height,
-//                         const int16_t *ac, const int alpha);
-function ipred_cfl_128_neon, export=1
+// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height,
+//                              const int16_t *ac, const int alpha);
+function ipred_cfl_128_8bpc_neon, export=1
         clz             w9,  w3
         adr             x7,  L(ipred_cfl_128_tbl)
         sub             w9,  w9,  #26
@@ -1699,11 +1695,11 @@ L(ipred_cfl_splat_tbl):
         .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
 endfunc
 
-// void ipred_cfl_top_neon(pixel *dst, const ptrdiff_t stride,
-//                         const pixel *const topleft,
-//                         const int width, const int height,
-//                         const int16_t *ac, const int alpha);
-function ipred_cfl_top_neon, export=1
+// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height,
+//                              const int16_t *ac, const int alpha);
+function ipred_cfl_top_8bpc_neon, export=1
         clz             w9,  w3
         adr             x7,  L(ipred_cfl_top_tbl)
         sub             w9,  w9,  #26
@@ -1717,19 +1713,19 @@ function ipred_cfl_top_neon, export=1
 4:
         ld1r            {v0.2s},  [x2]
         uaddlv          h0,      v0.8b
-        urshr           v0.8h,   v0.8h,   #3
+        urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w4)
 8:
         ld1             {v0.8b},  [x2]
         uaddlv          h0,      v0.8b
-        urshr           v0.8h,   v0.8h,   #3
+        urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w8)
 16:
         ld1             {v0.16b}, [x2]
         uaddlv          h0,      v0.16b
-        urshr           v0.8h,   v0.8h,   #4
+        urshr           v0.4h,   v0.4h,   #4
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)
 32:
@@ -1737,7 +1733,7 @@ function ipred_cfl_top_neon, export=1
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         add             v2.4h,   v2.4h,   v3.4h
-        urshr           v2.8h,   v2.8h,   #5
+        urshr           v2.4h,   v2.4h,   #5
         dup             v0.8h,   v2.h[0]
         b               L(ipred_cfl_splat_w16)
 
@@ -1748,11 +1744,11 @@ L(ipred_cfl_top_tbl):
         .hword L(ipred_cfl_top_tbl) -  4b
 endfunc
 
-// void ipred_cfl_left_neon(pixel *dst, const ptrdiff_t stride,
-//                          const pixel *const topleft,
-//                          const int width, const int height,
-//                          const int16_t *ac, const int alpha);
-function ipred_cfl_left_neon, export=1
+// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height,
+//                               const int16_t *ac, const int alpha);
+function ipred_cfl_left_8bpc_neon, export=1
         sub             x2,  x2,  w4, uxtw
         clz             w9,  w3
         clz             w8,  w4
@@ -1772,21 +1768,21 @@ function ipred_cfl_left_neon, export=1
 L(ipred_cfl_left_h4):
         ld1r            {v0.2s},  [x2]
         uaddlv          h0,      v0.8b
-        urshr           v0.8h,   v0.8h,   #3
+        urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         br              x9
 
 L(ipred_cfl_left_h8):
         ld1             {v0.8b},  [x2]
         uaddlv          h0,      v0.8b
-        urshr           v0.8h,   v0.8h,   #3
+        urshr           v0.4h,   v0.4h,   #3
         dup             v0.8h,   v0.h[0]
         br              x9
 
 L(ipred_cfl_left_h16):
         ld1             {v0.16b}, [x2]
         uaddlv          h0,      v0.16b
-        urshr           v0.8h,   v0.8h,   #4
+        urshr           v0.4h,   v0.4h,   #4
         dup             v0.8h,   v0.h[0]
         br              x9
 
@@ -1795,7 +1791,7 @@ L(ipred_cfl_left_h32):
         uaddlv          h2,      v2.16b
         uaddlv          h3,      v3.16b
         add             v2.4h,   v2.4h,   v3.4h
-        urshr           v2.8h,   v2.8h,   #5
+        urshr           v2.4h,   v2.4h,   #5
         dup             v0.8h,   v2.h[0]
         br              x9
 
@@ -1806,11 +1802,11 @@ L(ipred_cfl_left_tbl):
         .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
 endfunc
 
-// void ipred_cfl_neon(pixel *dst, const ptrdiff_t stride,
-//                     const pixel *const topleft,
-//                     const int width, const int height,
-//                     const int16_t *ac, const int alpha);
-function ipred_cfl_neon, export=1
+// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                          const pixel *const topleft,
+//                          const int width, const int height,
+//                          const int16_t *ac, const int alpha);
+function ipred_cfl_8bpc_neon, export=1
         sub             x2,  x2,  w4, uxtw
         add             w8,  w3,  w4             // width + height
         dup             v1.8h,   w6              // alpha
@@ -1946,15 +1942,19 @@ L(ipred_cfl_tbl):
         .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
 endfunc
 
-// void cfl_ac_420_neon(int16_t *const ac, const pixel *const ypx,
-//                      const ptrdiff_t stride, const int w_pad,
-//                      const int h_pad, const int cw, const int ch);
-function ipred_cfl_ac_420_neon, export=1
+// void ipred_cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                                 const ptrdiff_t stride, const int w_pad,
+//                                 const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_8bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
         adr             x7,  L(ipred_cfl_ac_420_tbl)
         sub             w8,  w8,  #27
         ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v16.8h,  #0
+        movi            v17.8h,  #0
+        movi            v18.8h,  #0
+        movi            v19.8h,  #0
         sub             x7,  x7,  w8, uxtw
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
@@ -1962,14 +1962,10 @@ function ipred_cfl_ac_420_neon, export=1
         clz             w9,  w9              // ctz(width)
         clz             w10, w10             // ctz(height)
         add             w9,  w9,  w10        // log2sz
-        movi            v16.4s,  #1
         add             x10, x1,  x2
+        dup             v31.4s,  w9
         lsl             x2,  x2,  #1
-        dup             v17.4s,  w9
-        sshl            v16.4s,  v16.4s,  v17.4s // 1 << log2sz
-        neg             v17.4s,  v17.4s          // -log2sz
-        ushr            v16.4s,  v16.4s,  #1     // 1 << (log2sz - 1)
-        mov             w9,  w6
+        neg             v31.4s,  v31.4s      // -log2sz
         br              x7
 
 L(ipred_cfl_ac_420_w4):
@@ -1984,6 +1980,7 @@ L(ipred_cfl_ac_420_w4):
         shl             v0.8h,   v0.8h,   #1
         subs            w8,  w8,  #2
         st1             {v0.8h}, [x0], #16
+        add             v16.8h,  v16.8h,  v0.8h
         b.gt            1b
         trn2            v1.2d,   v0.2d,   v0.2d
         trn2            v0.2d,   v0.2d,   v0.2d
@@ -1992,29 +1989,19 @@ L(ipred_cfl_ac_420_w4_hpad):
 2:      // Vertical padding (h_pad > 0)
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h}, [x0], #32
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
         b.gt            2b
 3:
-        sub             x0,  x0,  w6, uxtw #3
-        // Sum the produced ac values
-        subs            w6,  w6,  #4
-        ld1             {v0.8h, v1.8h}, [x0], #32
-        b.le            5f
-4:
-        ld1             {v2.8h, v3.8h}, [x0], #32
-        subs            w6,  w6,  #4
-        add             v0.8h,   v0.8h,   v2.8h
-        add             v1.8h,   v1.8h,   v3.8h
-        b.gt            4b
-5:
-        add             v0.8h,   v0.8h,   v1.8h
+        // Aggregate the sums
+        add             v0.8h,   v16.8h,  v17.8h
         uaddlv          s0,  v0.8h                // sum
-        sub             x0,  x0,  w9, uxtw #3
-        add             v0.2s,   v0.2s,   v16.2s  // sum += 1 << (log2sz - 1)
-        ushl            v4.2s,   v0.2s,   v17.2s  // sum >>= log2sz
+        sub             x0,  x0,  w6, uxtw #3
+        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
         dup             v4.8h,   v4.h[0]
 6:      // Subtract dc from ac
         ld1             {v0.8h, v1.8h}, [x0]
-        subs            w9,  w9,  #4
+        subs            w6,  w6,  #4
         sub             v0.8h,   v0.8h,   v4.8h
         sub             v1.8h,   v1.8h,   v4.8h
         st1             {v0.8h, v1.8h}, [x0], #32
@@ -2038,6 +2025,8 @@ L(ipred_cfl_ac_420_w8):
         shl             v1.8h,   v2.8h,   #1
         subs            w8,  w8,  #2
         st1             {v0.8h, v1.8h}, [x0], #32
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
         b.gt            1b
         mov             v0.16b,  v1.16b
         b               L(ipred_cfl_ac_420_w8_hpad)
@@ -2057,6 +2046,10 @@ L(ipred_cfl_ac_420_w8_wpad):
         trn2            v2.2d,   v0.2d,   v0.2d
         subs            w8,  w8,  #2
         st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+        add             v16.4h,  v16.4h,  v0.4h
+        add             v17.4h,  v17.4h,  v1.4h
+        add             v18.4h,  v18.4h,  v2.4h
+        add             v19.4h,  v19.4h,  v3.4h
         b.gt            1b
         trn1            v0.2d,   v2.2d,   v3.2d
         trn1            v1.2d,   v2.2d,   v3.2d
@@ -2066,38 +2059,28 @@ L(ipred_cfl_ac_420_w8_hpad):
 2:      // Vertical padding (h_pad > 0)
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h}, [x0], #32
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
         st1             {v0.8h, v1.8h}, [x0], #32
+        add             v18.8h,  v18.8h,  v0.8h
+        add             v19.8h,  v19.8h,  v1.8h
         b.gt            2b
 3:
 
 L(ipred_cfl_ac_420_w8_calc_subtract_dc):
-        sub             x0,  x0,  w6, uxtw #4
-        // Sum the produced ac values
-        subs            w6,  w6,  #4
-        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
-        b.le            5f
-4:
-        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
-        subs            w6,  w6,  #4
-        add             v0.8h,   v0.8h,   v4.8h
-        add             v1.8h,   v1.8h,   v5.8h
-        add             v2.8h,   v2.8h,   v6.8h
-        add             v3.8h,   v3.8h,   v7.8h
-        b.gt            4b
-5:
-        add             v0.8h,   v0.8h,   v1.8h
-        add             v2.8h,   v2.8h,   v3.8h
+        // Aggregate the sums
+        add             v0.8h,   v16.8h,  v17.8h
+        add             v2.8h,   v18.8h,  v19.8h
         uaddlp          v0.4s,   v0.8h
         uaddlp          v2.4s,   v2.8h
         add             v0.4s,   v0.4s,   v2.4s
         addv            s0,  v0.4s                // sum
-        sub             x0,  x0,  w9, uxtw #4
-        add             v0.2s,   v0.2s,   v16.2s  // sum += 1 << (log2sz - 1)
-        ushl            v4.2s,   v0.2s,   v17.2s  // sum >>= log2sz
+        sub             x0,  x0,  w6, uxtw #4
+        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
         dup             v4.8h,   v4.h[0]
 6:      // Subtract dc from ac
         ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
-        subs            w9,  w9,  #4
+        subs            w6,  w6,  #4
         sub             v0.8h,   v0.8h,   v4.8h
         sub             v1.8h,   v1.8h,   v4.8h
         sub             v2.8h,   v2.8h,   v4.8h
@@ -2136,6 +2119,10 @@ L(ipred_cfl_ac_420_w16_wpad0):
         shl             v3.8h,   v5.8h,   #1
         subs            w8,  w8,  #2
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
@@ -2173,6 +2160,10 @@ L(ipred_cfl_ac_420_w16_wpad1):
         trn1            v3.2d,   v3.2d,   v5.2d
         subs            w8,  w8,  #2
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
@@ -2196,6 +2187,10 @@ L(ipred_cfl_ac_420_w16_wpad2):
         dup             v3.8h,   v2.h[7]
         subs            w8,  w8,  #2
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
@@ -2221,6 +2216,10 @@ L(ipred_cfl_ac_420_w16_wpad3):
         trn1            v2.2d,   v2.2d,   v3.2d
         subs            w8,  w8,  #2
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
@@ -2231,7 +2230,15 @@ L(ipred_cfl_ac_420_w16_hpad):
 2:      // Vertical padding (h_pad > 0)
         subs            w4,  w4,  #4
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            2b
 3:
 
@@ -2253,15 +2260,19 @@ L(ipred_cfl_ac_420_w16_tbl):
         .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
 endfunc
 
-// void cfl_ac_422_neon(int16_t *const ac, const pixel *const ypx,
-//                      const ptrdiff_t stride, const int w_pad,
-//                      const int h_pad, const int cw, const int ch);
-function ipred_cfl_ac_422_neon, export=1
+// void ipred_cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                                 const ptrdiff_t stride, const int w_pad,
+//                                 const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_8bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
         adr             x7,  L(ipred_cfl_ac_422_tbl)
         sub             w8,  w8,  #27
         ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v16.8h,  #0
+        movi            v17.8h,  #0
+        movi            v18.8h,  #0
+        movi            v19.8h,  #0
         sub             x7,  x7,  w8, uxtw
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
@@ -2269,14 +2280,10 @@ function ipred_cfl_ac_422_neon, export=1
         clz             w9,  w9              // ctz(width)
         clz             w10, w10             // ctz(height)
         add             w9,  w9,  w10        // log2sz
-        movi            v16.4s,  #1
         add             x10, x1,  x2
+        dup             v31.4s,  w9
         lsl             x2,  x2,  #1
-        dup             v17.4s,  w9
-        sshl            v16.4s,  v16.4s,  v17.4s // 1 << log2sz
-        neg             v17.4s,  v17.4s          // -log2sz
-        ushr            v16.4s,  v16.4s,  #1     // 1 << (log2sz - 1)
-        mov             w9,  w6
+        neg             v31.4s,  v31.4s      // -log2sz
         br              x7
 
 L(ipred_cfl_ac_422_w4):
@@ -2290,6 +2297,8 @@ L(ipred_cfl_ac_422_w4):
         shl             v0.8h,   v0.8h,   #2
         shl             v1.8h,   v1.8h,   #2
         subs            w8,  w8,  #4
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
         st1             {v0.8h, v1.8h}, [x0], #32
         b.gt            1b
         trn2            v0.2d,   v1.2d,   v1.2d
@@ -2313,6 +2322,10 @@ L(ipred_cfl_ac_422_w8):
         shl             v3.8h,   v3.8h,   #2
         subs            w8,  w8,  #4
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v3.16b
         mov             v1.16b,  v3.16b
@@ -2338,6 +2351,10 @@ L(ipred_cfl_ac_422_w8_wpad):
         trn1            v2.2d,   v2.2d,   v6.2d
         subs            w8,  w8,  #4
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v3.16b
         mov             v1.16b,  v3.16b
@@ -2363,6 +2380,10 @@ L(ipred_cfl_ac_422_w16_wpad0):
         shl             v3.8h,   v3.8h,   #2
         subs            w8,  w8,  #2
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
@@ -2388,6 +2409,10 @@ L(ipred_cfl_ac_422_w16_wpad1):
         trn1            v3.2d,   v3.2d,   v5.2d
         subs            w8,  w8,  #2
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
@@ -2405,6 +2430,10 @@ L(ipred_cfl_ac_422_w16_wpad2):
         dup             v3.8h,   v2.h[7]
         subs            w8,  w8,  #2
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
@@ -2424,6 +2453,10 @@ L(ipred_cfl_ac_422_w16_wpad3):
         trn1            v2.2d,   v2.2d,   v3.2d
         subs            w8,  w8,  #2
         st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        add             v16.8h,  v16.8h,  v0.8h
+        add             v17.8h,  v17.8h,  v1.8h
+        add             v18.8h,  v18.8h,  v2.8h
+        add             v19.8h,  v19.8h,  v3.8h
         b.gt            1b
         mov             v0.16b,  v2.16b
         mov             v1.16b,  v3.16b
diff --git a/ffmpeg/JNI/dav1d/src/arm/64/ipred16.S b/ffmpeg/JNI/dav1d/src/arm/64/ipred16.S
new file mode 100644
index 000000000..5c139490f
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/src/arm/64/ipred16.S
@@ -0,0 +1,2834 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2019, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//         const pixel *const topleft, const int width, const int height,
+//         const int a, const int max_width, const int max_height,
+//         const int bitdepth_max);
+// Fills the width x height block at dst with (bitdepth_max + 1) >> 1.
+function ipred_dc_128_16bpc_neon, export=1
+        ldr             w8,  [sp]                // bitdepth_max (9th arg, on the stack)
+        clz             w3,  w3                  // clz(width)
+        adr             x5,  L(ipred_dc_128_tbl)
+        sub             w3,  w3,  #25            // table index: 0 for w=64 ... 4 for w=4
+        ldrh            w3,  [x5, w3, uxtw #1]   // 16-bit offset of the width handler
+        dup             v0.8h,   w8              // broadcast bitdepth_max to all lanes
+        sub             x5,  x5,  w3, uxtw       // handler = tbl - offset
+        add             x6,  x0,  x1             // x6 = dst + stride (second row)
+        lsl             x1,  x1,  #1             // stride *= 2; x0/x6 each skip a row
+        urshr           v0.8h,   v0.8h,  #1      // (bitdepth_max + 1) >> 1
+        br              x5
+4:      // width 4: 8 bytes per row, 4 rows per iteration
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4             // height -= 4 (presumably a multiple of 4)
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+8:      // width 8: 16 bytes per row, 4 rows per iteration
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+160:    // width 16 entry point: replicate the fill value once, then loop
+        mov             v1.16b,  v0.16b
+16:     // 32 bytes per row, 4 rows per iteration
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            16b
+        ret
+320:    // width 32 entry point: replicate the fill value into v1-v3
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+32:     // 64 bytes per row, 4 rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            32b
+        ret
+640:    // width 64 entry point: replicate the fill value into v1-v3
+        mov             v1.16b,  v0.16b
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+        sub             x1,  x1,  #64            // first half-row store already adds 64
+64:     // 128 bytes per row as two 64-byte stores, 4 rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_dc_128_tbl):    // indexed by clz(width) - 25; entries are tbl - handler
+        .hword L(ipred_dc_128_tbl) - 640b        // width 64
+        .hword L(ipred_dc_128_tbl) - 320b        // width 32
+        .hword L(ipred_dc_128_tbl) - 160b        // width 16
+        .hword L(ipred_dc_128_tbl) -   8b        // width 8
+        .hword L(ipred_dc_128_tbl) -   4b        // width 4
+endfunc
+
+// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//         const pixel *const topleft, const int width, const int height,
+//         const int a, const int max_width, const int max_height);
+// Copies the width pixels that follow topleft[0] into every row of dst.
+function ipred_v_16bpc_neon, export=1
+        clz             w3,  w3                  // clz(width)
+        adr             x5,  L(ipred_v_tbl)
+        sub             w3,  w3,  #25            // table index: 0 for w=64 ... 4 for w=4
+        ldrh            w3,  [x5, w3, uxtw #1]   // 16-bit offset of the width handler
+        add             x2,  x2,  #2             // skip topleft[0] (one 2-byte pixel)
+        sub             x5,  x5,  w3, uxtw       // handler = tbl - offset
+        add             x6,  x0,  x1             // x6 = dst + stride (second row)
+        lsl             x1,  x1,  #1             // stride *= 2; x0/x6 each skip a row
+        br              x5
+40:     // width 4 entry point: load the source row once
+        ld1             {v0.4h},  [x2]
+4:      // 8 bytes per row, 4 rows per iteration
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4             // height -= 4 (presumably a multiple of 4)
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+80:     // width 8 entry point: load the source row once
+        ld1             {v0.8h},  [x2]
+8:      // 16 bytes per row, 4 rows per iteration
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+160:    // width 16 entry point: load the source row once
+        ld1             {v0.8h, v1.8h}, [x2]
+16:     // 32 bytes per row, 4 rows per iteration
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            16b
+        ret
+320:    // width 32 entry point: load the source row once
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+32:     // 64 bytes per row, 4 rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            32b
+        ret
+640:    // width 64 entry point: source row is held in v0-v3 and v4-v7
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        sub             x1,  x1,  #64            // first half-row store already adds 64
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
+64:     // 128 bytes per row as two 64-byte stores, 4 rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_v_tbl):         // indexed by clz(width) - 25; entries are tbl - handler
+        .hword L(ipred_v_tbl) - 640b             // width 64
+        .hword L(ipred_v_tbl) - 320b             // width 32
+        .hword L(ipred_v_tbl) - 160b             // width 16
+        .hword L(ipred_v_tbl) -  80b             // width 8
+        .hword L(ipred_v_tbl) -  40b             // width 4
+endfunc
+
+// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                         const pixel *const topleft,
+//                         const int width, const int height, const int a,
+//                         const int max_width, const int max_height);
+function ipred_h_16bpc_neon, export=1 // horizontal prediction: each output row is filled with one pixel read from just below topleft in memory
+        clz             w3,  w3  // clz(width); the table below only covers widths 64, 32, 16, 8, 4 (clz 25..29)
+        adr             x5,  L(ipred_h_tbl)
+        sub             w3,  w3,  #25  // table index: 0 for width 64 ... 4 for width 4
+        ldrh            w3,  [x5, w3, uxtw #1]  // 16 bit distance from the table back to the handler
+        sub             x2,  x2,  #8  // x2 = topleft - 4 pixels: the edge pixels for the first four rows
+        sub             x5,  x5,  w3, uxtw  // x5 = address of the width specific loop
+        mov             x7,  #-8  // post-index step: move four pixels further down in memory per iteration
+        add             x6,  x0,  x1  // x6 = dst + stride; x0 and x6 write alternating rows
+        lsl             x1,  x1,  #1  // double the stride so each pointer advances two rows per store
+        br              x5
+4:  // width 4, four rows per iteration
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7  // four edge pixels, each broadcast; v3 is the one nearest topleft
+        st1             {v3.4h},  [x0], x1  // row 0
+        st1             {v2.4h},  [x6], x1  // row 1
+        subs            w4,  w4,  #4  // height -= 4; the flags are consumed by b.gt below
+        st1             {v1.4h},  [x0], x1  // row 2
+        st1             {v0.4h},  [x6], x1  // row 3
+        b.gt            4b
+        ret
+8:  // width 8, four rows per iteration
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7  // v3 = pixel for row 0 ... v0 = pixel for row 3
+        st1             {v3.8h},  [x0], x1
+        st1             {v2.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v1.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+16:  // width 16 (32 bytes per row), four rows per iteration
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7  // v3 = pixel for row 0 ... v0 = pixel for row 3
+        str             q3,  [x0, #16]  // bytes 16..31 of row 0
+        str             q2,  [x6, #16]  // bytes 16..31 of row 1
+        st1             {v3.8h}, [x0], x1  // bytes 0..15 last, because this store also advances the pointer
+        st1             {v2.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]  // rows 2 and 3, same pattern
+        str             q0,  [x6, #16]
+        st1             {v1.8h}, [x0], x1
+        st1             {v0.8h}, [x6], x1
+        b.gt            16b
+        ret
+32:  // width 32 (64 bytes per row), four rows per iteration
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7  // v3 = pixel for row 0 ... v0 = pixel for row 3
+        str             q3,  [x0, #16]  // bytes 16..31
+        str             q2,  [x6, #16]
+        stp             q3,  q3,  [x0, #32]  // bytes 32..63
+        stp             q2,  q2,  [x6, #32]
+        st1             {v3.8h}, [x0], x1  // bytes 0..15, then advance two rows
+        st1             {v2.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]  // rows 2 and 3, same pattern
+        str             q0,  [x6, #16]
+        stp             q1,  q1,  [x0, #32]
+        stp             q0,  q0,  [x6, #32]
+        st1             {v1.8h}, [x0], x1
+        st1             {v0.8h}, [x6], x1
+        b.gt            32b
+        ret
+64:  // width 64 (128 bytes per row), four rows per iteration
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7  // v3 = pixel for row 0 ... v0 = pixel for row 3
+        str             q3,  [x0, #16]  // bytes 16..31
+        str             q2,  [x6, #16]
+        stp             q3,  q3,  [x0, #32]  // bytes 32..63
+        stp             q2,  q2,  [x6, #32]
+        stp             q3,  q3,  [x0, #64]  // bytes 64..95
+        stp             q2,  q2,  [x6, #64]
+        stp             q3,  q3,  [x0, #96]  // bytes 96..127
+        stp             q2,  q2,  [x6, #96]
+        st1             {v3.8h}, [x0], x1  // bytes 0..15, then advance two rows
+        st1             {v2.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        str             q1,  [x0, #16]  // rows 2 and 3, same pattern
+        str             q0,  [x6, #16]
+        stp             q1,  q1,  [x0, #32]
+        stp             q0,  q0,  [x6, #32]
+        stp             q1,  q1,  [x0, #64]
+        stp             q0,  q0,  [x6, #64]
+        stp             q1,  q1,  [x0, #96]
+        stp             q0,  q0,  [x6, #96]
+        st1             {v1.8h}, [x0], x1
+        st1             {v0.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_h_tbl):  // loop distances, indexed by clz(width) - 25
+        .hword L(ipred_h_tbl) - 64b  // width 64
+        .hword L(ipred_h_tbl) - 32b  // width 32
+        .hword L(ipred_h_tbl) - 16b  // width 16
+        .hword L(ipred_h_tbl) -  8b  // width 8
+        .hword L(ipred_h_tbl) -  4b  // width 4
+endfunc
+
+// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height);
+function ipred_dc_top_16bpc_neon, export=1 // DC prediction from the top row only: fill the block with the rounded mean of the `width` top pixels
+        clz             w3,  w3  // clz(width); the table below only covers widths 64, 32, 16, 8, 4 (clz 25..29)
+        adr             x5,  L(ipred_dc_top_tbl)
+        sub             w3,  w3,  #25  // table index: 0 for width 64 ... 4 for width 4
+        ldrh            w3,  [x5, w3, uxtw #1]  // 16 bit distance from the table back to the handler
+        add             x2,  x2,  #2  // step over the topleft pixel (2 bytes); x2 = start of the top row
+        sub             x5,  x5,  w3, uxtw  // x5 = address of the width specific handler
+        add             x6,  x0,  x1  // x6 = dst + stride; x0 and x6 write alternating rows
+        lsl             x1,  x1,  #1  // double the stride so each pointer advances two rows per store
+        br              x5
+40:  // width 4
+        ld1             {v0.4h},  [x2]
+        addv            h0,      v0.4h  // 16 bit sum of the 4 top pixels
+        urshr           v0.4h,   v0.4h,   #2  // rounded divide by 4
+        dup             v0.4h,   v0.h[0]  // broadcast the DC value
+4:  // writes four rows per iteration
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4  // height -= 4; the flags are consumed by b.gt below
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            4b
+        ret
+80:  // width 8
+        ld1             {v0.8h},  [x2]
+        addv            h0,      v0.8h  // 16 bit sum of the 8 top pixels
+        urshr           v0.4h,   v0.4h,   #3  // rounded divide by 8
+        dup             v0.8h,   v0.h[0]  // broadcast the DC value
+8:  // writes four rows per iteration
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            8b
+        ret
+160:  // width 16
+        ld1             {v0.8h, v1.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h  // 16 pixels -> 8 pairwise sums
+        addv            h0,      v0.8h  // 16 bit total; NOTE(review): assumes 16 pixels cannot overflow 16 bits - confirm against the max bitdepth
+        urshr           v2.4h,   v0.4h,   #4  // rounded divide by 16
+        dup             v0.8h,   v2.h[0]  // broadcast the DC value into both row registers
+        dup             v1.8h,   v2.h[0]
+16:  // writes four rows per iteration
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            16b
+        ret
+320:  // width 32
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h  // pairwise sums of pixels 0..15
+        addp            v2.8h,   v2.8h,   v3.8h  // pairwise sums of pixels 16..31
+        addp            v0.8h,   v0.8h,   v2.8h  // 8 lanes, each the sum of 4 pixels
+        uaddlv          s0,      v0.8h  // widen to 32 bit for the final total
+        rshrn           v4.4h,   v0.4s,   #5  // rounded divide by 32, narrowed back to 16 bit
+        dup             v0.8h,   v4.h[0]  // broadcast the DC value into all four row registers
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+32:  // writes four rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            32b
+        ret
+640:  // width 64
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64  // top pixels 0..31
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]  // top pixels 32..63
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v4.8h,   v4.8h,   v6.8h
+        addp            v0.8h,   v0.8h,   v4.8h  // 8 lanes, each the sum of 8 pixels
+        uaddlv          s0,      v0.8h  // widen to 32 bit for the final total
+        rshrn           v4.4h,   v0.4s,   #6  // rounded divide by 64, narrowed back to 16 bit
+        sub             x1,  x1,  #64  // the first store of each row already advances the pointer by 64 bytes
+        dup             v0.8h,   v4.h[0]  // v0-v3 hold 32 pixels; each row is written as two such halves
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+64:  // writes four rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64  // left half of the row
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1  // right half, then on to the row after next
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            64b
+        ret
+
+L(ipred_dc_top_tbl):  // handler distances, indexed by clz(width) - 25
+        .hword L(ipred_dc_top_tbl) - 640b  // width 64
+        .hword L(ipred_dc_top_tbl) - 320b  // width 32
+        .hword L(ipred_dc_top_tbl) - 160b  // width 16
+        .hword L(ipred_dc_top_tbl) -  80b  // width 8
+        .hword L(ipred_dc_top_tbl) -  40b  // width 4
+endfunc
+
+// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height, const int a,
+//                               const int max_width, const int max_height);
+function ipred_dc_left_16bpc_neon, export=1 // DC prediction from the left edge only: fill the block with the rounded mean of the `height` pixels stored before topleft
+        sub             x2,  x2,  w4, uxtw #1  // x2 = topleft - height pixels (2 bytes each)
+        clz             w3,  w3  // clz(width)
+        clz             w7,  w4  // clz(height)
+        adr             x5,  L(ipred_dc_left_tbl)
+        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
+        sub             w7,  w7,  #25  // index into the first five (h*) table entries
+        ldrh            w3,  [x5, w3, uxtw #1]  // distance to the width specific store loop
+        ldrh            w7,  [x5, w7, uxtw #1]  // distance to the height specific averaging code
+        sub             x3,  x5,  w3, uxtw  // x3 = store loop, entered via "br x3" once the DC value is in v0
+        sub             x5,  x5,  w7, uxtw  // x5 = averaging code, entered first
+        add             x6,  x0,  x1  // x6 = dst + stride; x0 and x6 write alternating rows
+        lsl             x1,  x1,  #1  // double the stride so each pointer advances two rows per store
+        br              x5
+
+L(ipred_dc_left_h4):  // height 4
+        ld1             {v0.4h},  [x2]
+        addv            h0,      v0.4h  // 16 bit sum of the 4 edge pixels
+        urshr           v0.4h,   v0.4h,   #2  // rounded divide by 4
+        dup             v0.8h,   v0.h[0]  // DC value in every lane of v0, as all w* loops expect
+        br              x3
+L(ipred_dc_left_w4):  // width 4, four rows per iteration
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4  // height -= 4; the flags are consumed by b.gt below
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            L(ipred_dc_left_w4)
+        ret
+
+L(ipred_dc_left_h8):  // height 8
+        ld1             {v0.8h},  [x2]
+        addv            h0,      v0.8h  // 16 bit sum of the 8 edge pixels
+        urshr           v0.4h,   v0.4h,   #3  // rounded divide by 8
+        dup             v0.8h,   v0.h[0]
+        br              x3
+L(ipred_dc_left_w8):  // width 8, four rows per iteration
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            L(ipred_dc_left_w8)
+        ret
+
+L(ipred_dc_left_h16):  // height 16
+        ld1             {v0.8h, v1.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h  // 16 pixels -> 8 pairwise sums
+        addv            h0,      v0.8h  // 16 bit total; NOTE(review): assumes 16 pixels cannot overflow 16 bits - confirm against the max bitdepth
+        urshr           v2.4h,   v0.4h,   #4  // rounded divide by 16
+        dup             v0.8h,   v2.h[0]
+        dup             v1.8h,   v2.h[0]  // every w* loop that reads v1 first copies it from v0 again
+        br              x3
+L(ipred_dc_left_w16):  // width 16
+        mov             v1.16b,  v0.16b  // the h* paths only guarantee v0, so replicate it here
+1:  // writes four rows per iteration
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_h32):  // height 32
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v0.8h,   v0.8h,   v2.8h  // 8 lanes, each the sum of 4 pixels
+        uaddlp          v0.4s,   v0.8h  // widen: 4 lanes of 32 bit, each the sum of 8 pixels
+        addv            s0,      v0.4s  // 32 bit total
+        rshrn           v4.4h,   v0.4s,   #5  // rounded divide by 32, narrowed back to 16 bit
+        dup             v0.8h,   v4.h[0]
+        br              x3
+L(ipred_dc_left_w32):  // width 32
+        mov             v1.16b,  v0.16b  // replicate the DC vector into v1-v3
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+1:  // writes four rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_h64):  // height 64
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64  // edge pixels 0..31
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]  // edge pixels 32..63
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v4.8h,   v4.8h,   v6.8h
+        addp            v0.8h,   v0.8h,   v4.8h  // 8 lanes, each the sum of 8 pixels
+        uaddlv          s0,      v0.8h  // widen to 32 bit for the final total
+        rshrn           v4.4h,   v0.4s,   #6  // rounded divide by 64, narrowed back to 16 bit
+        dup             v0.8h,   v4.h[0]
+        br              x3
+L(ipred_dc_left_w64):  // width 64: a row is 128 bytes and is written as two 64 byte halves
+        mov             v1.16b,  v0.16b  // replicate the DC vector into v1-v3
+        mov             v2.16b,  v0.16b
+        mov             v3.16b,  v0.16b
+        sub             x1,  x1,  #64  // the first store of each row already advances the pointer by 64 bytes
+1:  // writes four rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            1b
+        ret
+
+L(ipred_dc_left_tbl):  // entries 0-4: indexed by clz(height) - 25; entries 5-9: indexed by clz(width) - 20
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
+        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+endfunc
+
+// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                          const pixel *const topleft,
+//                          const int width, const int height, const int a,
+//                          const int max_width, const int max_height);
+function ipred_dc_16bpc_neon, export=1 // DC prediction: fill the block with (sum of both edges + ((width + height) >> 1)) / (width + height)
+        sub             x2,  x2,  w4, uxtw #1  // x2 = topleft - height pixels (2 bytes each)
+        add             w7,  w3,  w4             // width + height
+        clz             w3,  w3  // clz(width)
+        clz             w6,  w4  // clz(height)
+        dup             v16.4s, w7               // width + height
+        adr             x5,  L(ipred_dc_tbl)
+        rbit            w7,  w7                  // rbit(width + height)
+        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
+        sub             w6,  w6,  #25  // index into the first five (h*) table entries
+        clz             w7,  w7                  // ctz(width + height)
+        ldrh            w3,  [x5, w3, uxtw #1]  // distance to the width specific code
+        ldrh            w6,  [x5, w6, uxtw #1]  // distance to the height specific code
+        neg             w7,  w7                  // -ctz(width + height)
+        sub             x3,  x5,  w3, uxtw  // x3 = w* code, entered via "br x3" from the h* code
+        sub             x5,  x5,  w6, uxtw  // x5 = h* code, entered first
+        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
+        dup             v17.4s,  w7              // -ctz(width + height)
+        add             x6,  x0,  x1  // x6 = dst + stride; x0 and x6 write alternating rows
+        lsl             x1,  x1,  #1  // double the stride so each pointer advances two rows per store
+        br              x5
+
+L(ipred_dc_h4):  // height 4: sum the 4 pixels before topleft into s0
+        ld1             {v0.4h},  [x2], #8  // the post-increment leaves x2 at topleft
+        uaddlv          s0,      v0.4h  // 32 bit sum
+        br              x3
+L(ipred_dc_w4):  // width 4
+        add             x2,  x2,  #2  // step over the topleft pixel; x2 = start of the top row
+        ld1             {v1.4h},  [x2]
+        add             v0.2s,   v0.2s,   v16.2s  // left sum + rounding term
+        uaddlv          s1,      v1.4h  // 32 bit sum of the top row
+        cmp             w4,  #4  // square block? then width + height is a power of two
+        add             v0.2s,   v0.2s,   v1.2s  // total = left + top + rounding
+        ushl            v0.2s,   v0.2s,   v17.2s  // negative shift count: total >> ctz(width + height)
+        b.eq            1f  // square: the shift was the whole division
+        // h = 8/16
+        cmp             w4,  #16  // h = 16: remaining divisor 5 (20 = 4*5); h = 8: remaining divisor 3 (12 = 4*3)
+        mov             w16, #0x6667  // ~ 2^17 / 5
+        mov             w17, #0xAAAB  // ~ 2^17 / 3
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s  // fixed point multiply by the reciprocal ...
+        ushr            v0.2s,   v0.2s,   #17  // ... and drop the 17 fraction bits
+1:
+        dup             v0.4h,   v0.h[0]  // broadcast the DC value
+2:  // writes four rows per iteration
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        subs            w4,  w4,  #4  // height -= 4; the flags are consumed by b.gt below
+        st1             {v0.4h},  [x0], x1
+        st1             {v0.4h},  [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h8):  // height 8: sum the 8 pixels before topleft into s0
+        ld1             {v0.8h},  [x2], #16  // the post-increment leaves x2 at topleft
+        uaddlv          s0,      v0.8h  // 32 bit sum
+        br              x3
+L(ipred_dc_w8):  // width 8
+        add             x2,  x2,  #2  // step over the topleft pixel; x2 = start of the top row
+        ld1             {v1.8h},  [x2]
+        add             v0.2s,   v0.2s,   v16.2s  // left sum + rounding term
+        uaddlv          s1,      v1.8h  // 32 bit sum of the top row
+        cmp             w4,  #8  // square block?
+        add             v0.2s,   v0.2s,   v1.2s  // total = left + top + rounding
+        ushl            v0.2s,   v0.2s,   v17.2s  // total >> ctz(width + height)
+        b.eq            1f
+        // h = 4/16/32
+        cmp             w4,  #32  // h = 32: remaining divisor 5 (40 = 8*5); h = 4 or 16: remaining divisor 3
+        mov             w16, #0x6667  // ~ 2^17 / 5
+        mov             w17, #0xAAAB  // ~ 2^17 / 3
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s
+        ushr            v0.2s,   v0.2s,   #17
+1:
+        dup             v0.8h,   v0.h[0]  // broadcast the DC value
+2:  // writes four rows per iteration
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h},  [x0], x1
+        st1             {v0.8h},  [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h16):  // height 16: sum the 16 pixels before topleft into s0
+        ld1             {v0.8h, v1.8h}, [x2], #32  // the post-increment leaves x2 at topleft
+        addp            v0.8h,   v0.8h,   v1.8h  // 8 pairwise 16 bit sums
+        uaddlv          s0,      v0.8h  // 32 bit sum
+        br              x3
+L(ipred_dc_w16):  // width 16
+        add             x2,  x2,  #2  // step over the topleft pixel; x2 = start of the top row
+        ld1             {v1.8h, v2.8h}, [x2]
+        add             v0.2s,   v0.2s,   v16.2s  // left sum + rounding term
+        addp            v1.8h,   v1.8h,   v2.8h
+        uaddlv          s1,      v1.8h  // 32 bit sum of the top row
+        cmp             w4,  #16  // square block?
+        add             v0.2s,   v0.2s,   v1.2s  // total = left + top + rounding
+        ushl            v4.2s,   v0.2s,   v17.2s  // total >> ctz(width + height)
+        b.eq            1f
+        // h = 4/8/32/64
+        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
+        mov             w16, #0x6667  // ~ 2^17 / 5, chosen for h = 4 or 64 (tst result is zero)
+        mov             w17, #0xAAAB  // ~ 2^17 / 3, chosen for h = 8 or 32
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v4.2s,   v4.2s,   v16.2s
+        ushr            v4.2s,   v4.2s,   #17
+1:
+        dup             v0.8h,   v4.h[0]  // broadcast the DC value into both row registers
+        dup             v1.8h,   v4.h[0]
+2:  // writes four rows per iteration
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], x1
+        st1             {v0.8h, v1.8h}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h32):  // height 32: sum the 32 pixels before topleft into s0
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64  // the post-increment leaves x2 at topleft
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v0.8h,   v0.8h,   v2.8h  // 8 lanes, each the sum of 4 pixels
+        uaddlv          s0,      v0.8h  // 32 bit sum
+        br              x3
+L(ipred_dc_w32):  // width 32
+        add             x2,  x2,  #2  // step over the topleft pixel; x2 = start of the top row
+        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
+        add             v0.2s,   v0.2s,   v16.2s  // left sum + rounding term
+        addp            v1.8h,   v1.8h,   v2.8h
+        addp            v3.8h,   v3.8h,   v4.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        uaddlv          s1,      v1.8h  // 32 bit sum of the top row
+        cmp             w4,  #32  // square block?
+        add             v0.2s,   v0.2s,   v1.2s  // total = left + top + rounding
+        ushl            v4.2s,   v0.2s,   v17.2s  // total >> ctz(width + height)
+        b.eq            1f
+        // h = 8/16/64
+        cmp             w4,  #8  // h = 8: remaining divisor 5 (40 = 8*5); h = 16 or 64: remaining divisor 3
+        mov             w16, #0x6667  // ~ 2^17 / 5
+        mov             w17, #0xAAAB  // ~ 2^17 / 3
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v4.2s,   v4.2s,   v16.2s
+        ushr            v4.2s,   v4.2s,   #17
+1:
+        dup             v0.8h,   v4.h[0]  // broadcast the DC value into all four row registers
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+2:  // writes four rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_h64):  // height 64: sum the 64 pixels before topleft into s0
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64  // the second post-increment leaves x2 at topleft
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v4.8h,   v4.8h,   v6.8h
+        addp            v0.8h,   v0.8h,   v4.8h  // 8 lanes, each the sum of 8 pixels
+        uaddlv          s0,      v0.8h  // 32 bit sum
+        br              x3
+L(ipred_dc_w64):  // width 64
+        add             x2,  x2,  #2  // step over the topleft pixel; x2 = start of the top row
+        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64  // top pixels 0..31
+        add             v0.2s,   v0.2s,   v16.2s  // left sum + rounding term
+        addp            v1.8h,   v1.8h,   v2.8h
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]  // top pixels 32..63
+        addp            v3.8h,   v3.8h,   v4.8h
+        addp            v20.8h,  v20.8h,  v21.8h
+        addp            v22.8h,  v22.8h,  v23.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        addp            v20.8h,  v20.8h,  v22.8h
+        addp            v1.8h,   v1.8h,   v20.8h  // 8 lanes, each the sum of 8 pixels
+        uaddlv          s1,      v1.8h  // 32 bit sum of the top row
+        cmp             w4,  #64  // square block?
+        add             v0.2s,   v0.2s,   v1.2s  // total = left + top + rounding
+        ushl            v4.2s,   v0.2s,   v17.2s  // total >> ctz(width + height)
+        b.eq            1f
+        // h = 16/32
+        cmp             w4,  #16  // h = 16: remaining divisor 5 (80 = 16*5); h = 32: remaining divisor 3 (96 = 32*3)
+        mov             w16, #0x6667  // ~ 2^17 / 5
+        mov             w17, #0xAAAB  // ~ 2^17 / 3
+        csel            w16, w16, w17, eq
+        dup             v16.2s,  w16
+        mul             v4.2s,   v4.2s,   v16.2s
+        ushr            v4.2s,   v4.2s,   #17
+1:
+        sub             x1,  x1,  #64  // the first store of each row already advances the pointer by 64 bytes
+        dup             v0.8h,   v4.h[0]  // v0-v3 hold 32 pixels; each row is written as two such halves
+        dup             v1.8h,   v4.h[0]
+        dup             v2.8h,   v4.h[0]
+        dup             v3.8h,   v4.h[0]
+2:  // writes four rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
+        b.gt            2b
+        ret
+
+L(ipred_dc_tbl):  // entries 0-4: indexed by clz(height) - 25; entries 5-9: indexed by clz(width) - 20
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
+        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+endfunc
+
+// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                             const pixel *const topleft,
+//                             const int width, const int height, const int a,
+//                             const int max_width, const int max_height);
+function ipred_paeth_16bpc_neon, export=1 // Paeth prediction: per pixel, output whichever of left, top, topleft is closest to left + top - topleft
+        clz             w9,  w3  // clz(width); w3 itself is kept, the wide path below needs the width
+        adr             x5,  L(ipred_paeth_tbl)
+        sub             w9,  w9,  #25  // table index: 0 for width 64 ... 4 for width 4
+        ldrh            w9,  [x5, w9, uxtw #1]  // 16 bit distance from the table back to the handler
+        ld1r            {v4.8h},  [x2]  // v4 = topleft pixel in every lane
+        add             x8,  x2,  #2  // x8 = start of the top row
+        sub             x2,  x2,  #8  // x2 = topleft - 4 pixels: the left pixels for the first four rows
+        sub             x5,  x5,  w9, uxtw  // x5 = address of the width specific handler
+        mov             x7,  #-8  // post-index step: move four pixels further down in memory per ld4r
+        add             x6,  x0,  x1  // x6 = dst + stride
+        lsl             x1,  x1,  #1  // 2 * stride
+        br              x5
+40:  // width 4: two rows are packed into each 128 bit register
+        ld1r            {v5.2d},  [x8]  // the 4 top pixels, duplicated into both halves
+        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
+4:  // four rows per iteration
+        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7  // four left pixels, each broadcast; v3 is the one nearest topleft
+        zip1            v0.2d,   v0.2d,   v1.2d  // v0 = left of row 3 (low half) | left of row 2 (high half)
+        zip1            v2.2d,   v2.2d,   v3.2d  // v2 = left of row 1 (low half) | left of row 0 (high half)
+        add             v16.8h,  v6.8h,   v0.8h   // base
+        add             v17.8h,  v6.8h,   v2.8h
+        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
+        sabd            v21.8h,  v5.8h,   v17.8h
+        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
+        sabd            v23.8h,  v4.8h,   v17.8h
+        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
+        sabd            v17.8h,  v2.8h,   v17.8h
+        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
+        umin            v19.8h,  v21.8h,  v23.8h
+        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
+        cmge            v21.8h,  v23.8h,  v21.8h
+        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
+        cmge            v17.8h,  v19.8h,  v17.8h
+        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
+        bsl             v20.16b, v5.16b,  v4.16b
+        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
+        bit             v20.16b, v0.16b,  v16.16b
+        st1             {v21.d}[1], [x0], x1  // row 0
+        st1             {v21.d}[0], [x6], x1  // row 1
+        subs            w4,  w4,  #4  // height -= 4; the flags are consumed by b.gt below
+        st1             {v20.d}[1], [x0], x1  // row 2
+        st1             {v20.d}[0], [x6], x1  // row 3
+        b.gt            4b
+        ret
+80:  // widths 8, 16, 32 and 64 share the path below: 8 columns x 4 rows per inner iteration
+160:
+320:
+640:
+        ld1             {v5.8h},  [x8], #16  // first 8 top pixels
+        mov             w9,  w3  // keep the width; w3 is used as the column counter
+        // Set up pointers for four rows in parallel; x0, x6, x5, x10
+        add             x5,  x0,  x1  // x1 is 2 * stride here, so x5 = row 2 (x5 is no longer needed as jump target)
+        add             x10, x6,  x1  // x10 = row 3
+        lsl             x1,  x1,  #1  // 4 * stride
+        sub             x1,  x1,  w3, uxtw #1  // minus the row width in bytes, which the post-incrementing stores already cover
+1:  // outer loop: one group of four rows
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7  // v3 = left of row 0 ... v0 = left of row 3
+2:  // inner loop: 8 columns of the current four rows
+        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
+        add             v16.8h,  v6.8h,   v0.8h   // base
+        add             v17.8h,  v6.8h,   v1.8h
+        add             v18.8h,  v6.8h,   v2.8h
+        add             v19.8h,  v6.8h,   v3.8h
+        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
+        sabd            v21.8h,  v5.8h,   v17.8h
+        sabd            v22.8h,  v5.8h,   v18.8h
+        sabd            v23.8h,  v5.8h,   v19.8h
+        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
+        sabd            v25.8h,  v4.8h,   v17.8h
+        sabd            v26.8h,  v4.8h,   v18.8h
+        sabd            v27.8h,  v4.8h,   v19.8h
+        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
+        sabd            v17.8h,  v1.8h,   v17.8h
+        sabd            v18.8h,  v2.8h,   v18.8h
+        sabd            v19.8h,  v3.8h,   v19.8h
+        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
+        umin            v29.8h,  v21.8h,  v25.8h
+        umin            v30.8h,  v22.8h,  v26.8h
+        umin            v31.8h,  v23.8h,  v27.8h
+        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
+        cmge            v21.8h,  v25.8h,  v21.8h
+        cmge            v22.8h,  v26.8h,  v22.8h
+        cmge            v23.8h,  v27.8h,  v23.8h
+        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
+        cmge            v17.8h,  v29.8h,  v17.8h
+        cmge            v18.8h,  v30.8h,  v18.8h
+        cmge            v19.8h,  v31.8h,  v19.8h
+        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
+        bsl             v22.16b, v5.16b,  v4.16b
+        bsl             v21.16b, v5.16b,  v4.16b
+        bsl             v20.16b, v5.16b,  v4.16b
+        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
+        bit             v22.16b, v2.16b,  v18.16b
+        bit             v21.16b, v1.16b,  v17.16b
+        bit             v20.16b, v0.16b,  v16.16b
+        st1             {v23.8h}, [x0], #16  // row 0
+        st1             {v22.8h}, [x6], #16  // row 1
+        subs            w3,  w3,  #8  // 8 columns done; the flags are consumed by b.le below
+        st1             {v21.8h}, [x5], #16  // row 2
+        st1             {v20.8h}, [x10], #16  // row 3
+        b.le            8f  // row group finished
+        ld1             {v5.8h},  [x8], #16  // next 8 top pixels
+        b               2b
+8:
+        subs            w4,  w4,  #4  // four rows done
+        b.le            9f
+        // End of horizontal loop, move pointers to next four rows
+        sub             x8,  x8,  w9, uxtw #1  // rewind the top pointer to the start of the top row
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        // Load the top row as early as possible
+        ld1             {v5.8h},  [x8], #16
+        add             x5,  x5,  x1
+        add             x10, x10, x1
+        mov             w3,  w9  // reset the column counter
+        b               1b
+9:
+        ret
+
+L(ipred_paeth_tbl):  // handler distances, indexed by clz(width) - 25
+        .hword L(ipred_paeth_tbl) - 640b  // width 64
+        .hword L(ipred_paeth_tbl) - 320b  // width 32
+        .hword L(ipred_paeth_tbl) - 160b  // width 16
+        .hword L(ipred_paeth_tbl) -  80b  // width 8
+        .hword L(ipred_paeth_tbl) -  40b  // width 4
+endfunc
+
+// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int a,
+//                              const int max_width, const int max_height);
+function ipred_smooth_16bpc_neon, export=1
+        movrel          x10, X(sm_weights)
+        add             x11, x10, w4, uxtw        // Smooth predictor setup: x11 = sm_weights + height (vertical weights)
+        add             x10, x10, w3, uxtw        // x10 = sm_weights + width (horizontal weights)
+        clz             w9,  w3                   // clz(width) selects the width-specific code path
+        adr             x5,  L(ipred_smooth_tbl)
+        sub             x12, x2,  w4, uxtw #1     // x12 = topleft - height pixels (2 bytes per pixel)
+        sub             w9,  w9,  #25             // table index: width 64 -> 0, 32 -> 1, 16 -> 2, 8 -> 3, 4 -> 4
+        ldrh            w9,  [x5, w9, uxtw #1]    // 16-bit distance from the table back to the chosen label
+        ld1r            {v4.8h},  [x12] // bottom
+        add             x8,  x2,  #2              // x8 = topleft + 1 pixel, where the top row is read from
+        sub             x5,  x5,  w9, uxtw        // table address - distance = address of the label
+        add             x6,  x0,  x1              // x6 = dst + stride (second output row)
+        lsl             x1,  x1,  #1              // x1 = 2*stride
+        br              x5
+40:
+        sub             x2,  x2,  #8              // point x2 at the 4 pixels stored just before topleft
+        mov             x7,  #-8                  // post-index step: walk back 4 pixels per iteration
+        ld1r            {v6.2d}, [x8]             // top
+        ld1r            {v7.2s}, [x10]            // weights_hor
+        dup             v5.8h,   v6.h[3]          // right
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+        uxtl            v7.8h,   v7.8b            // weights_hor
+        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
+4:
+        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
+        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
+        ushll           v21.4s,  v31.4h,  #8
+        ushll           v22.4s,  v31.4h,  #8
+        ushll           v23.4s,  v31.4h,  #8
+        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
+        zip1            v0.2d,   v3.2d,   v2.2d   // v0 = rows 0-1, v1 = rows 2-3 of this group
+        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
+        zip1            v18.2s,  v18.2s,  v19.2s  // v16 = weights for rows 0-1, v18 = rows 2-3
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v18.8h,  v18.8b
+        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
+        smlal2          v21.4s,  v0.8h,   v7.8h
+        smlal           v22.4s,  v1.4h,   v7.4h
+        smlal2          v23.4s,  v1.8h,   v7.8h
+        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
+        smlal2          v21.4s,  v6.8h,   v16.8h
+        smlal           v22.4s,  v6.4h,   v18.4h
+        smlal2          v23.4s,  v6.8h,   v18.8h
+        rshrn           v20.4h,  v20.4s,  #9      // rounding shift: (sum + 256) >> 9, narrowed to 16 bit
+        rshrn           v21.4h,  v21.4s,  #9
+        rshrn           v22.4h,  v22.4s,  #9
+        rshrn           v23.4h,  v23.4s,  #9
+        st1             {v20.4h}, [x0], x1        // row 0
+        st1             {v21.4h}, [x6], x1        // row 1
+        subs            w4,  w4,  #4              // four rows produced per iteration
+        st1             {v22.4h}, [x0], x1        // row 2
+        st1             {v23.4h}, [x6], x1        // row 3
+        b.gt            4b
+        ret
+80:
+        sub             x2,  x2,  #8              // point x2 at the 4 pixels stored just before topleft
+        mov             x7,  #-8                  // post-index step: walk back 4 pixels per iteration
+        ld1             {v6.8h}, [x8]             // top
+        ld1             {v7.8b}, [x10]            // weights_hor
+        dup             v5.8h,   v6.h[7]          // right
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+        uxtl            v7.8h,   v7.8b            // weights_hor
+        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
+8:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
+        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
+        ushll           v21.4s,  v31.4h,  #8
+        ushll           v22.4s,  v31.4h,  #8
+        ushll           v23.4s,  v31.4h,  #8
+        ushll           v24.4s,  v31.4h,  #8
+        ushll           v25.4s,  v31.4h,  #8
+        ushll           v26.4s,  v31.4h,  #8
+        ushll           v27.4s,  v31.4h,  #8
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        sub             v2.8h,   v2.8h,   v5.8h
+        sub             v3.8h,   v3.8h,   v5.8h
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v17.8h,  v17.8b
+        uxtl            v18.8h,  v18.8b
+        uxtl            v19.8h,  v19.8b
+        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
+        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
+        smlal           v22.4s,  v2.4h,   v7.4h
+        smlal2          v23.4s,  v2.8h,   v7.8h
+        smlal           v24.4s,  v1.4h,   v7.4h
+        smlal2          v25.4s,  v1.8h,   v7.8h
+        smlal           v26.4s,  v0.4h,   v7.4h
+        smlal2          v27.4s,  v0.8h,   v7.8h
+        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
+        smlal2          v21.4s,  v6.8h,   v16.8h
+        smlal           v22.4s,  v6.4h,   v17.4h
+        smlal2          v23.4s,  v6.8h,   v17.8h
+        smlal           v24.4s,  v6.4h,   v18.4h
+        smlal2          v25.4s,  v6.8h,   v18.8h
+        smlal           v26.4s,  v6.4h,   v19.4h
+        smlal2          v27.4s,  v6.8h,   v19.8h
+        rshrn           v20.4h,  v20.4s,  #9      // rounding shift: (sum + 256) >> 9, narrowed to 16 bit
+        rshrn2          v20.8h,  v21.4s,  #9
+        rshrn           v21.4h,  v22.4s,  #9
+        rshrn2          v21.8h,  v23.4s,  #9
+        rshrn           v22.4h,  v24.4s,  #9
+        rshrn2          v22.8h,  v25.4s,  #9
+        rshrn           v23.4h,  v26.4s,  #9
+        rshrn2          v23.8h,  v27.4s,  #9
+        st1             {v20.8h}, [x0], x1        // row 0
+        st1             {v21.8h}, [x6], x1        // row 1
+        subs            w4,  w4,  #4              // four rows produced per iteration
+        st1             {v22.8h}, [x0], x1        // row 2
+        st1             {v23.8h}, [x6], x1        // row 3
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        add             x12, x2,  w3, uxtw #1     // x12 = topleft + width pixels
+        sub             x1,  x1,  w3, uxtw #1     // x1 = 2*stride - width*2 bytes; stores already advance along the row
+        ld1r            {v5.8h}, [x12]            // right
+        sub             x2,  x2,  #4              // point x2 at the 2 pixels stored just before topleft
+        mov             x7,  #-4                  // post-index step: walk back 2 pixels per iteration
+        mov             w9,  w3                   // keep width to restart the horizontal loop
+        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
+
+1:
+        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
+        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        uxtl            v16.8h,  v16.8b           // weights_ver
+        uxtl            v17.8h,  v17.8b
+2:
+        ld1             {v7.16b}, [x10],  #16     // weights_hor
+        ld1             {v2.8h, v3.8h}, [x8], #32 // top
+        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
+        ushll           v21.4s,  v31.4h,  #8
+        ushll           v22.4s,  v31.4h,  #8
+        ushll           v23.4s,  v31.4h,  #8
+        ushll           v24.4s,  v31.4h,  #8
+        ushll           v25.4s,  v31.4h,  #8
+        ushll           v26.4s,  v31.4h,  #8
+        ushll           v27.4s,  v31.4h,  #8
+        uxtl            v6.8h,   v7.8b            // weights_hor
+        uxtl2           v7.8h,   v7.16b
+        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
+        sub             v3.8h,   v3.8h,   v4.8h
+        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
+        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
+        smlal           v22.4s,  v1.4h,   v7.4h
+        smlal2          v23.4s,  v1.8h,   v7.8h
+        smlal           v24.4s,  v0.4h,   v6.4h
+        smlal2          v25.4s,  v0.8h,   v6.8h
+        smlal           v26.4s,  v0.4h,   v7.4h
+        smlal2          v27.4s,  v0.8h,   v7.8h
+        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
+        smlal2          v21.4s,  v2.8h,   v16.8h
+        smlal           v22.4s,  v3.4h,   v16.4h
+        smlal2          v23.4s,  v3.8h,   v16.8h
+        smlal           v24.4s,  v2.4h,   v17.4h
+        smlal2          v25.4s,  v2.8h,   v17.8h
+        smlal           v26.4s,  v3.4h,   v17.4h
+        smlal2          v27.4s,  v3.8h,   v17.8h
+        rshrn           v20.4h,  v20.4s,  #9      // rounding shift: (sum + 256) >> 9, narrowed to 16 bit
+        rshrn2          v20.8h,  v21.4s,  #9
+        rshrn           v21.4h,  v22.4s,  #9
+        rshrn2          v21.8h,  v23.4s,  #9
+        rshrn           v22.4h,  v24.4s,  #9
+        rshrn2          v22.8h,  v25.4s,  #9
+        rshrn           v23.4h,  v26.4s,  #9
+        rshrn2          v23.8h,  v27.4s,  #9
+        subs            w3,  w3,  #16             // 16 pixels of two rows done
+        st1             {v20.8h, v21.8h}, [x0], #32 // first row of the pair
+        st1             {v22.8h, v23.8h}, [x6], #32 // second row of the pair
+        b.gt            2b
+        subs            w4,  w4,  #2              // two rows produced per outer iteration
+        b.le            9f
+        sub             x8,  x8,  w9, uxtw #1     // rewind the top pointer by width pixels
+        sub             x10, x10, w9, uxtw        // rewind the weights_hor pointer by width bytes
+        add             x0,  x0,  x1              // advance both output pointers to the next row pair
+        add             x6,  x6,  x1
+        mov             w3,  w9                   // restore the horizontal counter
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_tbl):
+        .hword L(ipred_smooth_tbl) - 640b         // index 0: width 64
+        .hword L(ipred_smooth_tbl) - 320b         // index 1: width 32
+        .hword L(ipred_smooth_tbl) - 160b         // index 2: width 16
+        .hword L(ipred_smooth_tbl) -  80b         // index 3: width 8
+        .hword L(ipred_smooth_tbl) -  40b         // index 4: width 4
+endfunc
+
+// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                const pixel *const topleft,
+//                                const int width, const int height, const int a,
+//                                const int max_width, const int max_height);
+function ipred_smooth_v_16bpc_neon, export=1
+        movrel          x7,  X(sm_weights)
+        add             x7,  x7,  w4, uxtw        // Vertical-only smooth setup: x7 = sm_weights + height (vertical weights)
+        clz             w9,  w3                   // clz(width) selects the width-specific code path
+        adr             x5,  L(ipred_smooth_v_tbl)
+        sub             x8,  x2,  w4, uxtw #1     // x8 = topleft - height pixels (2 bytes per pixel)
+        sub             w9,  w9,  #25             // table index: width 64 -> 0, 32 -> 1, 16 -> 2, 8 -> 3, 4 -> 4
+        ldrh            w9,  [x5, w9, uxtw #1]    // 16-bit distance from the table back to the chosen label
+        ld1r            {v4.8h},  [x8] // bottom
+        add             x2,  x2,  #2              // x2 = topleft + 1 pixel, where the top row is read from
+        sub             x5,  x5,  w9, uxtw        // table address - distance = address of the label
+        add             x6,  x0,  x1              // x6 = dst + stride (second output row)
+        lsl             x1,  x1,  #1              // x1 = 2*stride
+        br              x5
+40:
+        ld1r            {v6.2d}, [x2]             // top
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+4:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
+        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
+        zip1            v18.2s,  v18.2s,  v19.2s  // v16 = weights for rows 0-1, v18 = rows 2-3
+        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
+        ushll           v18.8h,  v18.8b,  #7
+        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
+        sqrdmulh        v21.8h,  v6.8h,   v18.8h
+        add             v20.8h,  v20.8h,  v4.8h   // + bottom
+        add             v21.8h,  v21.8h,  v4.8h
+        st1             {v20.d}[0], [x0], x1      // row 0
+        st1             {v20.d}[1], [x6], x1      // row 1
+        subs            w4,  w4,  #4              // four rows produced per iteration
+        st1             {v21.d}[0], [x0], x1      // row 2
+        st1             {v21.d}[1], [x6], x1      // row 3
+        b.gt            4b
+        ret
+80:
+        ld1             {v6.8h}, [x2]             // top
+        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
+8:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
+        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
+        ushll           v17.8h,  v17.8b,  #7
+        ushll           v18.8h,  v18.8b,  #7
+        ushll           v19.8h,  v19.8b,  #7
+        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
+        sqrdmulh        v21.8h,  v6.8h,   v17.8h
+        sqrdmulh        v22.8h,  v6.8h,   v18.8h
+        sqrdmulh        v23.8h,  v6.8h,   v19.8h
+        add             v20.8h,  v20.8h,  v4.8h   // + bottom
+        add             v21.8h,  v21.8h,  v4.8h
+        add             v22.8h,  v22.8h,  v4.8h
+        add             v23.8h,  v23.8h,  v4.8h
+        st1             {v20.8h}, [x0], x1        // row 0
+        st1             {v21.8h}, [x6], x1        // row 1
+        subs            w4,  w4,  #4              // four rows produced per iteration
+        st1             {v22.8h}, [x0], x1        // row 2
+        st1             {v23.8h}, [x6], x1        // row 3
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        // Set up pointers for four rows in parallel; x0, x6, x5, x8
+        add             x5,  x0,  x1              // x1 is 2*stride here, so x5 = dst + 2*stride (row 2)
+        add             x8,  x6,  x1              // x8 (no longer needed for bottom) = dst + 3*stride (row 3)
+        lsl             x1,  x1,  #1              // x1 = 4*stride
+        sub             x1,  x1,  w3, uxtw #1     // minus width*2 bytes; stores already advance along the row
+        mov             w9,  w3                   // keep width to restart the horizontal loop
+
+1:
+        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
+        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
+        ushll           v17.8h,  v17.8b,  #7
+        ushll           v18.8h,  v18.8b,  #7
+        ushll           v19.8h,  v19.8b,  #7
+2:
+        ld1             {v2.8h, v3.8h}, [x2], #32 // top
+        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
+        sub             v3.8h,   v3.8h,   v4.8h
+        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
+        sqrdmulh        v21.8h,  v3.8h,   v16.8h
+        sqrdmulh        v22.8h,  v2.8h,   v17.8h
+        sqrdmulh        v23.8h,  v3.8h,   v17.8h
+        sqrdmulh        v24.8h,  v2.8h,   v18.8h
+        sqrdmulh        v25.8h,  v3.8h,   v18.8h
+        sqrdmulh        v26.8h,  v2.8h,   v19.8h
+        sqrdmulh        v27.8h,  v3.8h,   v19.8h
+        add             v20.8h,  v20.8h,  v4.8h   // + bottom
+        add             v21.8h,  v21.8h,  v4.8h
+        add             v22.8h,  v22.8h,  v4.8h
+        add             v23.8h,  v23.8h,  v4.8h
+        add             v24.8h,  v24.8h,  v4.8h
+        add             v25.8h,  v25.8h,  v4.8h
+        add             v26.8h,  v26.8h,  v4.8h
+        add             v27.8h,  v27.8h,  v4.8h
+        subs            w3,  w3,  #16             // 16 pixels of four rows done
+        st1             {v20.8h, v21.8h}, [x0], #32 // row 0
+        st1             {v22.8h, v23.8h}, [x6], #32 // row 1
+        st1             {v24.8h, v25.8h}, [x5], #32 // row 2
+        st1             {v26.8h, v27.8h}, [x8], #32 // row 3
+        b.gt            2b
+        subs            w4,  w4,  #4              // four rows produced per outer iteration
+        b.le            9f
+        sub             x2,  x2,  w9, uxtw #1     // rewind the top pointer by width pixels
+        add             x0,  x0,  x1              // advance all four output pointers to the next group of rows
+        add             x6,  x6,  x1
+        add             x5,  x5,  x1
+        add             x8,  x8,  x1
+        mov             w3,  w9                   // restore the horizontal counter
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_v_tbl):
+        .hword L(ipred_smooth_v_tbl) - 640b       // index 0: width 64
+        .hword L(ipred_smooth_v_tbl) - 320b       // index 1: width 32
+        .hword L(ipred_smooth_v_tbl) - 160b       // index 2: width 16
+        .hword L(ipred_smooth_v_tbl) -  80b       // index 3: width 8
+        .hword L(ipred_smooth_v_tbl) -  40b       // index 4: width 4
+endfunc
+
+// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                const pixel *const topleft,
+//                                const int width, const int height, const int a,
+//                                const int max_width, const int max_height);
+function ipred_smooth_h_16bpc_neon, export=1
+        movrel          x8,  X(sm_weights)
+        add             x8,  x8,  w3, uxtw        // Horizontal-only smooth setup: x8 = sm_weights + width (horizontal weights)
+        clz             w9,  w3                   // clz(width) selects the width-specific code path
+        adr             x5,  L(ipred_smooth_h_tbl)
+        add             x12, x2,  w3, uxtw #1     // x12 = topleft + width pixels (2 bytes per pixel)
+        sub             w9,  w9,  #25             // table index: width 64 -> 0, 32 -> 1, 16 -> 2, 8 -> 3, 4 -> 4
+        ldrh            w9,  [x5, w9, uxtw #1]    // 16-bit distance from the table back to the chosen label
+        ld1r            {v5.8h},  [x12] // right
+        sub             x5,  x5,  w9, uxtw        // table address - distance = address of the label
+        add             x6,  x0,  x1              // x6 = dst + stride (second output row)
+        lsl             x1,  x1,  #1              // x1 = 2*stride
+        br              x5
+40:
+        ld1r            {v7.2s}, [x8]             // weights_hor
+        sub             x2,  x2,  #8              // point x2 at the 4 pixels stored just before topleft
+        mov             x7,  #-8                  // post-index step: walk back 4 pixels per iteration
+        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
+4:
+        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
+        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
+        zip1            v0.2d,   v3.2d,   v2.2d   // v0 = rows 0-1, v1 = rows 2-3 of this group
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
+        sqrdmulh        v21.8h,  v1.8h,   v7.8h
+        add             v20.8h,  v20.8h,  v5.8h   // + right
+        add             v21.8h,  v21.8h,  v5.8h
+        st1             {v20.d}[0], [x0], x1      // row 0
+        st1             {v20.d}[1], [x6], x1      // row 1
+        subs            w4,  w4,  #4              // four rows produced per iteration
+        st1             {v21.d}[0], [x0], x1      // row 2
+        st1             {v21.d}[1], [x6], x1      // row 3
+        b.gt            4b
+        ret
+80:
+        ld1             {v7.8b}, [x8]             // weights_hor
+        sub             x2,  x2,  #8              // point x2 at the 4 pixels stored just before topleft
+        mov             x7,  #-8                  // post-index step: walk back 4 pixels per iteration
+        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
+8:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
+        sub             v3.8h,   v3.8h,   v5.8h   // left-right
+        sub             v2.8h,   v2.8h,   v5.8h
+        sub             v1.8h,   v1.8h,   v5.8h
+        sub             v0.8h,   v0.8h,   v5.8h
+        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
+        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
+        sqrdmulh        v22.8h,  v1.8h,   v7.8h
+        sqrdmulh        v23.8h,  v0.8h,   v7.8h
+        add             v20.8h,  v20.8h,  v5.8h   // + right
+        add             v21.8h,  v21.8h,  v5.8h
+        add             v22.8h,  v22.8h,  v5.8h
+        add             v23.8h,  v23.8h,  v5.8h
+        st1             {v20.8h}, [x0], x1        // row 0
+        st1             {v21.8h}, [x6], x1        // row 1
+        subs            w4,  w4,  #4              // four rows produced per iteration
+        st1             {v22.8h}, [x0], x1        // row 2
+        st1             {v23.8h}, [x6], x1        // row 3
+        b.gt            8b
+        ret
+160:
+320:
+640:
+        sub             x2,  x2,  #8              // point x2 at the 4 pixels stored just before topleft
+        mov             x7,  #-8                  // post-index step: walk back 4 pixels per iteration
+        // Set up pointers for four rows in parallel; x0, x6, x5, x10
+        add             x5,  x0,  x1              // x1 is 2*stride here, so x5 = dst + 2*stride (row 2)
+        add             x10, x6,  x1              // x10 = dst + 3*stride (row 3)
+        lsl             x1,  x1,  #1              // x1 = 4*stride
+        sub             x1,  x1,  w3, uxtw #1     // minus width*2 bytes; stores already advance along the row
+        mov             w9,  w3                   // keep width to restart the horizontal loop
+
+1:
+        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
+        sub             v0.8h,   v0.8h,   v5.8h   // left-right
+        sub             v1.8h,   v1.8h,   v5.8h
+        sub             v2.8h,   v2.8h,   v5.8h
+        sub             v3.8h,   v3.8h,   v5.8h
+2:
+        ld1             {v7.16b}, [x8],   #16     // weights_hor
+        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
+        ushll2          v7.8h,   v7.16b,  #7
+        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
+        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
+        sqrdmulh        v22.8h,  v2.8h,   v6.8h
+        sqrdmulh        v23.8h,  v2.8h,   v7.8h
+        sqrdmulh        v24.8h,  v1.8h,   v6.8h
+        sqrdmulh        v25.8h,  v1.8h,   v7.8h
+        sqrdmulh        v26.8h,  v0.8h,   v6.8h
+        sqrdmulh        v27.8h,  v0.8h,   v7.8h
+        add             v20.8h,  v20.8h,  v5.8h   // + right
+        add             v21.8h,  v21.8h,  v5.8h
+        add             v22.8h,  v22.8h,  v5.8h
+        add             v23.8h,  v23.8h,  v5.8h
+        add             v24.8h,  v24.8h,  v5.8h
+        add             v25.8h,  v25.8h,  v5.8h
+        add             v26.8h,  v26.8h,  v5.8h
+        add             v27.8h,  v27.8h,  v5.8h
+        subs            w3,  w3,  #16             // 16 pixels of four rows done
+        st1             {v20.8h, v21.8h}, [x0],  #32 // row 0
+        st1             {v22.8h, v23.8h}, [x6],  #32 // row 1
+        st1             {v24.8h, v25.8h}, [x5],  #32 // row 2
+        st1             {v26.8h, v27.8h}, [x10], #32 // row 3
+        b.gt            2b
+        subs            w4,  w4,  #4              // four rows produced per outer iteration
+        b.le            9f
+        sub             x8,  x8,  w9, uxtw        // rewind the weights_hor pointer by width bytes
+        add             x0,  x0,  x1              // advance all four output pointers to the next group of rows
+        add             x6,  x6,  x1
+        add             x5,  x5,  x1
+        add             x10, x10, x1
+        mov             w3,  w9                   // restore the horizontal counter
+        b               1b
+9:
+        ret
+
+L(ipred_smooth_h_tbl):
+        .hword L(ipred_smooth_h_tbl) - 640b       // index 0: width 64
+        .hword L(ipred_smooth_h_tbl) - 320b       // index 1: width 32
+        .hword L(ipred_smooth_h_tbl) - 160b       // index 2: width 16
+        .hword L(ipred_smooth_h_tbl) -  80b       // index 3: width 8
+        .hword L(ipred_smooth_h_tbl) -  40b       // index 4: width 4
+endfunc
+
+// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                              const pixel *const topleft,
+//                              const int width, const int height, const int filt_idx,
+//                              const int max_width, const int max_height,
+//                              const int bitdepth_max);
+.macro filter_fn bpc
+function ipred_filter_\bpc\()bpc_neon
+        and             w5,  w5,  #511
+        movrel          x6,  X(filter_intra_taps)
+        lsl             w5,  w5,  #6
+        add             x6,  x6,  w5, uxtw
+        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
+        clz             w9,  w3
+        adr             x5,  L(ipred_filter\bpc\()_tbl)
+        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
+        sub             w9,  w9,  #26
+        ldrh            w9,  [x5, w9, uxtw #1]
+        sxtl            v16.8h,  v16.8b
+        sxtl            v17.8h,  v17.8b
+        sub             x5,  x5,  w9, uxtw
+        sxtl            v18.8h,  v18.8b
+        sxtl            v19.8h,  v19.8b
+        add             x6,  x0,  x1
+        lsl             x1,  x1,  #1
+        sxtl            v20.8h,  v20.8b
+        sxtl            v21.8h,  v21.8b
+        sxtl            v22.8h,  v22.8b
+        dup             v31.8h,  w8
+        movi            v30.8h,  #0
+        br              x5
+40:
+        ldur            d0,  [x2, #2]             // top (0-3)
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+4:
+        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
+.if \bpc == 10
+        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        srshr           v2.8h,   v2.8h,   #4
+        smax            v2.8h,   v2.8h,   v30.8h
+.else
+        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
+        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        sqrshrun        v2.4h,   v2.4s,   #4
+        sqrshrun2       v2.8h,   v3.4s,   #4
+.endif
+        smin            v2.8h,   v2.8h,   v31.8h
+        subs            w4,  w4,  #2
+        st1             {v2.d}[0], [x0], x1
+        uxtl            v0.8h,   v2.8b
+        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
+        st1             {v2.d}[1], [x6], x1
+        b.gt            4b
+        ret
+80:
+        ldur            q0,  [x2, #2]             // top (0-7)
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+8:
+        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
+.if \bpc == 10
+        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
+        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
+        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
+        srshr           v2.8h,   v2.8h,   #4
+        smax            v2.8h,   v2.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
+        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
+        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
+        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
+        srshr           v3.8h,   v3.8h,   #4
+        smax            v3.8h,   v3.8h,   v30.8h
+.else
+        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
+        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
+        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
+        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
+        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
+        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
+        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
+        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
+        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
+        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
+        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v2.4h,   v2.4s,   #4
+        sqrshrun2       v2.8h,   v3.4s,   #4
+        smin            v2.8h,   v2.8h,   v31.8h
+        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
+        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
+        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
+        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
+        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
+        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
+        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
+        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
+        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
+        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
+        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
+        sqrshrun        v3.4h,   v4.4s,   #4
+        sqrshrun2       v3.8h,   v5.4s,   #4
+.endif
+        smin            v3.8h,   v3.8h,   v31.8h
+        subs            w4,  w4,  #2
+        st2             {v2.d, v3.d}[0], [x0], x1
+        zip2            v0.2d,   v2.2d,   v3.2d
+        st2             {v2.d, v3.d}[1], [x6], x1
+        b.gt            8b
+        ret
+160:
+320:
+        add             x8,  x2,  #2
+        sub             x2,  x2,  #4
+        mov             x7,  #-4
+        sub             x1,  x1,  w3, uxtw #1
+        mov             w9,  w3
+
+1:
+        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
+2:
+        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
+.if \bpc == 10
+        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
+        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
+        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
+        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
+        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
+        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
+        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
+
+        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
+        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
+        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
+        srshr           v3.8h,   v3.8h,   #4
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
+        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
+        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
+        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
+
+        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
+        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
+        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
+        srshr           v4.8h,   v4.8h,   #4
+        smax            v4.8h,   v4.8h,   v30.8h
+        smin            v4.8h,   v4.8h,   v31.8h
+        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
+        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
+        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
+        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
+
+        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
+        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
+        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
+        srshr           v5.8h,   v5.8h,   #4
+        smax            v5.8h,   v5.8h,   v30.8h
+        smin            v5.8h,   v5.8h,   v31.8h
+        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
+        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
+        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
+        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
+
+        subs            w3,  w3,  #16
+        srshr           v6.8h,   v6.8h,   #4
+        smax            v6.8h,   v6.8h,   v30.8h
+.else
+        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
+        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
+        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
+        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
+        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
+        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
+        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
+        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
+        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
+        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
+        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
+        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
+        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
+        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
+
+        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
+        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
+        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v3.4h,   v3.4s,   #4
+        sqrshrun2       v3.8h,   v4.4s,   #4
+        smin            v3.8h,   v3.8h,   v31.8h
+        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
+        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
+        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
+        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
+        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
+        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
+        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
+        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
+        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
+        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
+        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
+
+        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
+        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
+        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
+        sqrshrun        v4.4h,   v5.4s,   #4
+        sqrshrun2       v4.8h,   v6.4s,   #4
+        smin            v4.8h,   v4.8h,   v31.8h
+        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
+        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
+        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
+        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
+        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
+        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
+        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
+        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
+        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
+        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
+        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
+
+        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
+        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
+        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
+        sqrshrun        v5.4h,   v24.4s,  #4
+        sqrshrun2       v5.8h,   v25.4s,  #4
+        smin            v5.8h,   v5.8h,   v31.8h
+        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
+        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
+        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
+        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
+        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
+        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
+        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
+        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
+        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
+        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
+        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
+
+        subs            w3,  w3,  #16
+        sqrshrun        v6.4h,   v26.4s,  #4
+        sqrshrun2       v6.8h,   v27.4s,  #4
+.endif
+        smin            v6.8h,   v6.8h,   v31.8h
+
+        ins             v0.h[2], v2.h[7]
+        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
+        ins             v0.h[0], v6.h[7]
+        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
+        ins             v0.h[1], v6.h[3]
+        b.gt            2b
+        subs            w4,  w4,  #2
+        b.le            9f
+        sub             x8,  x6,  w9, uxtw #1
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        mov             w3,  w9
+        b               1b
+9:
+        ret
+
+L(ipred_filter\bpc\()_tbl):
+        .hword L(ipred_filter\bpc\()_tbl) - 320b
+        .hword L(ipred_filter\bpc\()_tbl) - 160b
+        .hword L(ipred_filter\bpc\()_tbl) -  80b
+        .hword L(ipred_filter\bpc\()_tbl) -  40b
+endfunc
+.endm
+
+filter_fn 10 // expand the filter_fn macro with bpc=10; presumably defines ipred_filter_10bpc_neon (macro header not visible here - confirm)
+filter_fn 12 // same macro with bpc=12; presumably defines ipred_filter_12bpc_neon, the other branch target used below
+
+function ipred_filter_16bpc_neon, export=1 // Exported entry point: picks the 10 bpc or 12 bpc filter routine and tail-branches to it
+        ldr             w8,  [sp] // 32-bit value at the top of the stack; presumably bitdepth_max passed as a stack argument (prototype not visible here - confirm)
+        cmp             w8,  0x3ff // 0x3ff = 1023, the largest 10-bit value
+        b.le            ipred_filter_10bpc_neon // w8 <= 0x3ff: use the 10 bpc variant (plain branch, lr is left untouched)
+        b               ipred_filter_12bpc_neon // otherwise use the 12 bpc variant
+endfunc
+
+// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                          const uint16_t *const pal, const uint8_t *idx,
+//                          const int w, const int h);
+function pal_pred_16bpc_neon, export=1 // Palette prediction: replaces every 8-bit index read from idx with the 16-bit pal[] entry it selects and writes it to dst
+        ld1             {v30.8h}, [x2] // v30 = eight 16-bit palette entries, used below as a 16-byte tbl table
+        clz             w9,  w4 // clz(w) is 25 for w=64 ... 29 for w=4
+        adr             x6,  L(pal_pred_tbl)
+        sub             w9,  w9,  #25 // jump table index: 0 (w=64) .. 4 (w=4)
+        ldrh            w9,  [x6, w9, uxtw #1] // 16-bit distance from the table back to the width handler
+        movi            v31.8h,  #1, lsl #8 // 0x0100 in every halfword: adds 1 to the odd byte lane of each byte pair
+        sub             x6,  x6,  w9, uxtw // handler address = table address - distance
+        br              x6
+40:
+        add             x2,  x0,  x1 // x2 = second row (pal was already loaded into v30, so x2 is free)
+        lsl             x1,  x1,  #1 // both row pointers now advance two rows per store
+4:
+        ld1             {v1.16b}, [x3], #16 // 16 indices = 4 rows of 4 pixels
+        subs            w5,  w5,  #4 // 4 rows per iteration; flags are consumed by b.gt at the end of the loop
+        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
+        add             v1.16b,  v1.16b,  v1.16b // a -> 2*a, the byte offset of a 16-bit palette entry
+        zip1            v0.16b,  v1.16b,  v1.16b // duplicate every byte: 2*a, 2*a, 2*b, 2*b, ... (low half)
+        zip2            v1.16b,  v1.16b,  v1.16b // same for the high half
+        add             v0.8h,   v0.8h,   v31.8h // odd byte of each pair becomes 2*a+1
+        add             v1.8h,   v1.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b // byte-wise lookup fetches both bytes of each selected entry
+        st1             {v0.d}[0], [x0], x1 // row 0 (4 pixels = 8 bytes)
+        tbl             v1.16b, {v30.16b}, v1.16b
+        st1             {v0.d}[1], [x2], x1 // row 1
+        st1             {v1.d}[0], [x0], x1 // row 2
+        st1             {v1.d}[1], [x2], x1 // row 3
+        b.gt            4b
+        ret
+80:
+        add             x2,  x0,  x1 // x2 = second row
+        lsl             x1,  x1,  #1 // doubled stride for the alternating x0/x2 stores
+8:
+        ld1             {v2.16b, v3.16b}, [x3], #32 // 32 indices = 4 rows of 8 pixels
+        subs            w5,  w5,  #4 // 4 rows per iteration
+        add             v2.16b,  v2.16b,  v2.16b // index -> byte offset (2*a)
+        add             v3.16b,  v3.16b,  v3.16b
+        zip1            v0.16b,  v2.16b,  v2.16b // duplicate bytes; v0..v3 each end up holding one row
+        zip2            v1.16b,  v2.16b,  v2.16b
+        zip1            v2.16b,  v3.16b,  v3.16b
+        zip2            v3.16b,  v3.16b,  v3.16b
+        add             v0.8h,   v0.8h,   v31.8h // form the (2*a, 2*a+1) byte pairs
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b // palette lookups, interleaved with the stores
+        tbl             v1.16b, {v30.16b}, v1.16b
+        st1             {v0.8h}, [x0], x1 // row 0
+        tbl             v2.16b, {v30.16b}, v2.16b
+        st1             {v1.8h}, [x2], x1 // row 1
+        tbl             v3.16b, {v30.16b}, v3.16b
+        st1             {v2.8h}, [x0], x1 // row 2
+        st1             {v3.8h}, [x2], x1 // row 3
+        b.gt            8b
+        ret
+160:
+        add             x2,  x0,  x1 // x2 = second row
+        lsl             x1,  x1,  #1 // doubled stride
+16:
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 // 64 indices = 4 rows of 16 pixels
+        subs            w5,  w5,  #4 // 4 rows per iteration
+        add             v4.16b,  v4.16b,  v4.16b // index -> byte offset (2*a)
+        add             v5.16b,  v5.16b,  v5.16b
+        add             v6.16b,  v6.16b,  v6.16b
+        add             v7.16b,  v7.16b,  v7.16b
+        zip1            v0.16b,  v4.16b,  v4.16b // duplicate bytes; v0..v7 each hold 8 pixels worth of offsets
+        zip2            v1.16b,  v4.16b,  v4.16b
+        zip1            v2.16b,  v5.16b,  v5.16b
+        zip2            v3.16b,  v5.16b,  v5.16b
+        zip1            v4.16b,  v6.16b,  v6.16b // v6 was fully consumed before v4/v5 are overwritten
+        zip2            v5.16b,  v6.16b,  v6.16b
+        zip1            v6.16b,  v7.16b,  v7.16b
+        zip2            v7.16b,  v7.16b,  v7.16b
+        add             v0.8h,   v0.8h,   v31.8h // form the (2*a, 2*a+1) byte pairs
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        add             v4.8h,   v4.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b // lookups are interleaved with the remaining adds and the stores
+        add             v5.8h,   v5.8h,   v31.8h
+        tbl             v1.16b, {v30.16b}, v1.16b
+        add             v6.8h,   v6.8h,   v31.8h
+        tbl             v2.16b, {v30.16b}, v2.16b
+        add             v7.8h,   v7.8h,   v31.8h
+        tbl             v3.16b, {v30.16b}, v3.16b
+        tbl             v4.16b, {v30.16b}, v4.16b
+        tbl             v5.16b, {v30.16b}, v5.16b
+        st1             {v0.8h, v1.8h}, [x0], x1 // row 0 (16 pixels = 32 bytes)
+        tbl             v6.16b, {v30.16b}, v6.16b
+        st1             {v2.8h, v3.8h}, [x2], x1 // row 1
+        tbl             v7.16b, {v30.16b}, v7.16b
+        st1             {v4.8h, v5.8h}, [x0], x1 // row 2
+        st1             {v6.8h, v7.8h}, [x2], x1 // row 3
+        b.gt            16b
+        ret
+320:
+        add             x2,  x0,  x1 // x2 = second row
+        lsl             x1,  x1,  #1 // doubled stride
+32:
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 // 64 indices = 2 rows of 32 pixels
+        subs            w5,  w5,  #2 // 2 rows per iteration
+        add             v4.16b,  v4.16b,  v4.16b // index -> byte offset (2*a)
+        add             v5.16b,  v5.16b,  v5.16b
+        add             v6.16b,  v6.16b,  v6.16b
+        add             v7.16b,  v7.16b,  v7.16b
+        zip1            v0.16b,  v4.16b,  v4.16b // duplicate bytes; v0..v3 = first row, v4..v7 = second row
+        zip2            v1.16b,  v4.16b,  v4.16b
+        zip1            v2.16b,  v5.16b,  v5.16b
+        zip2            v3.16b,  v5.16b,  v5.16b
+        zip1            v4.16b,  v6.16b,  v6.16b
+        zip2            v5.16b,  v6.16b,  v6.16b
+        zip1            v6.16b,  v7.16b,  v7.16b
+        zip2            v7.16b,  v7.16b,  v7.16b
+        add             v0.8h,   v0.8h,   v31.8h // form the (2*a, 2*a+1) byte pairs
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        add             v4.8h,   v4.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b // palette lookups
+        add             v5.8h,   v5.8h,   v31.8h
+        tbl             v1.16b, {v30.16b}, v1.16b
+        add             v6.8h,   v6.8h,   v31.8h
+        tbl             v2.16b, {v30.16b}, v2.16b
+        add             v7.8h,   v7.8h,   v31.8h
+        tbl             v3.16b, {v30.16b}, v3.16b
+        tbl             v4.16b, {v30.16b}, v4.16b
+        tbl             v5.16b, {v30.16b}, v5.16b
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 // row 0 (32 pixels = 64 bytes)
+        tbl             v6.16b, {v30.16b}, v6.16b
+        tbl             v7.16b, {v30.16b}, v7.16b
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 // row 1
+        b.gt            32b
+        ret
+640:
+        add             x2,  x0,  #64 // x2 = right half of the same row (64 bytes = 32 pixels in); stride is not doubled here
+64:
+        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64 // 64 indices = 1 row of 64 pixels
+        subs            w5,  w5,  #1 // 1 row per iteration
+        add             v4.16b,  v4.16b,  v4.16b // index -> byte offset (2*a)
+        add             v5.16b,  v5.16b,  v5.16b
+        add             v6.16b,  v6.16b,  v6.16b
+        add             v7.16b,  v7.16b,  v7.16b
+        zip1            v0.16b,  v4.16b,  v4.16b // duplicate bytes; v0..v3 = left half, v4..v7 = right half
+        zip2            v1.16b,  v4.16b,  v4.16b
+        zip1            v2.16b,  v5.16b,  v5.16b
+        zip2            v3.16b,  v5.16b,  v5.16b
+        zip1            v4.16b,  v6.16b,  v6.16b
+        zip2            v5.16b,  v6.16b,  v6.16b
+        zip1            v6.16b,  v7.16b,  v7.16b
+        zip2            v7.16b,  v7.16b,  v7.16b
+        add             v0.8h,   v0.8h,   v31.8h // form the (2*a, 2*a+1) byte pairs
+        add             v1.8h,   v1.8h,   v31.8h
+        add             v2.8h,   v2.8h,   v31.8h
+        add             v3.8h,   v3.8h,   v31.8h
+        add             v4.8h,   v4.8h,   v31.8h
+        tbl             v0.16b, {v30.16b}, v0.16b // palette lookups
+        add             v5.8h,   v5.8h,   v31.8h
+        tbl             v1.16b, {v30.16b}, v1.16b
+        add             v6.8h,   v6.8h,   v31.8h
+        tbl             v2.16b, {v30.16b}, v2.16b
+        add             v7.8h,   v7.8h,   v31.8h
+        tbl             v3.16b, {v30.16b}, v3.16b
+        tbl             v4.16b, {v30.16b}, v4.16b
+        tbl             v5.16b, {v30.16b}, v5.16b
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 // left 32 pixels, then step one row down
+        tbl             v6.16b, {v30.16b}, v6.16b
+        tbl             v7.16b, {v30.16b}, v7.16b
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 // right 32 pixels, then step one row down
+        b.gt            64b
+        ret
+
+L(pal_pred_tbl):
+        .hword L(pal_pred_tbl) - 640b // index 0: w = 64
+        .hword L(pal_pred_tbl) - 320b // index 1: w = 32
+        .hword L(pal_pred_tbl) - 160b // index 2: w = 16
+        .hword L(pal_pred_tbl) -  80b // index 3: w = 8
+        .hword L(pal_pred_tbl) -  40b // index 4: w = 4
+endfunc
+
+// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height,
+//                               const int16_t *ac, const int alpha,
+//                               const int bitdepth_max);
+function ipred_cfl_128_16bpc_neon, export=1 // CfL prediction with a fixed mid-range dc; also hosts the splat routines and table the other ipred_cfl_* functions branch into
+        dup             v31.8h,  w7   // bitdepth_max
+        clz             w9,  w3 // clz(width): 26 for 32 ... 29 for 4
+        adr             x7,  L(ipred_cfl_128_tbl)
+        sub             w9,  w9,  #26 // jump table index: 0 (w=32) .. 3 (w=4)
+        ldrh            w9,  [x7, w9, uxtw #1] // 16-bit distance from the table back to the splat routine
+        urshr           v0.8h,   v31.8h,  #1 // dc = (bitdepth_max + 1) >> 1
+        dup             v1.8h,   w6   // alpha
+        sub             x7,  x7,  w9, uxtw // routine address = table address - distance
+        add             x6,  x0,  x1 // x6 = second destination row
+        lsl             x1,  x1,  #1 // doubled stride: x0 and x6 each advance two rows at a time
+        movi            v30.8h,  #0 // lower clamp bound
+        br              x7
+L(ipred_cfl_splat_w4): // expects v0 = dc, v1 = alpha, v30 = 0, v31 = bitdepth_max, x6 = dst + stride, x1 = 2 * stride
+        ld1             {v4.8h, v5.8h}, [x5], #32 // 16 ac coefficients = 4 rows of 4
+        subs            w4,  w4,  #4 // 4 rows per iteration; flags are consumed by b.gt at the end of the loop
+        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
+        smull2          v3.4s,   v4.8h,   v1.8h
+        smull           v4.4s,   v5.4h,   v1.4h
+        smull2          v5.4s,   v5.8h,   v1.8h
+        sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
+        sshr            v17.4s,  v3.4s,   #31
+        sshr            v18.4s,  v4.4s,   #31
+        sshr            v19.4s,  v5.4s,   #31
+        add             v2.4s,   v2.4s,   v16.4s // diff + sign
+        add             v3.4s,   v3.4s,   v17.4s
+        add             v4.4s,   v4.4s,   v18.4s
+        add             v5.4s,   v5.4s,   v19.4s
+        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
+        rshrn2          v2.8h,   v3.4s,   #6
+        rshrn           v3.4h,   v4.4s,   #6
+        rshrn2          v3.8h,   v5.4s,   #6
+        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
+        add             v3.8h,   v3.8h,   v0.8h
+        smax            v2.8h,   v2.8h,   v30.8h // clamp below at 0
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h // clamp above at bitdepth_max
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.d}[0],  [x0], x1 // row 0 (4 pixels = 8 bytes)
+        st1             {v2.d}[1],  [x6], x1 // row 1
+        st1             {v3.d}[0],  [x0], x1 // row 2
+        st1             {v3.d}[1],  [x6], x1 // row 3
+        b.gt            L(ipred_cfl_splat_w4)
+        ret
+L(ipred_cfl_splat_w8): // same register contract as L(ipred_cfl_splat_w4)
+        ld1             {v4.8h, v5.8h}, [x5], #32 // 16 ac coefficients = 2 rows of 8
+        subs            w4,  w4,  #2 // 2 rows per iteration
+        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
+        smull2          v3.4s,   v4.8h,   v1.8h
+        smull           v4.4s,   v5.4h,   v1.4h
+        smull2          v5.4s,   v5.8h,   v1.8h
+        sshr            v16.4s,  v2.4s,   #31    // sign = diff >> 31
+        sshr            v17.4s,  v3.4s,   #31
+        sshr            v18.4s,  v4.4s,   #31
+        sshr            v19.4s,  v5.4s,   #31
+        add             v2.4s,   v2.4s,   v16.4s // diff + sign
+        add             v3.4s,   v3.4s,   v17.4s
+        add             v4.4s,   v4.4s,   v18.4s
+        add             v5.4s,   v5.4s,   v19.4s
+        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
+        rshrn2          v2.8h,   v3.4s,   #6
+        rshrn           v3.4h,   v4.4s,   #6
+        rshrn2          v3.8h,   v5.4s,   #6
+        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
+        add             v3.8h,   v3.8h,   v0.8h
+        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.8h},  [x0], x1 // row 0 (8 pixels = 16 bytes)
+        st1             {v3.8h},  [x6], x1 // row 1
+        b.gt            L(ipred_cfl_splat_w8)
+        ret
+L(ipred_cfl_splat_w16): // same register contract; additionally reads width from w3 and handles any multiple of 16
+        add             x7,  x5,  w3, uxtw #1 // x7 = second row of ac (width * 2 bytes after x5)
+        sub             x1,  x1,  w3, uxtw #1 // x1 = 2 * stride minus one row of pixels, because x0/x6 are post-incremented across the row
+        mov             w9,  w3 // keep width for restarting the inner loop
+1:
+        ld1             {v2.8h, v3.8h}, [x5], #32 // 16 coefficients of the first row
+        ld1             {v4.8h, v5.8h}, [x7], #32 // 16 coefficients of the second row
+        subs            w3,  w3,  #16 // 16 columns per inner iteration
+        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
+        smull2          v17.4s,  v2.8h,   v1.8h
+        smull           v18.4s,  v3.4h,   v1.4h
+        smull2          v19.4s,  v3.8h,   v1.8h
+        smull           v2.4s,   v4.4h,   v1.4h
+        smull2          v3.4s,   v4.8h,   v1.8h
+        smull           v4.4s,   v5.4h,   v1.4h
+        smull2          v5.4s,   v5.8h,   v1.8h
+        sshr            v20.4s,  v16.4s,  #31    // sign = diff >> 31
+        sshr            v21.4s,  v17.4s,  #31
+        sshr            v22.4s,  v18.4s,  #31
+        sshr            v23.4s,  v19.4s,  #31
+        sshr            v24.4s,  v2.4s,   #31
+        sshr            v25.4s,  v3.4s,   #31
+        sshr            v26.4s,  v4.4s,   #31
+        sshr            v27.4s,  v5.4s,   #31
+        add             v16.4s,  v16.4s,  v20.4s // diff + sign
+        add             v17.4s,  v17.4s,  v21.4s
+        add             v18.4s,  v18.4s,  v22.4s
+        add             v19.4s,  v19.4s,  v23.4s
+        add             v2.4s,   v2.4s,   v24.4s
+        add             v3.4s,   v3.4s,   v25.4s
+        add             v4.4s,   v4.4s,   v26.4s
+        add             v5.4s,   v5.4s,   v27.4s
+        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
+        rshrn2          v16.8h,  v17.4s,  #6
+        rshrn           v17.4h,  v18.4s,  #6
+        rshrn2          v17.8h,  v19.4s,  #6
+        rshrn           v6.4h,   v2.4s,   #6
+        rshrn2          v6.8h,   v3.4s,   #6
+        rshrn           v7.4h,   v4.4s,   #6
+        rshrn2          v7.8h,   v5.4s,   #6
+        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
+        add             v3.8h,   v17.8h,  v0.8h
+        add             v4.8h,   v6.8h,   v0.8h
+        add             v5.8h,   v7.8h,   v0.8h
+        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
+        smax            v3.8h,   v3.8h,   v30.8h
+        smax            v4.8h,   v4.8h,   v30.8h
+        smax            v5.8h,   v5.8h,   v30.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        smin            v4.8h,   v4.8h,   v31.8h
+        smin            v5.8h,   v5.8h,   v31.8h
+        st1             {v2.8h, v3.8h},  [x0], #32 // 16 pixels of the first row
+        st1             {v4.8h, v5.8h},  [x6], #32 // 16 pixels of the second row
+        b.gt            1b // more columns left in this row pair
+        subs            w4,  w4,  #2 // 2 rows done; the adds/mov below do not touch the flags used by the final b.gt
+        add             x5,  x5,  w9, uxtw #1 // each ac pointer skips the row the other pointer just consumed
+        add             x7,  x7,  w9, uxtw #1
+        add             x0,  x0,  x1 // advance to the start of the next row pair
+        add             x6,  x6,  x1
+        mov             w3,  w9 // restore the column counter
+        b.gt            1b
+        ret
+
+L(ipred_cfl_128_tbl):
+L(ipred_cfl_splat_tbl): // second name for the same table; ipred_cfl_left_16bpc_neon indexes it through this label
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) // index 0: w = 32 (the w16 routine loops over the width)
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) // index 1: w = 16
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) // index 2: w = 8
+        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) // index 3: w = 4
+endfunc
+
+// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                               const pixel *const topleft,
+//                               const int width, const int height,
+//                               const int16_t *ac, const int alpha,
+//                               const int bitdepth_max);
+function ipred_cfl_top_16bpc_neon, export=1 // CfL prediction whose dc is the rounded average of the `width` pixels following topleft
+        dup             v31.8h,  w7   // bitdepth_max
+        clz             w9,  w3 // clz(width): 26 for 32 ... 29 for 4
+        adr             x7,  L(ipred_cfl_top_tbl)
+        sub             w9,  w9,  #26 // jump table index: 0 (w=32) .. 3 (w=4)
+        ldrh            w9,  [x7, w9, uxtw #1] // 16-bit distance from the table back to the handler
+        dup             v1.8h,   w6   // alpha
+        add             x2,  x2,  #2 // step over the topleft pixel (2 bytes) to the first top pixel
+        sub             x7,  x7,  w9, uxtw // handler address = table address - distance
+        add             x6,  x0,  x1 // x6 = second destination row
+        lsl             x1,  x1,  #1 // doubled stride, as the splat routines expect
+        movi            v30.8h,  #0 // lower clamp bound used by the splat routines
+        br              x7
+4:
+        ld1             {v0.4h},  [x2] // 4 top pixels
+        addv            h0,      v0.4h // 16-bit sum of the 4 pixels
+        urshr           v0.4h,   v0.4h,   #2 // rounded divide by 4
+        dup             v0.8h,   v0.h[0] // broadcast dc to all lanes
+        b               L(ipred_cfl_splat_w4) // label defined in ipred_cfl_128_16bpc_neon
+8:
+        ld1             {v0.8h},  [x2] // 8 top pixels
+        addv            h0,      v0.8h // 16-bit sum of the 8 pixels
+        urshr           v0.4h,   v0.4h,   #3 // rounded divide by 8
+        dup             v0.8h,   v0.h[0] // broadcast dc
+        b               L(ipred_cfl_splat_w8)
+16:
+        ld1             {v2.8h, v3.8h}, [x2] // 16 top pixels
+        addp            v0.8h,   v2.8h,   v3.8h // pairwise sums: 16 values -> 8
+        addv            h0,      v0.8h // 16-bit total
+        urshr           v0.4h,   v0.4h,   #4 // rounded divide by 16
+        dup             v0.8h,   v0.h[0] // broadcast dc
+        b               L(ipred_cfl_splat_w16)
+32:
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] // 32 top pixels
+        addp            v2.8h,   v2.8h,   v3.8h // pairwise sums: 32 values -> 16
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v0.8h,   v2.8h,   v4.8h // 16 -> 8 partial sums
+        uaddlv          s0,      v0.8h // widening sum into 32 bits (presumably because 32 pixels can overflow 16 bits)
+        rshrn           v0.4h,   v0.4s,   #5 // rounded divide by 32, narrowed back to 16 bits
+        dup             v0.8h,   v0.h[0] // broadcast dc
+        b               L(ipred_cfl_splat_w16) // the w16 routine also covers w = 32
+
+L(ipred_cfl_top_tbl):
+        .hword L(ipred_cfl_top_tbl) - 32b // index 0: w = 32
+        .hword L(ipred_cfl_top_tbl) - 16b // index 1: w = 16
+        .hword L(ipred_cfl_top_tbl) -  8b // index 2: w = 8
+        .hword L(ipred_cfl_top_tbl) -  4b // index 3: w = 4
+endfunc
+
+// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                                const pixel *const topleft,
+//                                const int width, const int height,
+//                                const int16_t *ac, const int alpha,
+//                                const int bitdepth_max);
+function ipred_cfl_left_16bpc_neon, export=1 // CfL prediction whose dc is the rounded average of the `height` pixels stored just before topleft
+        dup             v31.8h,  w7   // bitdepth_max
+        sub             x2,  x2,  w4, uxtw #1 // x2 = topleft - height * 2 bytes (presumably the start of the left-edge pixels)
+        clz             w9,  w3 // clz(width)
+        clz             w8,  w4 // clz(height)
+        adr             x10, L(ipred_cfl_splat_tbl) // splat table defined in ipred_cfl_128_16bpc_neon
+        adr             x7,  L(ipred_cfl_left_tbl)
+        sub             w9,  w9,  #26 // splat table index from width: 0 (32) .. 3 (4)
+        sub             w8,  w8,  #26 // left table index from height: 0 (32) .. 3 (4)
+        ldrh            w9,  [x10, w9, uxtw #1]
+        ldrh            w8,  [x7,  w8, uxtw #1]
+        dup             v1.8h,   w6   // alpha
+        sub             x9,  x10, w9, uxtw // x9 = splat routine for this width, reached via br x9 below
+        sub             x7,  x7,  w8, uxtw // x7 = dc routine for this height
+        add             x6,  x0,  x1 // x6 = second destination row
+        lsl             x1,  x1,  #1 // doubled stride, as the splat routines expect
+        movi            v30.8h,  #0 // lower clamp bound used by the splat routines
+        br              x7
+
+L(ipred_cfl_left_h4):
+        ld1             {v0.4h},  [x2] // 4 pixels
+        addv            h0,      v0.4h // 16-bit sum
+        urshr           v0.4h,   v0.4h,   #2 // rounded divide by 4
+        dup             v0.8h,   v0.h[0] // broadcast dc to all lanes
+        br              x9 // continue in the width-specific splat routine
+
+L(ipred_cfl_left_h8):
+        ld1             {v0.8h},  [x2] // 8 pixels
+        addv            h0,      v0.8h // 16-bit sum
+        urshr           v0.4h,   v0.4h,   #3 // rounded divide by 8
+        dup             v0.8h,   v0.h[0] // broadcast dc
+        br              x9
+
+L(ipred_cfl_left_h16):
+        ld1             {v2.8h, v3.8h}, [x2] // 16 pixels
+        addp            v0.8h,   v2.8h,   v3.8h // pairwise sums: 16 values -> 8
+        addv            h0,      v0.8h // 16-bit total
+        urshr           v0.4h,   v0.4h,   #4 // rounded divide by 16
+        dup             v0.8h,   v0.h[0] // broadcast dc
+        br              x9
+
+L(ipred_cfl_left_h32):
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] // 32 pixels
+        addp            v2.8h,   v2.8h,   v3.8h // pairwise sums: 32 values -> 16
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v0.8h,   v2.8h,   v4.8h // 16 -> 8 partial sums
+        uaddlv          s0,      v0.8h // widening sum into 32 bits (presumably because 32 pixels can overflow 16 bits)
+        rshrn           v0.4h,   v0.4s,   #5 // rounded divide by 32, narrowed back to 16 bits
+        dup             v0.8h,   v0.h[0] // broadcast dc
+        br              x9
+
+L(ipred_cfl_left_tbl):
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) // index 0: h = 32
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) // index 1: h = 16
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) // index 2: h = 8
+        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) // index 3: h = 4
+endfunc
+
+// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
+//                           const pixel *const topleft,
+//                           const int width, const int height,
+//                           const int16_t *ac, const int alpha,
+//                           const int bitdepth_max);
+function ipred_cfl_16bpc_neon, export=1 // CfL prediction whose dc is the rounded average of the `height` pixels before topleft plus the `width` pixels after it
+        dup             v31.8h,  w7              // bitdepth_max
+        sub             x2,  x2,  w4, uxtw #1    // x2 = topleft - height * 2 bytes (presumably the start of the left-edge pixels)
+        add             w8,  w3,  w4             // width + height
+        dup             v1.8h,   w6              // alpha
+        clz             w9,  w3                  // clz(width)
+        clz             w6,  w4                  // clz(height); w6 is free because alpha is already in v1
+        dup             v16.4s, w8               // width + height
+        adr             x7,  L(ipred_cfl_tbl)
+        rbit            w8,  w8                  // rbit(width + height)
+        sub             w9,  w9,  #22            // 22 leading bits, minus table offset 4
+        sub             w6,  w6,  #26            // table index 0..3 selects the h32..h4 entries
+        clz             w8,  w8                  // ctz(width + height)
+        ldrh            w9,  [x7, w9, uxtw #1]   // distance to the width handler (entries 4..7)
+        ldrh            w6,  [x7, w6, uxtw #1]   // distance to the height handler (entries 0..3)
+        neg             w8,  w8                  // -ctz(width + height)
+        sub             x9,  x7,  w9, uxtw       // x9 = width handler, entered via br x9 from the height handler
+        sub             x7,  x7,  w6, uxtw       // x7 = height handler, entered first
+        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
+        dup             v17.4s,  w8              // -ctz(width + height)
+        add             x6,  x0,  x1             // x6 = second destination row
+        lsl             x1,  x1,  #1             // doubled stride, as the splat routines expect
+        movi            v30.8h,  #0              // lower clamp bound used by the splat routines
+        br              x7
+
+L(ipred_cfl_h4):
+        ld1             {v0.4h},  [x2], #8 // 4 pixels before topleft; x2 ends up pointing at topleft
+        uaddlv          s0,      v0.4h // 32-bit sum of those pixels
+        br              x9 // continue in the width handler
+L(ipred_cfl_w4):
+        add             x2,  x2,  #2 // step over the topleft pixel
+        ld1             {v2.4h},  [x2] // 4 pixels after topleft
+        add             v0.2s,   v0.2s,   v16.2s // add the rounding term (width + height) >> 1
+        uaddlv          s2,      v2.4h // 32-bit sum of the 4 pixels
+        cmp             w4,  #4 // square block? flags are consumed by b.eq below
+        add             v0.2s,   v0.2s,   v2.2s // total sum + rounding
+        ushl            v0.2s,   v0.2s,   v17.2s // negative shift count = right shift by ctz(width + height)
+        b.eq            1f // width == height: the shift alone completed the division
+        // h = 8/16
+        cmp             w4,  #16
+        mov             w16, #0x6667 // ~ 2^17 / 5
+        mov             w17, #0xAAAB // ~ 2^17 / 3
+        csel            w16, w16, w17, eq // h = 16: 4 + 16 = 4 * 5, divide by 5; h = 8: 4 + 8 = 4 * 3, divide by 3
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s // multiply by the fixed-point reciprocal
+        ushr            v0.2s,   v0.2s,   #17 // drop the 17 fractional bits
+1:
+        dup             v0.8h,   v0.h[0] // broadcast dc to all lanes
+        b               L(ipred_cfl_splat_w4) // label defined in ipred_cfl_128_16bpc_neon
+
+L(ipred_cfl_h8):
+        ld1             {v0.8h},  [x2], #16 // 8 pixels before topleft; x2 ends up pointing at topleft
+        uaddlv          s0,      v0.8h // 32-bit sum
+        br              x9 // continue in the width handler
+L(ipred_cfl_w8):
+        add             x2,  x2,  #2 // step over the topleft pixel
+        ld1             {v2.8h},  [x2] // 8 pixels after topleft
+        add             v0.2s,   v0.2s,   v16.2s // add the rounding term (width + height) >> 1
+        uaddlv          s2,      v2.8h // 32-bit sum of the 8 pixels
+        cmp             w4,  #8 // square block?
+        add             v0.2s,   v0.2s,   v2.2s // total sum + rounding
+        ushl            v0.2s,   v0.2s,   v17.2s // right shift by ctz(width + height)
+        b.eq            1f // width == height: done
+        // h = 4/16/32
+        cmp             w4,  #32
+        mov             w16, #0x6667 // ~ 2^17 / 5
+        mov             w17, #0xAAAB // ~ 2^17 / 3
+        csel            w16, w16, w17, eq // h = 32: 8 + 32 = 8 * 5, divide by 5; h = 4 or 16: remaining factor is 3
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s // multiply by the fixed-point reciprocal
+        ushr            v0.2s,   v0.2s,   #17 // drop the 17 fractional bits
+1:
+        dup             v0.8h,   v0.h[0] // broadcast dc
+        b               L(ipred_cfl_splat_w8)
+
+L(ipred_cfl_h16):
+        ld1             {v2.8h, v3.8h}, [x2], #32 // 16 pixels before topleft; x2 ends up pointing at topleft
+        addp            v0.8h,   v2.8h,   v3.8h // pairwise sums: 16 values -> 8
+        uaddlv          s0,      v0.8h // 32-bit sum
+        br              x9 // continue in the width handler
+L(ipred_cfl_w16):
+        add             x2,  x2,  #2 // step over the topleft pixel
+        ld1             {v2.8h, v3.8h}, [x2] // 16 pixels after topleft
+        add             v0.2s,   v0.2s,   v16.2s // add the rounding term (width + height) >> 1
+        addp            v2.8h,   v2.8h,   v3.8h // pairwise sums: 16 values -> 8
+        uaddlv          s2,      v2.8h // 32-bit sum
+        cmp             w4,  #16 // square block?
+        add             v0.2s,   v0.2s,   v2.2s // total sum + rounding
+        ushl            v0.2s,   v0.2s,   v17.2s // right shift by ctz(width + height)
+        b.eq            1f // width == height: done
+        // h = 4/8/32
+        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
+        mov             w16, #0x6667 // ~ 2^17 / 5
+        mov             w17, #0xAAAB // ~ 2^17 / 3
+        csel            w16, w16, w17, eq // eq = none of the tested bits set, i.e. h = 4: 16 + 4 = 4 * 5, divide by 5; otherwise divide by 3
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s // multiply by the fixed-point reciprocal
+        ushr            v0.2s,   v0.2s,   #17 // drop the 17 fractional bits
+1:
+        dup             v0.8h,   v0.h[0] // broadcast dc
+        b               L(ipred_cfl_splat_w16)
+
+L(ipred_cfl_h32):
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64 // 32 pixels before topleft; x2 ends up pointing at topleft
+        addp            v2.8h,   v2.8h,   v3.8h // pairwise sums: 32 values -> 16
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v0.8h,   v2.8h,   v4.8h // 16 -> 8 partial sums
+        uaddlv          s0,      v0.8h // 32-bit sum
+        br              x9 // continue in the width handler
+L(ipred_cfl_w32):
+        add             x2,  x2,  #2 // step over the topleft pixel
+        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2] // 32 pixels after topleft
+        add             v0.4s,   v0.4s,   v16.4s // add the rounding term (only the low lane is used afterwards)
+        addp            v2.8h,   v2.8h,   v3.8h // pairwise sums: 32 values -> 16
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v2.8h,   v2.8h,   v4.8h // 16 -> 8 partial sums
+        cmp             w4,  #32 // square block? uaddlv/add/ushl below leave the flags intact for b.eq
+        uaddlv          s2,      v2.8h // 32-bit sum
+        add             v0.2s,   v0.2s,   v2.2s // total sum + rounding
+        ushl            v0.2s,   v0.2s,   v17.2s // right shift by ctz(width + height)
+        b.eq            1f // width == height: done
+        // h = 8/16
+        cmp             w4,  #8
+        mov             w16, #0x6667 // ~ 2^17 / 5
+        mov             w17, #0xAAAB // ~ 2^17 / 3
+        csel            w16, w16, w17, eq // h = 8: 32 + 8 = 8 * 5, divide by 5; h = 16: 32 + 16 = 16 * 3, divide by 3
+        dup             v16.2s,  w16
+        mul             v0.2s,   v0.2s,   v16.2s // multiply by the fixed-point reciprocal
+        ushr            v0.2s,   v0.2s,   #17 // drop the 17 fractional bits
+1:
+        dup             v0.8h,   v0.h[0] // broadcast dc
+        b               L(ipred_cfl_splat_w16) // the w16 routine also covers w = 32
+
+L(ipred_cfl_tbl):
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) // index 0: h = 32 (clz(height) - 26)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) // index 1: h = 16
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) // index 2: h = 8
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) // index 3: h = 4
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) // index 4: w = 32 (clz(width) - 22)
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) // index 5: w = 16
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) // index 6: w = 8
+        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) // index 7: w = 4
+endfunc
+
+// void ipred_cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
+//                                  const ptrdiff_t stride, const int w_pad,
+//                                  const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_420_16bpc_neon, export=1
+        clz             w8,  w5
+        lsl             w4,  w4,  #2
+        adr             x7,  L(ipred_cfl_ac_420_tbl)
+        sub             w8,  w8,  #27
+        ldrh            w8,  [x7, w8, uxtw #1]
+        movi            v24.4s,  #0
+        movi            v25.4s,  #0
+        movi            v26.4s,  #0
+        movi            v27.4s,  #0
+        sub             x7,  x7,  w8, uxtw
+        sub             w8,  w6,  w4         // height - h_pad
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7
+
+L(ipred_cfl_ac_420_w4):
+1:      // Copy and subsample input
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        add             v0.8h,   v0.8h,   v1.8h
+        shl             v0.8h,   v0.8h,   #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h}, [x0], #16
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        b.gt            1b
+        trn2            v1.2d,   v0.2d,   v0.2d
+        trn2            v0.2d,   v0.2d,   v0.2d
+L(ipred_cfl_ac_420_w4_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            2b
+3:
+L(ipred_cfl_ac_420_w4_calc_subtract_dc):
+        // Aggregate the sums
+        add             v24.4s,  v24.4s,  v25.4s
+        add             v26.4s,  v26.4s,  v27.4s
+        add             v0.4s,   v24.4s,  v26.4s
+        addv            s0,  v0.4s                // sum
+        sub             x0,  x0,  w6, uxtw #3
+        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1)))  >>= log2sz
+        dup             v4.8h,   v4.h[0]
+6:      // Subtract dc from ac
+        ld1             {v0.8h, v1.8h}, [x0]
+        subs            w6,  w6,  #4
+        sub             v0.8h,   v0.8h,   v4.8h
+        sub             v1.8h,   v1.8h,   v4.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        b.gt            6b
+        ret
+
+L(ipred_cfl_ac_420_w8):
+        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        ld1             {v4.8h, v5.8h}, [x1],  x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v6.8h, v7.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v4.8h,   v4.8h,   v6.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v1.8h,   v4.8h,   #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            1b
+        mov             v0.16b,  v1.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_420_w8_wpad):
+1:      // Copy and subsample input, padding 4
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v2.8h
+        addp            v1.8h,   v1.8h,   v3.8h
+        add             v0.8h,   v0.8h,   v1.8h
+        shl             v0.8h,   v0.8h,   #1
+        dup             v1.4h,   v0.h[3]
+        dup             v3.4h,   v0.h[7]
+        trn2            v2.2d,   v0.2d,   v0.2d
+        subs            w8,  w8,  #2
+        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw           v25.4s,  v25.4s,  v1.4h
+        uaddw           v26.4s,  v26.4s,  v2.4h
+        uaddw           v27.4s,  v27.4s,  v3.4h
+        b.gt            1b
+        trn1            v0.2d,   v2.2d,   v3.2d
+        trn1            v1.2d,   v2.2d,   v3.2d
+
+L(ipred_cfl_ac_420_w8_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            2b
+3:
+
+        // Double the height and reuse the w4 summing/subtracting
+        lsl             w6,  w6,  #1
+        lsl             w9,  w9,  #1
+        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_w16):
+        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
+        ldrh            w3,  [x7, w3, uxtw #1]
+        sub             x7,  x7,  w3, uxtw
+        br              x7
+
+L(ipred_cfl_ac_420_w16_wpad0):
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
+        add             v0.8h,   v0.8h,   v4.8h
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
+        add             v2.8h,   v2.8h,   v6.8h
+        addp            v16.8h,  v16.8h,  v17.8h
+        addp            v18.8h,  v18.8h,  v19.8h
+        addp            v20.8h,  v20.8h,  v21.8h
+        addp            v22.8h,  v22.8h,  v23.8h
+        add             v16.8h,  v16.8h,  v20.8h
+        add             v18.8h,  v18.8h,  v22.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v1.8h,   v2.8h,   #1
+        shl             v2.8h,   v16.8h,  #1
+        shl             v3.8h,   v18.8h,  #1
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad1):
+1:      // Copy and subsample input, padding 4
+        ldr             q2,  [x1,  #32]
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ldr             q5,  [x10, #32]
+        ld1             {v3.8h, v4.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v2.8h
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v5.8h,   v5.8h,   v5.8h
+        addp            v3.8h,   v3.8h,   v4.8h
+        ldr             q18, [x1,  #32]
+        add             v2.4h,   v2.4h,   v5.4h
+        ld1             {v16.8h, v17.8h}, [x1],  x2
+        add             v0.8h,   v0.8h,   v3.8h
+        ldr             q21, [x10, #32]
+        ld1             {v19.8h, v20.8h}, [x10], x2
+        addp            v18.8h,  v18.8h,  v18.8h
+        addp            v16.8h,  v16.8h,  v17.8h
+        addp            v21.8h,  v21.8h,  v21.8h
+        addp            v19.8h,  v19.8h,  v20.8h
+        add             v18.4h,  v18.4h,  v21.4h
+        add             v16.8h,  v16.8h,  v19.8h
+        shl             v1.4h,   v2.4h,   #1
+        shl             v0.8h,   v0.8h,   #1
+        shl             v3.4h,   v18.4h,  #1
+        shl             v2.8h,   v16.8h,  #1
+        dup             v4.4h,   v1.h[3]
+        dup             v5.4h,   v3.h[3]
+        trn1            v1.2d,   v1.2d,   v4.2d
+        trn1            v3.2d,   v3.2d,   v5.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad2):
+1:      // Copy and subsample input, padding 8
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        ld1             {v4.8h, v5.8h}, [x1],  x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v6.8h, v7.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        add             v4.8h,   v4.8h,   v6.8h
+        shl             v0.8h,   v0.8h,   #1
+        shl             v2.8h,   v4.8h,   #1
+        dup             v1.8h,   v0.h[7]
+        dup             v3.8h,   v2.h[7]
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_wpad3):
+1:      // Copy and subsample input, padding 12
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v2.8h}, [x10], x2
+        ld1             {v4.8h}, [x1],  x2
+        ld1             {v6.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v4.8h
+        addp            v2.8h,   v2.8h,   v6.8h
+        add             v0.8h,   v0.8h,   v2.8h
+        shl             v0.8h,   v0.8h,   #1
+        dup             v1.8h,   v0.h[3]
+        dup             v3.8h,   v0.h[7]
+        trn2            v2.2d,   v0.2d,   v3.2d
+        trn1            v0.2d,   v0.2d,   v1.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_420_w16_hpad):
+        cbz             w4,  3f
+2:      // Vertical padding (h_pad > 0)
+        subs            w4,  w4,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            2b
+3:
+
+        // Quadruple the height and reuse the w4 summing/subtracting
+        lsl             w6,  w6,  #2
+        lsl             w9,  w9,  #2
+        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+
+L(ipred_cfl_ac_420_tbl):
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
+        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
+        .hword 0
+
+L(ipred_cfl_ac_420_w16_tbl):
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
+        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+endfunc
+
+// Fills ac with 4 * (horizontal pair sums), pads the right/bottom edges, then subtracts the rounded average.
+// void ipred_cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx, const ptrdiff_t stride,
+//                                  const int w_pad, const int h_pad, const int cw, const int ch);
+function ipred_cfl_ac_422_16bpc_neon, export=1
+        clz             w8,  w5              // In: x0=ac x1=ypx x2=stride w3=w_pad w4=h_pad w5=cw w6=ch
+        lsl             w4,  w4,  #2         // h_pad *= 4, same unit as ch (rows) below
+        adr             x7,  L(ipred_cfl_ac_422_tbl)
+        sub             w8,  w8,  #27        // clz(cw) - 27: table index 0/1/2 for cw = 16/8/4
+        ldrh            w8,  [x7, w8, uxtw #1] // 16-bit entry = table address - target address
+        movi            v24.4s,  #0          // v24-v27: 32-bit accumulators for the sum of all ac values
+        movi            v25.4s,  #0
+        movi            v26.4s,  #0
+        movi            v27.4s,  #0
+        sub             x7,  x7,  w8, uxtw   // x7 = address of the selected width path
+        sub             w8,  w6,  w4         // height - h_pad
+        rbit            w9,  w5              // rbit(width)
+        rbit            w10, w6              // rbit(height)
+        clz             w9,  w9              // ctz(width)
+        clz             w10, w10             // ctz(height)
+        add             w9,  w9,  w10        // log2sz
+        add             x10, x1,  x2         // x10 = second input row (ypx + stride)
+        dup             v31.4s,  w9
+        lsl             x2,  x2,  #1         // stride *= 2: x1 walks even rows, x10 walks odd rows
+        neg             v31.4s,  v31.4s      // -log2sz
+        br              x7
+
+L(ipred_cfl_ac_422_w4):
+1:      // Copy and subsample input
+        ld1             {v0.8h}, [x1],  x2   // input row 0 (8 pixels)
+        ld1             {v1.8h}, [x10], x2   // input row 1
+        ld1             {v2.8h}, [x1],  x2   // input row 2
+        ld1             {v3.8h}, [x10], x2   // input row 3
+        addp            v0.8h,   v0.8h,   v1.8h // pair sums: output row 0 | output row 1
+        addp            v2.8h,   v2.8h,   v3.8h // pair sums: output row 2 | output row 3
+        shl             v0.8h,   v0.8h,   #2    // scale the pair sum by 4
+        shl             v1.8h,   v2.8h,   #2
+        subs            w8,  w8,  #4            // four output rows per iteration
+        st1             {v0.8h, v1.8h}, [x0], #32
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        b.gt            1b
+        trn2            v0.2d,   v1.2d,   v1.2d // v0 = last output row, in both halves
+        trn2            v1.2d,   v1.2d,   v1.2d // v1 = the same, so v0/v1 hold four copies for hpad
+        b               L(ipred_cfl_ac_420_w4_hpad) // padding and dc subtraction are shared with the 420 function
+
+L(ipred_cfl_ac_422_w8):
+        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad) // any w_pad != 0 takes the padded path
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h}, [x1],  x2 // input row 0 (16 pixels)
+        ld1             {v2.8h, v3.8h}, [x10], x2 // input row 1
+        ld1             {v4.8h, v5.8h}, [x1],  x2 // input row 2
+        addp            v0.8h,   v0.8h,   v1.8h
+        ld1             {v6.8h, v7.8h}, [x10], x2 // input row 3
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        shl             v0.8h,   v0.8h,   #2      // v0..v3 = output rows 0..3
+        shl             v1.8h,   v2.8h,   #2
+        shl             v2.8h,   v4.8h,   #2
+        shl             v3.8h,   v6.8h,   #2
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b           // v0 = v1 = last output row, as hpad expects
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w8_wpad):
+1:      // Copy and subsample input, padding 4
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v1.8h}, [x10], x2
+        ld1             {v2.8h}, [x1],  x2
+        ld1             {v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h   // row 0 (4 values) | row 1 (4 values)
+        addp            v2.8h,   v2.8h,   v3.8h   // row 2 (4 values) | row 3 (4 values)
+        shl             v0.8h,   v0.8h,   #2
+        shl             v2.8h,   v2.8h,   #2
+        dup             v4.4h,   v0.h[3]          // padding for row 0
+        dup             v5.8h,   v0.h[7]          // padding for row 1
+        dup             v6.4h,   v2.h[3]          // padding for row 2
+        dup             v7.8h,   v2.h[7]          // padding for row 3
+        trn2            v1.2d,   v0.2d,   v5.2d   // row 1: values | padding (must read v0 before it changes)
+        trn1            v0.2d,   v0.2d,   v4.2d   // row 0: values | padding
+        trn2            v3.2d,   v2.2d,   v7.2d   // row 3: values | padding (must read v2 before it changes)
+        trn1            v2.2d,   v2.2d,   v6.2d   // row 2: values | padding
+        subs            w8,  w8,  #4
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v3.16b           // v0 = v1 = last output row, as hpad expects
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w8_hpad)
+
+L(ipred_cfl_ac_422_w16):
+        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
+        ldrh            w3,  [x7, w3, uxtw #1]    // second-level dispatch, indexed by w_pad (table has 4 entries)
+        sub             x7,  x7,  w3, uxtw
+        br              x7
+
+L(ipred_cfl_ac_422_w16_wpad0):
+1:      // Copy and subsample input, without padding
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2 // input row 0 (32 pixels)
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2 // input row 1
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        addp            v6.8h,   v6.8h,   v7.8h
+        shl             v0.8h,   v0.8h,   #2      // v0, v1 = output row 0
+        shl             v1.8h,   v2.8h,   #2
+        shl             v2.8h,   v4.8h,   #2      // v2, v3 = output row 1
+        shl             v3.8h,   v6.8h,   #2
+        subs            w8,  w8,  #2              // two output rows per iteration
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b           // v0-v3 = last output row twice, as hpad expects
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad1):
+1:      // Copy and subsample input, padding 4
+        ldr             q2,  [x1,  #32]           // halfwords 16..23 of the row, read before x1 is post-incremented
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ldr             q6,  [x10, #32]
+        ld1             {v4.8h, v5.8h}, [x10], x2
+        addp            v2.8h,   v2.8h,   v2.8h
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v6.8h,   v6.8h,   v6.8h
+        addp            v4.8h,   v4.8h,   v5.8h
+        shl             v1.4h,   v2.4h,   #2      // output row 0, values 8..11
+        shl             v0.8h,   v0.8h,   #2      // output row 0, values 0..7
+        shl             v3.4h,   v6.4h,   #2      // output row 1, values 8..11
+        shl             v2.8h,   v4.8h,   #2      // output row 1, values 0..7
+        dup             v4.4h,   v1.h[3]          // padding: last computed value of row 0
+        dup             v5.4h,   v3.h[3]          // padding: last computed value of row 1
+        trn1            v1.2d,   v1.2d,   v4.2d   // values 8..11 | 4 padded values
+        trn1            v3.2d,   v3.2d,   v5.2d
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b           // v0-v3 = last output row twice, as hpad expects
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad2):
+1:      // Copy and subsample input, padding 8
+        ld1             {v0.8h, v1.8h}, [x1],  x2
+        ld1             {v2.8h, v3.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v1.8h
+        addp            v2.8h,   v2.8h,   v3.8h
+        shl             v0.8h,   v0.8h,   #2      // output row 0, values 0..7
+        shl             v2.8h,   v2.8h,   #2      // output row 1, values 0..7
+        dup             v1.8h,   v0.h[7]          // 8 padded values for row 0
+        dup             v3.8h,   v2.h[7]          // 8 padded values for row 1
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b           // v0-v3 = last output row twice, as hpad expects
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_w16_wpad3):
+1:      // Copy and subsample input, padding 12
+        ld1             {v0.8h}, [x1],  x2
+        ld1             {v2.8h}, [x10], x2
+        addp            v0.8h,   v0.8h,   v0.8h   // 4 pair sums for row 0 (duplicated in both halves)
+        addp            v2.8h,   v2.8h,   v2.8h   // 4 pair sums for row 1 (duplicated in both halves)
+        shl             v0.4h,   v0.4h,   #2
+        shl             v2.4h,   v2.4h,   #2
+        dup             v1.8h,   v0.h[3]          // padding for row 0
+        dup             v3.8h,   v2.h[3]          // padding for row 1
+        trn1            v0.2d,   v0.2d,   v1.2d   // row 0: 4 values | 4 padded
+        trn1            v2.2d,   v2.2d,   v3.2d   // row 1: 4 values | 4 padded
+        subs            w8,  w8,  #2
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        uaddw           v24.4s,  v24.4s,  v0.4h
+        uaddw2          v25.4s,  v25.4s,  v0.8h
+        uaddw           v26.4s,  v26.4s,  v1.4h
+        uaddw2          v27.4s,  v27.4s,  v1.8h
+        uaddw           v24.4s,  v24.4s,  v2.4h
+        uaddw2          v25.4s,  v25.4s,  v2.8h
+        uaddw           v26.4s,  v26.4s,  v3.4h
+        uaddw2          v27.4s,  v27.4s,  v3.8h
+        b.gt            1b
+        mov             v0.16b,  v2.16b           // v0-v3 = last output row twice, as hpad expects
+        mov             v1.16b,  v3.16b
+        b               L(ipred_cfl_ac_420_w16_hpad)
+
+L(ipred_cfl_ac_422_tbl):
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) // index 0: cw = 16
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)  // index 1: cw = 8
+        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)  // index 2: cw = 4
+        .hword 0                                                 // unused filler entry
+
+L(ipred_cfl_ac_422_w16_tbl):
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) // w_pad = 0
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) // w_pad = 1
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) // w_pad = 2
+        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) // w_pad = 3
+endfunc
diff --git a/ffmpeg/JNI/dav1d/src/arm/64/itx.S b/ffmpeg/JNI/dav1d/src/arm/64/itx.S
index b6c0c14aa..245af0e78 100644
--- a/ffmpeg/JNI/dav1d/src/arm/64/itx.S
+++ b/ffmpeg/JNI/dav1d/src/arm/64/itx.S
@@ -58,7 +58,6 @@
 //   indicates only a quarter of input values are set, for idct16 and up,
 //   a significant amount of calculation can be skipped, at the cost of more
 //   code duplication and special casing.
-// - Special case functions for e.g. more combinations with identity.
 
 const idct_coeffs, align=4
         // idct4
@@ -106,7 +105,7 @@ const iadst8_coeffs, align=4
         .short          4076, 401, 3612, 1931
         .short          2598, 3166, 1189, 3920
         // idct_coeffs
-        .short          2896, 2896*8, 1567, 3784, 0, 0, 0, 0
+        .short          2896, 0, 1567, 3784, 0, 0, 0, 0
 endconst
 
 const iadst16_coeffs, align=4
@@ -134,13 +133,6 @@ endconst
 .endif
 .endm
 
-.macro smull_sz d0, d1, s0, c, sz
-        smull           \d0\().4s, \s0\().4h, \c
-.ifc \sz, .8h
-        smull2          \d1\().4s, \s0\().8h, \c
-.endif
-.endm
-
 .macro rshrn_sz d0, s0, s1, shift, sz
         rshrn           \d0\().4h, \s0\().4s, \shift
 .ifc \sz, .8h
@@ -457,14 +449,14 @@ endfunc
         sqsub           \r2\sz,  v3\sz,   v7\sz
 .endm
 
-function inv_dct_4x4_neon
+function inv_dct_4h_x4_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.4h}, [x16]
         idct_4          v16, v17, v18, v19, .4h
         ret
 endfunc
 
-function inv_dct_8x4_neon
+function inv_dct_8h_x4_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.4h}, [x16]
         idct_4          v16, v17, v18, v19, .8h
@@ -497,12 +489,12 @@ endfunc
         rshrn           \o3\().4h, \o3\().4s, #12
 .endm
 
-function inv_adst_4x4_neon
+function inv_adst_4h_x4_neon, export=1
         iadst_4x4       v16, v17, v18, v19
         ret
 endfunc
 
-function inv_flipadst_4x4_neon
+function inv_flipadst_4h_x4_neon, export=1
         iadst_4x4       v19, v18, v17, v16
         ret
 endfunc
@@ -563,17 +555,17 @@ endfunc
         rshrn2          \o3\().8h, v5.4s,  #12
 .endm
 
-function inv_adst_8x4_neon
+function inv_adst_8h_x4_neon, export=1
         iadst_8x4       v16, v17, v18, v19
         ret
 endfunc
 
-function inv_flipadst_8x4_neon
+function inv_flipadst_8h_x4_neon, export=1
         iadst_8x4       v19, v18, v17, v16
         ret
 endfunc
 
-function inv_identity_4x4_neon
+function inv_identity_4h_x4_neon, export=1
         mov             w16, #(5793-4096)*8
         dup             v0.4h,   w16
         sqrdmulh        v4.4h,   v16.4h,  v0.h[0]
@@ -587,7 +579,7 @@ function inv_identity_4x4_neon
         ret
 endfunc
 
-function inv_identity_8x4_neon
+function inv_identity_8h_x4_neon, export=1
         mov             w16, #(5793-4096)*8
         dup             v0.4h,   w16
         sqrdmulh        v4.8h,   v16.8h,  v0.h[0]
@@ -608,7 +600,7 @@ endfunc
 .endr
 .endm
 
-function inv_txfm_add_wht_wht_4x4_neon, export=1
+function inv_txfm_add_wht_wht_4x4_8bpc_neon, export=1
         mov             x15, x30
         movi            v31.8h,  #0
         ld1             {v16.4h,v17.4h,v18.4h,v19.4h}, [x2]
@@ -672,7 +664,7 @@ L(itx_4x4_end):
 endfunc
 
 .macro def_fn_4x4 txfm1, txfm2
-function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_neon, export=1
         mov             x15, x30
 
 .ifc \txfm1\()_\txfm2, dct_dct
@@ -692,8 +684,8 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_neon, export=1
         b               L(itx_4x4_end)
 1:
 .endif
-        adr             x4,  inv_\txfm1\()_4x4_neon
-        adr             x5,  inv_\txfm2\()_4x4_neon
+        adr             x4,  inv_\txfm1\()_4h_x4_neon
+        adr             x5,  inv_\txfm2\()_4h_x4_neon
         b               inv_txfm_add_4x4_neon
 endfunc
 .endm
@@ -749,14 +741,14 @@ def_fn_4x4 identity, flipadst
         mov             \r6\szb, v6\szb         // out6
 .endm
 
-function inv_dct_8x8_neon
+function inv_dct_8h_x8_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.8h}, [x16]
         idct_8          v16, v17, v18, v19, v20, v21, v22, v23, .8h, .16b
         ret
 endfunc
 
-function inv_dct_4x8_neon
+function inv_dct_4h_x8_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.8h}, [x16]
         idct_8          v16, v17, v18, v19, v20, v21, v22, v23, .4h, .8b
@@ -830,27 +822,27 @@ endfunc
         sqneg           \o5\()\sz, v3\sz     // out5
 .endm
 
-function inv_adst_8x8_neon
+function inv_adst_8h_x8_neon, export=1
         iadst_8         v16, v17, v18, v19, v20, v21, v22, v23, .8h
         ret
 endfunc
 
-function inv_flipadst_8x8_neon
+function inv_flipadst_8h_x8_neon, export=1
         iadst_8         v23, v22, v21, v20, v19, v18, v17, v16, .8h
         ret
 endfunc
 
-function inv_adst_4x8_neon
+function inv_adst_4h_x8_neon, export=1
         iadst_8         v16, v17, v18, v19, v20, v21, v22, v23, .4h
         ret
 endfunc
 
-function inv_flipadst_4x8_neon
+function inv_flipadst_4h_x8_neon, export=1
         iadst_8         v23, v22, v21, v20, v19, v18, v17, v16, .4h
         ret
 endfunc
 
-function inv_identity_8x8_neon
+function inv_identity_8h_x8_neon, export=1
         sqshl           v16.8h,  v16.8h,  #1
         sqshl           v17.8h,  v17.8h,  #1
         sqshl           v18.8h,  v18.8h,  #1
@@ -862,7 +854,7 @@ function inv_identity_8x8_neon
         ret
 endfunc
 
-function inv_identity_4x8_neon
+function inv_identity_4h_x8_neon, export=1
         sqshl           v16.4h,  v16.4h,  #1
         sqshl           v17.4h,  v17.4h,  #1
         sqshl           v18.4h,  v18.4h,  #1
@@ -913,17 +905,17 @@ def_fn_8x8_base
 def_fn_8x8_base identity_
 
 .macro def_fn_8x8 txfm1, txfm2
-function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_neon, export=1
         mov             x15, x30
 
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         8,   8,   1
 .endif
-        adr             x5,  inv_\txfm2\()_8x8_neon
+        adr             x5,  inv_\txfm2\()_8h_x8_neon
 .ifc \txfm1, identity
         b               inv_txfm_identity_add_8x8_neon
 .else
-        adr             x4,  inv_\txfm1\()_8x8_neon
+        adr             x4,  inv_\txfm1\()_8h_x8_neon
         b               inv_txfm_add_8x8_neon
 .endif
 endfunc
@@ -1000,14 +992,14 @@ function inv_txfm_add_4x8_neon
 endfunc
 
 .macro def_fn_48 w, h, txfm1, txfm2
-function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
         mov             x15, x30
 
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         \w,  \h,  0
 .endif
-        adr             x4,  inv_\txfm1\()_\h\()x\w\()_neon
-        adr             x5,  inv_\txfm2\()_\w\()x\h\()_neon
+        adr             x4,  inv_\txfm1\()_\h\()h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_\w\()h_x\h\()_neon
         b               inv_txfm_add_\w\()x\h\()_neon
 endfunc
 .endm
@@ -1118,14 +1110,14 @@ def_fns_48 8, 4
         mov             v22\szb, v3\szb
 .endm
 
-function inv_dct_8x16_neon
+function inv_dct_8h_x16_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.8h, v1.8h}, [x16]
         idct_16         .8h, .16b
         ret
 endfunc
 
-function inv_dct_4x16_neon
+function inv_dct_4h_x16_neon, export=1
         movrel          x16, idct_coeffs
         ld1             {v0.8h, v1.8h}, [x16]
         idct_16         .4h, .8b
@@ -1302,27 +1294,27 @@ endfunc
         sqneg           \o9\sz,  v7\sz // out9
 .endm
 
-function inv_adst_8x16_neon
+function inv_adst_8h_x16_neon, export=1
         iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .8h, .16b
         ret
 endfunc
 
-function inv_flipadst_8x16_neon
+function inv_flipadst_8h_x16_neon, export=1
         iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .8h, .16b
         ret
 endfunc
 
-function inv_adst_4x16_neon
+function inv_adst_4h_x16_neon, export=1
         iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, .4h, .8b
         ret
 endfunc
 
-function inv_flipadst_4x16_neon
+function inv_flipadst_4h_x16_neon, export=1
         iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, .4h, .8b
         ret
 endfunc
 
-function inv_identity_8x16_neon
+function inv_identity_8h_x16_neon, export=1
         mov             w16, #2*(5793-4096)*8
         dup             v0.4h,   w16
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1333,7 +1325,7 @@ function inv_identity_8x16_neon
         ret
 endfunc
 
-function inv_identity_4x16_neon
+function inv_identity_4h_x16_neon, export=1
         mov             w16, #2*(5793-4096)*8
         dup             v0.4h,   w16
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1376,71 +1368,49 @@ endfunc
 .endr
 .endm
 
-function inv_txfm_horz_16x8_neon
+.macro def_horz_16 scale=0, identity=0, shift=2, suffix // emits one inv_txfm_horz<suffix>_16x8_neon variant per instantiation
+function inv_txfm_horz\suffix\()_16x8_neon
         mov             x14, x30
         movi            v7.8h,  #0
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-        ld1             {v\i\().8h}, [x7]
-        st1             {v7.8h}, [x7], x8
-.endr
-        blr             x4
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-        srshr           v\i\().8h,  v\i\().8h,  #2
-.endr
-        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
-        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
-
-.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
-        st1             {v\i\().8h}, [x6], #16
-.endr
-
-        br              x14
-endfunc
-
-function inv_txfm_horz_identity_16x8_neon
-        mov             x14, x30
-        movi            v7.8h,  #0
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-        ld1             {v\i\().8h}, [x7]
-        st1             {v7.8h}, [x7], x8
-.endr
+.if \identity
         mov             w16, #2*(5793-4096)*8
         dup             v0.4h,   w16
-        identity_8x16_shift2 v0.h[0]
-        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
-        transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
-
-.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
-        st1             {v\i\().8h}, [x6], #16
-.endr
-
-        br              x14
-endfunc
-
-function inv_txfm_horz_scale_16x8_neon
-        mov             x14, x30
-        movi            v7.8h,  #0
+.elseif \scale
         mov             w16, #2896*8
         dup             v0.4h,   w16
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-        ld1             {v\i\().8h}, [x7]
+.endif
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        ld1             {\i}, [x7] // read one input vector; the st1 below zeroes it (v7 is 0) and steps x7 by x8
         st1             {v7.8h}, [x7], x8
 .endr
+.if \scale
         scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
         scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+.if \identity
+        identity_8x16_shift2 v0.h[0] // identity variant: inline macro instead of the indirect call through x4
+.else
         blr             x4
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-        srshr           v\i\().8h,  v\i\().8h,  #1
+.endif
+.if \shift > 0
+.irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h
+        srshr           \i,  \i,  #\shift // rounding right shift by the shift argument; not emitted when shift is 0
 .endr
+.endif
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
         transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
 
-.irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
-        st1             {v\i\().8h}, [x6], #16
+.irp i, v16.8h, v24.8h, v17.8h, v25.8h, v18.8h, v26.8h, v19.8h, v27.8h, v20.8h, v28.8h, v21.8h, v29.8h, v22.8h, v30.8h, v23.8h, v31.8h
+        st1             {\i}, [x6], #16 // output alternates registers from the two transposed halves (v16, v24, v17, v25, ...)
 .endr
 
         br              x14
 endfunc
+.endm
+
+def_horz_16 scale=0, identity=0, shift=2
+def_horz_16 scale=1, identity=0, shift=1, suffix=_scale
+def_horz_16 scale=0, identity=1, shift=0, suffix=_identity
 
 function inv_txfm_add_vert_8x16_neon
         mov             x14, x30
@@ -1487,7 +1457,7 @@ function inv_txfm_add_16x16_neon
 endfunc
 
 .macro def_fn_16x16 txfm1, txfm2, eob_half
-function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_neon, export=1
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         16,  16,  2
 .endif
@@ -1495,9 +1465,9 @@ function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_neon, export=1
         adr             x9,  inv_txfm_horz_identity_16x8_neon
 .else
         adr             x9,  inv_txfm_horz_16x8_neon
-        adr             x4,  inv_\txfm1\()_8x16_neon
+        adr             x4,  inv_\txfm1\()_8h_x16_neon
 .endif
-        adr             x5,  inv_\txfm2\()_8x16_neon
+        adr             x5,  inv_\txfm2\()_8h_x16_neon
         mov             x13, #\eob_half
         b               inv_txfm_add_16x16_neon
 endfunc
@@ -1659,17 +1629,17 @@ def_fn_416_base
 def_fn_416_base identity_
 
 .macro def_fn_416 w, h, txfm1, txfm2, eob_half
-function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         \w,  \h,  1
 .endif
 .if \w == 4
-        adr             x4,  inv_\txfm1\()_8x\w\()_neon
-        adr             x5,  inv_\txfm2\()_4x\h\()_neon
+        adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_4h_x\h\()_neon
         mov             w13, #\eob_half
 .else
-        adr             x4,  inv_\txfm1\()_4x\w\()_neon
-        adr             x5,  inv_\txfm2\()_8x\h\()_neon
+        adr             x4,  inv_\txfm1\()_4h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
 .endif
 .ifc \txfm1, identity
         b               inv_txfm_identity_add_\w\()x\h\()_neon
@@ -1842,12 +1812,12 @@ def_fn_816_base
 def_fn_816_base identity_
 
 .macro def_fn_816 w, h, txfm1, txfm2, eob_half
-function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_neon, export=1
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_neon, export=1
 .ifc \txfm1\()_\txfm2, dct_dct
         idct_dc         \w,  \h,  1
 .endif
-        adr             x4,  inv_\txfm1\()_8x\w\()_neon
-        adr             x5,  inv_\txfm2\()_8x\h\()_neon
+        adr             x4,  inv_\txfm1\()_8h_x\w\()_neon
+        adr             x5,  inv_\txfm2\()_8h_x\h\()_neon
 .if \w == 8
         mov             x13, #\eob_half
 .endif
@@ -1881,7 +1851,7 @@ def_fn_816 \w, \h, identity, flipadst, 64
 def_fns_816 8, 16
 def_fns_816 16, 8
 
-function inv_dct32_odd_8x16_neon
+function inv_dct32_odd_8h_x16_neon, export=1
         movrel          x16, idct_coeffs, 2*16
         ld1             {v0.8h, v1.8h}, [x16]
         sub             x16, x16, #2*16
@@ -2059,7 +2029,7 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon
         scale_input     .8h, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
         scale_input     .8h, v0.h[0], v24, v25, v26, v27, v28, v29, v30, v31
 .endif
-        bl              inv_dct_8x16_neon
+        bl              inv_dct_8h_x16_neon
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
         transpose_8x8h  v24, v25, v26, v27, v28, v29, v30, v31, v4, v5
 
@@ -2089,15 +2059,13 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon
         scale_input     .8h, v0.h[1], v16, v17, v18, v19, v20, v21, v22, v23
         scale_input     .8h, v0.h[1], v24, v25, v26, v27, v28, v29, v30, v31
 .endif
-        bl              inv_dct32_odd_8x16_neon
+        bl              inv_dct32_odd_8h_x16_neon
         transpose_8x8h  v31, v30, v29, v28, v27, v26, v25, v24, v4, v5
         transpose_8x8h  v23, v22, v21, v20, v19, v18, v17, v16, v4, v5
 .macro store2 r0, r1, shift
-        ld1             {v4.8h}, [x6], #16
-        ld1             {v5.8h}, [x6]
+        ld1             {v4.8h, v5.8h}, [x6]
         sqsub           v7.8h,   v4.8h,   \r0
         sqsub           v6.8h,   v5.8h,   \r1
-        sub             x6,  x6,  #16
         sqadd           v4.8h,   v4.8h,   \r0
         sqadd           v5.8h,   v5.8h,   \r1
         rev64           v6.8h,   v6.8h
@@ -2106,12 +2074,10 @@ function inv_txfm_horz\suffix\()_dct_32x8_neon
         srshr           v5.8h,   v5.8h,   #\shift
         srshr           v6.8h,   v6.8h,   #\shift
         srshr           v7.8h,   v7.8h,   #\shift
-        st1             {v4.8h}, [x6], #16
         ext             v6.16b,  v6.16b,  v6.16b,  #8
-        st1             {v5.8h}, [x6], #16
+        st1             {v4.8h, v5.8h}, [x6], #32
         ext             v7.16b,  v7.16b,  v7.16b,  #8
-        st1             {v6.8h}, [x6], #16
-        st1             {v7.8h}, [x6], #16
+        st1             {v6.8h, v7.8h}, [x6], #32
 .endm
 
         store2          v31.8h,  v23.8h, \shift
@@ -2139,7 +2105,7 @@ function inv_txfm_add_vert_dct_8x32_neon
 .endr
         sub             x7,  x7,  x8, lsl #4
 
-        bl              inv_dct_8x16_neon
+        bl              inv_dct_8h_x16_neon
 
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         st1             {v\i\().8h}, [x7], x8
@@ -2152,7 +2118,7 @@ function inv_txfm_add_vert_dct_8x32_neon
 .endr
         sub             x7,  x7,  x8, lsl #4
         sub             x7,  x7,  x8, lsr #1
-        bl              inv_dct32_odd_8x16_neon
+        bl              inv_dct32_odd_8h_x16_neon
 
         neg             x9,  x8
         mov             x10, x6
@@ -2216,7 +2182,7 @@ const eob_8x32
         .short 43, 107, 171, 256
 endconst
 
-function inv_txfm_add_identity_identity_32x32_neon, export=1
+function inv_txfm_add_identity_identity_32x32_8bpc_neon, export=1
         movi            v0.8h,  #0
         movrel          x13, eob_32x32
 
@@ -2259,7 +2225,7 @@ endfunc
 .endm
 
 .macro def_identity_1632 w, h, wshort, hshort
-function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
         mov             w16, #2896*8
         mov             w17, #2*(5793-4096)*8
         dup             v1.4h,   w16
@@ -2285,7 +2251,7 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
 .else
         // 32x16
         shift_8_regs    sqshl, 1
-        identity_8x8 v1.h[1]
+        identity_8x8    v1.h[1]
 .endif
 
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
@@ -2319,12 +2285,13 @@ def_identity_1632 16, 32, _shortside,
 def_identity_1632 32, 16, , _shortside
 
 .macro def_identity_832 w, h
-function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
+function inv_txfm_add_identity_identity_\w\()x\h\()_8bpc_neon, export=1
         movi            v0.8h,  #0
         movrel          x13, eob_8x32
 
         mov             w8,  #2*\h
 1:
+        ldrh            w12, [x13], #2
 .irp i, v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
         ld1             {\i}, [x2]
         st1             {v0.8h}, [x2], x8
@@ -2337,14 +2304,13 @@ function inv_txfm_add_identity_identity_\w\()x\h\()_neon, export=1
 
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
 
+        cmp             w3,  w12
 .if \w == 8
         load_add_store_8x8 x0, x7, shiftbits=2
 .else
         load_add_store_8x8 x0, x7, shiftbits=3
 .endif
 
-        ldrh            w12, [x13], #2
-        cmp             w3,  w12
         b.lt            9f
 .if \w == 8
         sub             x2,  x2,  x8, lsl #3
@@ -2363,7 +2329,7 @@ endfunc
 def_identity_832 8, 32
 def_identity_832 32, 8
 
-function inv_txfm_add_dct_dct_32x32_neon, export=1
+function inv_txfm_add_dct_dct_32x32_8bpc_neon, export=1
         idct_dc         32,  32,  2
 
         mov             x15, x30
@@ -2411,14 +2377,14 @@ function inv_txfm_add_dct_dct_32x32_neon, export=1
         br              x15
 endfunc
 
-function inv_txfm_add_dct_dct_16x32_neon, export=1
+function inv_txfm_add_dct_dct_16x32_8bpc_neon, export=1
         idct_dc         16,  32,  1
 
         mov             x15, x30
         sub             sp,  sp,  #1024
         movrel          x13, eob_16x32
         ldrh            w12, [x13], #2
-        adr             x4,  inv_dct_8x16_neon
+        adr             x4,  inv_dct_8h_x16_neon
 
 .irp i, 0, 8, 16, 24
         add             x6,  sp,  #(\i*16*2)
@@ -2460,13 +2426,13 @@ function inv_txfm_add_dct_dct_16x32_neon, export=1
         br              x15
 endfunc
 
-function inv_txfm_add_dct_dct_32x16_neon, export=1
+function inv_txfm_add_dct_dct_32x16_8bpc_neon, export=1
         idct_dc         32,  16,  1
 
         mov             x15, x30
         sub             sp,  sp,  #1024
 
-        adr             x5,  inv_dct_8x16_neon
+        adr             x5,  inv_dct_8h_x16_neon
 
 .irp i, 0, 8
         add             x6,  sp,  #(\i*32*2)
@@ -2505,7 +2471,7 @@ function inv_txfm_add_dct_dct_32x16_neon, export=1
         br              x15
 endfunc
 
-function inv_txfm_add_dct_dct_8x32_neon, export=1
+function inv_txfm_add_dct_dct_8x32_8bpc_neon, export=1
         idct_dc         8,   32, 2
 
         mov             x15, x30
@@ -2517,18 +2483,17 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1
         mov             x8,  #2*32
         mov             w9,  #32
         mov             x6,  sp
-        mov             x7,  x2
 1:
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        ld1             {v\i\().8h}, [x7]
-        st1             {v28.8h}, [x7], x8
+        ld1             {v\i\().8h}, [x2]
+        st1             {v28.8h}, [x2], x8
 .endr
         ldrh            w12, [x13], #2
+        sub             x2,  x2,  x8, lsl #3
         sub             w9,  w9,  #8
-        sub             x7,  x7,  x8, lsl #3
-        add             x7,  x7,  #2*8
+        add             x2,  x2,  #2*8
 
-        bl              inv_dct_8x8_neon
+        bl              inv_dct_8h_x8_neon
 
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         srshr           v\i\().8h,  v\i\().8h,  #2
@@ -2536,10 +2501,9 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1
 
         transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
 
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64
         cmp             w3,  w12
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        st1             {v\i\().8h}, [x6], #16
-.endr
+        st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x6], #64
 
         b.ge            1b
         cbz             w9,  3f
@@ -2564,7 +2528,7 @@ function inv_txfm_add_dct_dct_8x32_neon, export=1
         br              x15
 endfunc
 
-function inv_txfm_add_dct_dct_32x8_neon, export=1
+function inv_txfm_add_dct_dct_32x8_8bpc_neon, export=1
         idct_dc         32,  8,   2
 
         mov             x15, x30
@@ -2586,7 +2550,7 @@ function inv_txfm_add_dct_dct_32x8_neon, export=1
 .endr
         add             w9,  w9,  #8
 
-        bl              inv_dct_8x8_neon
+        bl              inv_dct_8h_x8_neon
 
         cmp             w9,  #32
 
@@ -2791,7 +2755,7 @@ endfunc
 .endm
 
 .macro def_dct64_func suffix, clear=0, scale=0
-function inv_txfm_dct\suffix\()_8x64_neon
+function inv_txfm_dct\suffix\()_8h_x64_neon, export=1
         mov             x14, x30
         mov             x6,  sp
         lsl             x8,  x8,  #2
@@ -2804,7 +2768,7 @@ function inv_txfm_dct\suffix\()_8x64_neon
         add             x7,  x7,  x8, lsr #1
         scale_if        \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
 
-        bl              inv_dct_8x16_neon
+        bl              inv_dct_8h_x16_neon
 
         store16         x6
 
@@ -2817,7 +2781,7 @@ function inv_txfm_dct\suffix\()_8x64_neon
         sub             x7,  x7,  x8, lsr #1
         scale_if        \scale, v0.h[0], v16, v17, v18, v19, v20, v21, v22, v23
 
-        bl              inv_dct32_odd_8x16_neon
+        bl              inv_dct32_odd_8h_x16_neon
 
         add             x10, x6,  #16*15
         sub             x6,  x6,  #16*16
@@ -3040,7 +3004,11 @@ endfunc
 
 .macro sub_sp space
 #ifdef _WIN32
-.if \space > 4096
+.if \space > 8192
+        // Here, we'd need to touch two (or more) pages while decrementing
+        // the stack pointer.
+        .error          "sub_sp doesn't support values over 8K at the moment"
+.elseif \space > 4096 // load from sp - 4096 (result discarded) before sp itself is moved further down
         sub             x16, sp,  #4096
         ldr             xzr, [x16]
         sub             sp,  x16, #(\space - 4096)
@@ -3050,16 +3018,14 @@ endfunc
 #else
 .if \space >= 4096
         sub             sp,  sp,  #(\space)/4096*4096
+.endif // the remainder below 4096, if any, is subtracted separately by the next block
 .if (\space % 4096) != 0
         sub             sp,  sp,  #(\space)%4096
 .endif
-.else
-        sub             sp,  sp,  #\space
-.endif
 #endif
 .endm
 
-function inv_txfm_add_dct_dct_64x64_neon, export=1
+function inv_txfm_add_dct_dct_64x64_8bpc_neon, export=1
         idct_dc         64,  64,  2
 
         mov             x15, x30
@@ -3079,7 +3045,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1
         add             x7,  x2,  #(\i*2)
         mov             x8,  #32*2
         mov             x12, #-2 // shift
-        bl              inv_txfm_dct_clear_8x64_neon
+        bl              inv_txfm_dct_clear_8h_x64_neon
         add             x6,  x5,  #(\i*64*2)
         bl              inv_txfm_horz_dct_64x8_neon
 .if \i < 24
@@ -3104,7 +3070,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1
 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
         add             x7,  x5,  #(\i*2)
         mov             x8,  #64*2
-        bl              inv_txfm_dct_8x64_neon
+        bl              inv_txfm_dct_8h_x64_neon
         add             x6,  x0,  #(\i)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
@@ -3113,7 +3079,7 @@ function inv_txfm_add_dct_dct_64x64_neon, export=1
         br              x15
 endfunc
 
-function inv_txfm_add_dct_dct_64x32_neon, export=1
+function inv_txfm_add_dct_dct_64x32_8bpc_neon, export=1
         idct_dc         64,  32,  1
 
         mov             x15, x30
@@ -3133,7 +3099,7 @@ function inv_txfm_add_dct_dct_64x32_neon, export=1
         add             x7,  x2,  #(\i*2)
         mov             x8,  #32*2
         mov             x12, #-1 // shift
-        bl              inv_txfm_dct_clear_scale_8x64_neon
+        bl              inv_txfm_dct_clear_scale_8h_x64_neon
         add             x6,  x5,  #(\i*64*2)
         bl              inv_txfm_horz_dct_64x8_neon
 .if \i < 24
@@ -3166,7 +3132,7 @@ function inv_txfm_add_dct_dct_64x32_neon, export=1
         br              x15
 endfunc
 
-function inv_txfm_add_dct_dct_32x64_neon, export=1
+function inv_txfm_add_dct_dct_32x64_8bpc_neon, export=1
         idct_dc         32,  64,  1
 
         mov             x15, x30
@@ -3207,7 +3173,7 @@ function inv_txfm_add_dct_dct_32x64_neon, export=1
 .irp i, 0, 8, 16, 24
         add             x7,  x5,  #(\i*2)
         mov             x8,  #32*2
-        bl              inv_txfm_dct_8x64_neon
+        bl              inv_txfm_dct_8h_x64_neon
         add             x6,  x0,  #(\i)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
@@ -3216,7 +3182,7 @@ function inv_txfm_add_dct_dct_32x64_neon, export=1
         br              x15
 endfunc
 
-function inv_txfm_add_dct_dct_64x16_neon, export=1
+function inv_txfm_add_dct_dct_64x16_8bpc_neon, export=1
         idct_dc         64,  16,  2
 
         mov             x15, x30
@@ -3232,14 +3198,16 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1
         mov             w8,  #(16 - \i)
         cmp             w3,  w12
         b.lt            1f
-        ldrh            w12, [x13], #2
 .endif
         add             x7,  x2,  #(\i*2)
         mov             x8,  #16*2
         mov             x12, #-2 // shift
-        bl              inv_txfm_dct_clear_8x64_neon
+        bl              inv_txfm_dct_clear_8h_x64_neon
         add             x6,  x4,  #(\i*64*2)
         bl              inv_txfm_horz_dct_64x8_neon
+.if \i < 8
+        ldrh            w12, [x13], #2
+.endif
 .endr
         b               3f
 
@@ -3256,7 +3224,7 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1
         b.gt            2b
 
 3:
-        adr             x5,  inv_dct_8x16_neon
+        adr             x5,  inv_dct_8h_x16_neon
 .irp i, 0, 8, 16, 24, 32, 40, 48, 56
         add             x6,  x0,  #(\i)
         add             x7,  x4,  #(\i*2)
@@ -3268,7 +3236,7 @@ function inv_txfm_add_dct_dct_64x16_neon, export=1
         br              x15
 endfunc
 
-function inv_txfm_add_dct_dct_16x64_neon, export=1
+function inv_txfm_add_dct_dct_16x64_8bpc_neon, export=1
         idct_dc         16,  64,  2
 
         mov             x15, x30
@@ -3279,7 +3247,7 @@ function inv_txfm_add_dct_dct_16x64_neon, export=1
         movrel          x13, eob_16x32
         ldrh            w12, [x13], #2
 
-        adr             x4,  inv_dct_8x16_neon
+        adr             x4,  inv_dct_8h_x16_neon
 .irp i, 0, 8, 16, 24
         add             x6,  x5,  #(\i*16*2)
 .if \i > 0
@@ -3310,7 +3278,7 @@ function inv_txfm_add_dct_dct_16x64_neon, export=1
 .irp i, 0, 8
         add             x7,  x5,  #(\i*2)
         mov             x8,  #16*2
-        bl              inv_txfm_dct_8x64_neon
+        bl              inv_txfm_dct_8h_x64_neon
         add             x6,  x0,  #(\i)
         bl              inv_txfm_add_vert_dct_8x64_neon
 .endr
diff --git a/ffmpeg/JNI/dav1d/src/arm/64/itx16.S b/ffmpeg/JNI/dav1d/src/arm/64/itx16.S
new file mode 100644
index 000000000..266f57e36
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/src/arm/64/itx16.S
@@ -0,0 +1,3526 @@
+/******************************************************************************
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// The exported functions in this file have got the following signature:
+// void itxfm_add(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob,
+//                int bitdepth_max);
+
+// Most of the functions use the following register layout:
+// x0-x3  external parameters
+// x4     function pointer to first transform
+// x5     function pointer to second transform
+// x6     output parameter for helper function
+// x7     input parameter for helper function
+// x8     input stride for helper function
+// x9-x12 scratch variables for helper functions
+// x13    pointer to list of eob thresholds
+// x14    return pointer for helper function
+// x15    return pointer for main function
+
+// The SIMD registers most often use the following layout:
+// v0-v1   multiplication coefficients
+// v2-v7   scratch registers
+// v8-v15  unused
+// v16-v31 inputs/outputs of transforms
+
+const idct_coeffs, align=4 // table of .int constants, one group per transform size (labelled below)
+        // idct4
+        .int            2896, 2896*8*(1<<16), 1567, 3784 // second entry is 2896 pre-multiplied by 8*(1<<16); presumably for sqrdmulh use - confirm at the use sites
+        // idct8
+        .int            799, 4017, 3406, 2276
+        // idct16
+        .int            401, 4076, 3166, 2598
+        .int            1931, 3612, 3920, 1189
+        // idct32
+        .int            201, 4091, 3035, 2751
+        .int            1751, 3703, 3857, 1380
+        .int            995, 3973, 3513, 2106
+        .int            2440, 3290, 4052, 601
+endconst
+
+const idct64_coeffs, align=4 // four groups of 12 .int values each
+        .int            101*8*(1<<16), 4095*8*(1<<16), 2967*8*(1<<16), -2824*8*(1<<16) // first 8 values of every group carry the 8*(1<<16) factor
+        .int            1660*8*(1<<16), 3745*8*(1<<16), 3822*8*(1<<16), -1474*8*(1<<16)
+        .int            4076, 401, 4017, 799 // last 4 values of every group are stored unscaled
+
+        .int            4036*8*(1<<16), -700*8*(1<<16), 2359*8*(1<<16), 3349*8*(1<<16)
+        .int            3461*8*(1<<16), -2191*8*(1<<16), 897*8*(1<<16), 3996*8*(1<<16)
+        .int            -3166, -2598, -799, -4017
+
+        .int            501*8*(1<<16), 4065*8*(1<<16), 3229*8*(1<<16), -2520*8*(1<<16)
+        .int            2019*8*(1<<16), 3564*8*(1<<16), 3948*8*(1<<16), -1092*8*(1<<16)
+        .int            3612, 1931, 2276, 3406
+
+        .int            4085*8*(1<<16), -301*8*(1<<16), 2675*8*(1<<16), 3102*8*(1<<16)
+        .int            3659*8*(1<<16), -1842*8*(1<<16), 1285*8*(1<<16), 3889*8*(1<<16)
+        .int            -3920, -1189, -3406, -2276
+endconst
+
+const iadst4_coeffs, align=4 // four unscaled .int constants; by its name, used by the 4-point inverse ADST
+        .int            1321, 3803, 2482, 3344
+endconst
+
+const iadst8_coeffs, align=4 // 8 values followed by a copy of the idct4 row
+        .int            4076, 401, 3612, 1931 // same values as the idct16 rows of idct_coeffs, in a different order
+        .int            2598, 3166, 1189, 3920
+        // idct_coeffs
+        .int            2896, 0, 1567, 3784 // matches the idct4 row of idct_coeffs except that the pre-scaled entry is 0 here
+endconst
+
+const iadst16_coeffs, align=4 // 16 unscaled .int constants
+        .int            4091, 201, 3973, 995 // same values as the idct32 rows of idct_coeffs, in a different order
+        .int            3703, 1751, 3290, 2440
+        .int            2751, 3035, 2106, 3513
+        .int            1380, 3857, 601, 4052
+endconst
+
+.macro mul_mla d, s0, s1, c0, c1 // d = s0*c0 + s1*c1 on four 32-bit lanes
+        mul             \d\().4s, \s0\().4s, \c0 // d  = s0 * c0
+        mla             \d\().4s, \s1\().4s, \c1 // d += s1 * c1; d is already overwritten here, so d must not be the s1 register
+.endm
+
+.macro mul_mls d, s0, s1, c0, c1 // d = s0*c0 - s1*c1 on four 32-bit lanes
+        mul             \d\().4s, \s0\().4s, \c0 // d  = s0 * c0
+        mls             \d\().4s, \s1\().4s, \c1 // d -= s1 * c1; d is already overwritten here, so d must not be the s1 register
+.endm
+
+.macro scale_input sz, c, r0, r1, r2, r3, r4, r5, r6, r7 // multiply r0-r3 (and r4-r7 when given) by c, in place
+        sqrdmulh        \r0\sz,  \r0\sz,  \c // sz is the arrangement suffix appended to each register name
+        sqrdmulh        \r1\sz,  \r1\sz,  \c
+        sqrdmulh        \r2\sz,  \r2\sz,  \c
+        sqrdmulh        \r3\sz,  \r3\sz,  \c
+.ifnb \r4
+        sqrdmulh        \r4\sz,  \r4\sz,  \c // only r4 is tested for blank: pass r4-r7 together or not at all
+        sqrdmulh        \r5\sz,  \r5\sz,  \c
+        sqrdmulh        \r6\sz,  \r6\sz,  \c
+        sqrdmulh        \r7\sz,  \r7\sz,  \c
+.endif
+.endm
+
+.macro load_add_store load, shift, addsrc, adddst, max, min, store, dst, src, shiftbits=4 // each stage is emitted only if its argument is non-blank
+.ifnb \load
+        ld1             {\load},  [\src], x1 // read one row through src, then step src by x1 (presumably dst_stride, per the signature in the file header)
+.endif
+.ifnb \shift
+        srshr           \shift,  \shift,  #\shiftbits // rounding right shift, in place
+.endif
+.ifnb \addsrc
+        sqadd           \adddst, \adddst, \addsrc // saturating add of addsrc into adddst
+.endif
+.ifnb \max
+        smax            \max,  \max,  v6.8h // lower clamp against v6; the caller must have set v6
+.endif
+.ifnb \min
+        smin            \min,  \min,  v7.8h // upper clamp against v7; the caller must have set v7
+.endif
+.ifnb \store
+        st1             {\store},  [\dst], x1 // write one row through dst, then step dst by x1
+.endif
+.endm
+.macro load_add_store_8x16 dst, src // add v16-v31 (16 rows of 8h) to the rows at dst, clamp to [v6, v7] and store back
+        mov             \src, \dst // src becomes the read pointer; reads run ahead of the writes made through dst
+        movi            v6.8h,   #0 // lower clamp bound used by smax in load_add_store
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff upper clamp bound; NOTE(review): fixed value, bitdepth_max is not consulted here
+        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src
+        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src
+        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src
+        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src
+        load_add_store  v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src
+        load_add_store  v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src
+        load_add_store  v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src
+        load_add_store  v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src
+        load_add_store  v2.8h, v24.8h, v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src
+        load_add_store  v3.8h, v25.8h, v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src
+        load_add_store  v4.8h, v26.8h, v2.8h, v24.8h, v23.8h, v22.8h, v21.8h, \dst, \src
+        load_add_store  v5.8h, v27.8h, v3.8h, v25.8h, v24.8h, v23.8h, v22.8h, \dst, \src
+        load_add_store  v2.8h, v28.8h, v4.8h, v26.8h, v25.8h, v24.8h, v23.8h, \dst, \src
+        load_add_store  v3.8h, v29.8h, v5.8h, v27.8h, v26.8h, v25.8h, v24.8h, \dst, \src
+        load_add_store  v4.8h, v30.8h, v2.8h, v28.8h, v27.8h, v26.8h, v25.8h, \dst, \src
+        load_add_store  v5.8h, v31.8h, v3.8h, v29.8h, v28.8h, v27.8h, v26.8h, \dst, \src
+        load_add_store       ,       , v4.8h, v30.8h, v29.8h, v28.8h, v27.8h, \dst, \src
+        load_add_store       ,       , v5.8h, v31.8h, v30.8h, v29.8h, v28.8h, \dst, \src
+        load_add_store       ,       ,      ,       , v31.8h, v30.8h, v29.8h, \dst, \src
+        load_add_store       ,       ,      ,       ,       , v31.8h, v30.8h, \dst, \src
+        load_add_store       ,       ,      ,       ,       ,       , v31.8h, \dst, \src
+.endm
+.macro load_add_store_8x8 dst, src, shiftbits=4 // add v16-v23 (8 rows of 8h) to the rows at dst, clamp to [v6, v7] and store back
+        mov             \src, \dst // src becomes the read pointer; reads run ahead of the writes made through dst
+        movi            v6.8h,   #0 // lower clamp bound used by smax in load_add_store
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff, upper clamp bound used by smin in load_add_store
+        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src, \shiftbits
+        load_add_store  v2.8h, v20.8h, v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src, \shiftbits
+        load_add_store  v3.8h, v21.8h, v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+        load_add_store  v4.8h, v22.8h, v2.8h, v20.8h, v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+        load_add_store  v5.8h, v23.8h, v3.8h, v21.8h, v20.8h, v19.8h, v18.8h, \dst, \src, \shiftbits
+        load_add_store       ,       , v4.8h, v22.8h, v21.8h, v20.8h, v19.8h, \dst, \src, \shiftbits
+        load_add_store       ,       , v5.8h, v23.8h, v22.8h, v21.8h, v20.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       , v23.8h, v22.8h, v21.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       , v23.8h, v22.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       ,       , v23.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store_8x4 dst, src, shiftbits=4 // add v16-v19 (4 rows of 8h) to the rows at dst, clamp to [v6, v7] and store back
+        mov             \src, \dst // src becomes the read pointer; reads run ahead of the writes made through dst
+        movi            v6.8h,   #0 // lower clamp bound used by smax in load_add_store
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff, upper clamp bound used by smin in load_add_store
+        load_add_store  v2.8h, v16.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v3.8h, v17.8h,      ,       ,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v4.8h, v18.8h, v2.8h, v16.8h,       ,       ,       , \dst, \src, \shiftbits
+        load_add_store  v5.8h, v19.8h, v3.8h, v17.8h, v16.8h,       ,       , \dst, \src, \shiftbits
+        load_add_store       ,       , v4.8h, v18.8h, v17.8h, v16.8h,       , \dst, \src, \shiftbits
+        load_add_store       ,       , v5.8h, v19.8h, v18.8h, v17.8h, v16.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       , v19.8h, v18.8h, v17.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       , v19.8h, v18.8h, \dst, \src, \shiftbits
+        load_add_store       ,       ,      ,       ,       ,       , v19.8h, \dst, \src, \shiftbits
+.endm
+.macro load_add_store4 load, inssrc, insdst, shift, addsrc, adddst, max, min, store, dst, src // one interleaved step for 4-wide rows; every stage is emitted only when its argument is non-blank
+.ifnb \load
+        ld1             {\load}[0],  [\src], x1 // first row into lane 0; src advances by x1
+.endif
+.ifnb \inssrc
+        ins             \insdst\().d[1],   \inssrc\().d[0] // pack the low half of inssrc into the high half of insdst
+.endif
+.ifnb \shift
+        srshr           \shift,  \shift,  #4 // rounding shift right by 4
+.endif
+.ifnb \load
+        ld1             {\load}[1],  [\src], x1 // second row into lane 1
+.endif
+.ifnb \addsrc
+        sqadd           \adddst, \adddst, \addsrc // saturating adddst += addsrc
+.endif
+.ifnb \store
+        st1             {\store}[0],  [\dst], x1 // first row out; dst advances by x1
+.endif
+.ifnb \max
+        smax            \max,  \max,  v6.8h // lower clamp against v6, which the callers set to zero
+.endif
+.ifnb \min
+        smin            \min,  \min,  v7.8h // upper clamp against v7, which the callers set to 0x3ff
+.endif
+.ifnb \store
+        st1             {\store}[1],  [\dst], x1 // second row out
+.endif
+.endm
+.macro load_add_store_4x16 dst, src // drive load_add_store4 over sixteen 4-halfword rows held in the low halves of v16-v31
+        mov             \src, \dst // separate read pointer, starting at the same address as dst
+        movi            v6.8h,   #0 // lower clamp bound read by load_add_store4
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store4 v0.d, v17, v16,       ,      ,       ,       ,       ,      , \dst, \src // load two rows into v0 and pack v17 into the high half of v16
+        load_add_store4 v1.d, v19, v18,       ,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v2.d, v21, v20, v16.8h,      ,       ,       ,       ,      , \dst, \src // v16 enters the shift stage
+        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h,       ,       ,      , \dst, \src // v16 += loaded rows in v0
+        load_add_store4 v0.d, v25, v24, v20.8h, v1.8h, v18.8h, v16.8h,       ,      , \dst, \src // v0-v3 are recycled as load targets
+        load_add_store4 v1.d, v27, v26, v22.8h, v2.8h, v20.8h, v18.8h, v16.8h,      , \dst, \src
+        load_add_store4 v2.d, v29, v28, v24.8h, v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src // first store: both halves of v16, i.e. two rows
+        load_add_store4 v3.d, v31, v30, v26.8h, v0.8h, v24.8h, v22.8h, v20.8h, v18.d, \dst, \src
+        load_add_store4     ,    ,    , v28.8h, v1.8h, v26.8h, v24.8h, v22.8h, v20.d, \dst, \src // no more loads: the remaining invocations drain the later stages
+        load_add_store4     ,    ,    , v30.8h, v2.8h, v28.8h, v26.8h, v24.8h, v22.d, \dst, \src
+        load_add_store4     ,    ,    ,       , v3.8h, v30.8h, v28.8h, v26.8h, v24.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       , v30.8h, v28.8h, v26.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       , v30.8h, v28.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       ,       , v30.d, \dst, \src
+.endm
+.macro load_add_store_4x8 dst, src // drive load_add_store4 over eight 4-halfword rows held in the low halves of v16-v23
+        mov             \src, \dst // separate read pointer, starting at the same address as dst
+        movi            v6.8h,   #0 // lower clamp bound read by load_add_store4
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+        load_add_store4 v0.d, v17, v16,       ,      ,       ,       ,       ,      , \dst, \src // load two rows into v0 and pack v17 into the high half of v16
+        load_add_store4 v1.d, v19, v18,       ,      ,       ,       ,       ,      , \dst, \src
+        load_add_store4 v2.d, v21, v20, v16.8h,      ,       ,       ,       ,      , \dst, \src // v16 enters the shift stage
+        load_add_store4 v3.d, v23, v22, v18.8h, v0.8h, v16.8h,       ,       ,      , \dst, \src // v16 += loaded rows in v0
+        load_add_store4     ,    ,    , v20.8h, v1.8h, v18.8h, v16.8h,       ,      , \dst, \src // no more loads: the remaining invocations drain the later stages
+        load_add_store4     ,    ,    , v22.8h, v2.8h, v20.8h, v18.8h, v16.8h,      , \dst, \src
+        load_add_store4     ,    ,    ,       , v3.8h, v22.8h, v20.8h, v18.8h, v16.d, \dst, \src // first store: both halves of v16, i.e. two rows
+        load_add_store4     ,    ,    ,       ,      ,       , v22.8h, v20.8h, v18.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       , v22.8h, v20.d, \dst, \src
+        load_add_store4     ,    ,    ,       ,      ,       ,       ,       , v22.d, \dst, \src
+.endm
+
+.macro idct_dc w, h, shift // shortcut taken when w3 == 0: scale the first coefficient and branch to the idct_dc_w<w>_neon helper
+        cbnz            w3,  1f // w3 != 0: skip the shortcut and continue after the macro (w3 is presumably the eob)
+        movz            w16, #2896*8, lsl #16 // 2896*8 in the upper halfword, so sqrdmulh by it scales by 2896/4096
+        ld1r            {v16.4s}, [x2] // broadcast the first 32-bit coefficient
+        dup             v0.2s,   w16
+        sqrdmulh        v20.4s,  v16.4s,  v0.s[0] // first 2896/4096 scaling
+        str             wzr, [x2] // clear that coefficient in memory
+.if (\w == 2*\h) || (2*\w == \h)
+        sqrdmulh        v20.4s,  v20.4s,  v0.s[0] // 2:1 and 1:2 sizes get one more 2896/4096 scaling
+.endif
+.if \shift > 0
+        sqrshrn         v16.4h,  v20.4s,  #\shift // rounding narrow to 16 bit; both halves come from the same v20
+        sqrshrn2        v16.8h,  v20.4s,  #\shift
+.else
+        sqxtn           v16.4h,  v20.4s // plain saturating narrow when no shift is requested
+        sqxtn2          v16.8h,  v20.4s
+.endif
+        sqrdmulh        v16.8h,  v16.8h,  v0.h[1] // v0.h[1] is 2896*8, so again a 2896/4096 scaling, now on 16-bit lanes
+        srshr           v16.8h,  v16.8h,  #4 // final rounding shift by 4
+        mov             w4,  #\h // row count consumed by the helper loop
+        b               idct_dc_w\w\()_neon // tail branch: the helper ends in ret and x30 is untouched here
+1:
+.endm
+
+function idct_dc_w4_neon // add v16 to w4 rows of 4 halfwords at x0 (row step x1), clamping to [0, 0x3ff]
+        movi            v30.8h,  #0 // lower clamp bound
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.d}[0], [x0], x1 // four rows per iteration, two rows per register
+        ld1             {v0.d}[1], [x0], x1
+        ld1             {v1.d}[0], [x0], x1
+        subs            w4,  w4,  #4 // sets the flags tested by b.gt below
+        ld1             {v1.d}[1], [x0], x1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        sub             x0,  x0,  x1, lsl #2 // rewind x0 by four rows for the stores
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        st1             {v0.d}[0], [x0], x1
+        smin            v1.8h,   v1.8h,   v31.8h
+        st1             {v0.d}[1], [x0], x1
+        st1             {v1.d}[0], [x0], x1
+        st1             {v1.d}[1], [x0], x1
+        b.gt            1b // loop while rows remain
+        ret
+endfunc
+
+function idct_dc_w8_neon // add v16 to w4 rows of 8 halfwords at x0 (row step x1), clamping to [0, 0x3ff]
+        movi            v30.8h,  #0 // lower clamp bound
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.8h}, [x0], x1 // four rows per iteration, one row per register
+        subs            w4,  w4,  #4 // sets the flags tested by b.gt below
+        ld1             {v1.8h}, [x0], x1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        ld1             {v2.8h}, [x0], x1
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        ld1             {v3.8h}, [x0], x1
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        sub             x0,  x0,  x1, lsl #2 // rewind x0 by four rows for the stores
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        st1             {v0.8h}, [x0], x1
+        smin            v2.8h,   v2.8h,   v31.8h
+        st1             {v1.8h}, [x0], x1
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.8h}, [x0], x1
+        st1             {v3.8h}, [x0], x1
+        b.gt            1b // loop while rows remain
+        ret
+endfunc
+
+function idct_dc_w16_neon // add v16 to w4 rows of 16 halfwords at x0 (row step x1), clamping to [0, 0x3ff]
+        movi            v30.8h,  #0 // lower clamp bound
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.8h, v1.8h}, [x0], x1 // two rows per iteration, two registers per row
+        subs            w4,  w4,  #2 // sets the flags tested by b.gt below
+        ld1             {v2.8h, v3.8h}, [x0], x1
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        sub             x0,  x0,  x1, lsl #1 // rewind x0 by two rows for the stores
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        st1             {v0.8h, v1.8h}, [x0], x1
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v2.8h, v3.8h}, [x0], x1
+        b.gt            1b // loop while rows remain
+        ret
+endfunc
+
+function idct_dc_w32_neon // add v16 to w4 rows of 32 halfwords at x0 (row step x1), clamping to [0, 0x3ff]
+        movi            v30.8h,  #0 // lower clamp bound
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] // one full row per iteration; no post-increment, the store advances x0
+        subs            w4,  w4,  #1 // sets the flags tested by b.gt below
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 // write back and step to the next row
+        b.gt            1b // loop while rows remain
+        ret
+endfunc
+
+function idct_dc_w64_neon // add v16 to w4 rows of 64 halfwords at x0, clamping to [0, 0x3ff]; uses v0-v7 as data registers
+        movi            v30.8h,  #0 // lower clamp bound
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+        sub             x1,  x1,  #64 // the first st1 of each row already advances x0 by 64 bytes
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // first 64 bytes of the row
+        subs            w4,  w4,  #1 // sets the flags tested by b.gt below
+        sqadd           v0.8h,   v0.8h,   v16.8h
+        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0] // second 64 bytes of the row
+        sqadd           v1.8h,   v1.8h,   v16.8h
+        sub             x0,  x0,  #64 // back to the start of the row for the stores
+        sqadd           v2.8h,   v2.8h,   v16.8h
+        sqadd           v3.8h,   v3.8h,   v16.8h
+        sqadd           v4.8h,   v4.8h,   v16.8h
+        sqadd           v5.8h,   v5.8h,   v16.8h
+        sqadd           v6.8h,   v6.8h,   v16.8h
+        sqadd           v7.8h,   v7.8h,   v16.8h
+        smax            v0.8h,   v0.8h,   v30.8h
+        smax            v1.8h,   v1.8h,   v30.8h
+        smax            v2.8h,   v2.8h,   v30.8h
+        smax            v3.8h,   v3.8h,   v30.8h
+        smax            v4.8h,   v4.8h,   v30.8h
+        smax            v5.8h,   v5.8h,   v30.8h
+        smax            v6.8h,   v6.8h,   v30.8h
+        smax            v7.8h,   v7.8h,   v30.8h
+        smin            v0.8h,   v0.8h,   v31.8h
+        smin            v1.8h,   v1.8h,   v31.8h
+        smin            v2.8h,   v2.8h,   v31.8h
+        smin            v3.8h,   v3.8h,   v31.8h
+        smin            v4.8h,   v4.8h,   v31.8h
+        smin            v5.8h,   v5.8h,   v31.8h
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
+        smin            v6.8h,   v6.8h,   v31.8h
+        smin            v7.8h,   v7.8h,   v31.8h
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1 // x1 was reduced by 64 above, so this lands on the next row
+        b.gt            1b // loop while rows remain
+        ret
+endfunc
+
+.macro iwht4 // one in-place 4-point pass of the wht transform on v16-v19 (32-bit lanes); clobbers v20 and v21
+        add             v16.4s,  v16.4s,  v17.4s
+        sub             v21.4s,  v18.4s,  v19.4s
+        sub             v20.4s,  v16.4s,  v21.4s
+        sshr            v20.4s,  v20.4s,  #1 // arithmetic halving, no rounding
+        sub             v18.4s,  v20.4s,  v17.4s
+        sub             v17.4s,  v20.4s,  v19.4s
+        add             v19.4s,  v21.4s,  v18.4s
+        sub             v16.4s,  v16.4s,  v17.4s
+.endm
+
+.macro idct_4 r0, r1, r2, r3 // 4-point inverse DCT in place on r0-r3 (32-bit lanes); coefficients in v0, temporaries v2, v3, v4, v6, v7
+        mul_mla         v6,  \r1, \r3, v0.s[3], v0.s[2] // NOTE(review): mul_mla/mul_mls are defined outside this section; presumably a*c0 +/- b*c1
+        mul_mls         v4,  \r1, \r3, v0.s[2], v0.s[3]
+        mul_mla         v2,  \r0, \r2, v0.s[0], v0.s[0]
+        mul_mls         v3,  \r0, \r2, v0.s[0], v0.s[0]
+        srshr           v6.4s,  v6.4s,  #12 // products are brought back down with a rounding shift by 12
+        srshr           v7.4s,  v4.4s,  #12
+        srshr           v2.4s,  v2.4s,  #12
+        srshr           v3.4s,  v3.4s,  #12
+        sqadd           \r0\().4s,  v2.4s,   v6.4s // saturating butterflies produce the four outputs
+        sqsub           \r3\().4s,  v2.4s,   v6.4s
+        sqadd           \r1\().4s,  v3.4s,   v7.4s
+        sqsub           \r2\().4s,  v3.4s,   v7.4s
+.endm
+
+function inv_dct_4s_x4_neon // 4-point inverse DCT on v16-v19 (32-bit lanes), in place
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s}, [x16] // the four 32-bit coefficients idct_4 reads from v0
+        idct_4          v16, v17, v18, v19
+        ret
+endfunc
+
+.macro iadst_4x4 o0, o1, o2, o3 // 4-point inverse ADST: inputs are always v16-v19 (32-bit lanes), outputs go to o0-o3
+        movrel          x16, iadst4_coeffs
+        ld1             {v0.4s}, [x16] // four 32-bit coefficients
+
+        sub             v3.4s,   v16.4s,  v18.4s
+        mul             v4.4s,   v16.4s,  v0.s[0]
+        mla             v4.4s,   v18.4s,  v0.s[1]
+        mla             v4.4s,   v19.4s,  v0.s[2]
+        mul             v7.4s,   v17.4s,  v0.s[3]
+        add             v3.4s,   v3.4s,   v19.4s
+        mul             v5.4s,   v16.4s,  v0.s[2]
+        mls             v5.4s,   v18.4s,  v0.s[0]
+        mls             v5.4s,   v19.4s,  v0.s[1] // last read of v16-v19: outputs may therefore alias the inputs in any order
+
+        add             \o3\().4s, v4.4s,     v5.4s
+        mul             \o2\().4s, v3.4s,     v0.s[3]
+        add             \o0\().4s, v4.4s,     v7.4s
+        add             \o1\().4s, v5.4s,     v7.4s
+        sub             \o3\().4s, \o3\().4s, v7.4s
+
+        srshr           \o0\().4s, \o0\().4s, #12 // rounding shift by 12 on every output
+        srshr           \o2\().4s, \o2\().4s, #12
+        srshr           \o1\().4s, \o1\().4s, #12
+        srshr           \o3\().4s, \o3\().4s, #12
+.endm
+
+function inv_adst_4s_x4_neon // 4-point inverse ADST on v16-v19, outputs in the same register order
+        iadst_4x4       v16, v17, v18, v19
+        ret
+endfunc
+
+function inv_flipadst_4s_x4_neon // same computation as inv_adst_4s_x4_neon with the output registers reversed
+        iadst_4x4       v19, v18, v17, v16
+        ret
+endfunc
+
+function inv_identity_4s_x4_neon // scale v16-v19 (32-bit lanes) by 5793/4096, in place
+        movz            w16, #(5793-4096)*8, lsl #16 // sqrdmulh by this constant yields x*(5793-4096)/4096
+        dup             v0.2s,   w16
+        sqrdmulh        v4.4s,   v16.4s,  v0.s[0]
+        sqrdmulh        v5.4s,   v17.4s,  v0.s[0]
+        sqrdmulh        v6.4s,   v18.4s,  v0.s[0]
+        sqrdmulh        v7.4s,   v19.4s,  v0.s[0]
+        sqadd           v16.4s,  v16.4s,  v4.4s // x + x*(5793-4096)/4096 = x*5793/4096
+        sqadd           v17.4s,  v17.4s,  v5.4s
+        sqadd           v18.4s,  v18.4s,  v6.4s
+        sqadd           v19.4s,  v19.4s,  v7.4s
+        ret
+endfunc
+
+function inv_txfm_add_wht_wht_4x4_16bpc_neon, export=1 // 4x4 wht in both passes, then add to the pixels at x0 via the shared 4x4 tail
+        mov             x15, x30 // the shared tail at L(itx_4x4_end) returns with br x15
+        movi            v30.4s,  #0 // zero vector: clears coefficients here and is the lower clamp bound in the tail
+        movi            v31.4s,  #0
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] // all sixteen 32-bit coefficients
+        st1             {v30.4s, v31.4s}, [x2], #32 // clear the first 32 bytes of the coefficient buffer
+
+        sshr            v16.4s,  v16.4s,  #2 // pre-shift the input right by 2
+        sshr            v17.4s,  v17.4s,  #2
+        sshr            v18.4s,  v18.4s,  #2
+        sshr            v19.4s,  v19.4s,  #2
+
+        iwht4
+
+        st1             {v30.4s, v31.4s}, [x2], #32 // clear the remaining 32 bytes
+        transpose_4x4s  v16, v17, v18, v19, v20, v21, v22, v23
+
+        iwht4
+
+        ld1             {v0.d}[0], [x0], x1 // destination rows are loaded while the results are narrowed to 16 bit
+        sqxtn           v16.4h,  v16.4s
+        ld1             {v0.d}[1], [x0], x1
+        sqxtn2          v16.8h,  v17.4s // v16 now holds result rows 0 and 1
+        ld1             {v1.d}[0], [x0], x1
+        sqxtn           v18.4h,  v18.4s
+        ld1             {v1.d}[1], [x0], x1
+        sqxtn2          v18.8h,  v19.4s // v18 now holds result rows 2 and 3
+
+        b               L(itx_4x4_end) // label lives in inv_txfm_add_4x4_neon; no final srshr on this path
+endfunc
+
+function inv_txfm_add_4x4_neon // shared 4x4 body: first pass via x4, second pass via x5, add to the pixels at x0, return via x15
+        movi            v30.4s,  #0 // zero vector: clears coefficients and is the lower clamp bound at the end
+        movi            v31.4s,  #0
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] // all sixteen 32-bit coefficients
+        st1             {v30.4s, v31.4s}, [x2], #32 // clear the first 32 bytes of the coefficient buffer
+
+        blr             x4 // first pass on 32-bit lanes; clobbers x30, hence the return through x15
+
+        st1             {v30.4s, v31.4s}, [x2], #32 // clear the remaining 32 bytes
+        sqxtn           v16.4h,  v16.4s // saturating narrow to 16 bit for the second pass
+        sqxtn           v17.4h,  v17.4s
+        sqxtn           v18.4h,  v18.4s
+        sqxtn           v19.4h,  v19.4s
+        transpose_4x4h  v16, v17, v18, v19, v20, v21, v22, v23
+
+        blr             x5 // second pass on 16-bit lanes
+
+        ld1             {v0.d}[0], [x0], x1 // four destination rows into v0 and v1
+        ld1             {v0.d}[1], [x0], x1
+        ins             v16.d[1], v17.d[0] // pack result rows 0/1 into v16 and 2/3 into v18
+        ins             v18.d[1], v19.d[0]
+        ld1             {v1.d}[0], [x0], x1
+        ld1             {v1.d}[1], [x0], x1
+        srshr           v16.8h,  v16.8h,  #4 // final rounding shift by 4
+        srshr           v18.8h,  v18.8h,  #4
+
+L(itx_4x4_end): // also entered from the wht and dc-only paths with v0/v1 = pixels, v16/v18 = residual, v30 = 0
+        mvni            v31.8h,  #0xfc, lsl #8 // 0x3ff
+        sub             x0,  x0,  x1, lsl #2 // rewind x0 by four rows for the stores
+        sqadd           v16.8h,  v16.8h,  v0.8h
+        sqadd           v18.8h,  v18.8h,  v1.8h
+        smax            v16.8h,  v16.8h,  v30.8h // v30 is expected to still be zero here
+        smax            v18.8h,  v18.8h,  v30.8h
+        smin            v16.8h,  v16.8h,  v31.8h
+        st1             {v16.d}[0], [x0], x1
+        smin            v18.8h,  v18.8h,  v31.8h
+        st1             {v16.d}[1], [x0], x1
+        st1             {v18.d}[0], [x0], x1
+        st1             {v18.d}[1], [x0], x1
+
+        br              x15 // return address saved by the exported entry points
+endfunc
+
+.macro def_fn_4x4 txfm1, txfm2 // emit the exported 4x4 entry point for one first-pass/second-pass transform pair
+function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_16bpc_neon, export=1
+        mov             x15, x30 // inv_txfm_add_4x4_neon returns with br x15
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        cbnz            w3,  1f // w3 != 0: take the full transform below (w3 is presumably the eob)
+        movz            w16, #2896*8, lsl #16 // 2896*8 in the upper halfword, so sqrdmulh by it scales by 2896/4096
+        ld1r            {v16.4s}, [x2] // broadcast the first 32-bit coefficient
+        dup             v4.2s,   w16
+        str             wzr, [x2] // clear that coefficient in memory
+        sqrdmulh        v16.4s,  v16.4s,  v4.s[0]
+        ld1             {v0.d}[0], [x0], x1 // destination rows are loaded in between the arithmetic
+        sqxtn           v20.4h,  v16.4s
+        sqxtn2          v20.8h,  v16.4s
+        ld1             {v0.d}[1], [x0], x1
+        sqrdmulh        v20.8h,  v20.8h,  v4.h[1] // second 2896/4096 scaling, on 16-bit lanes
+        ld1             {v1.d}[0], [x0], x1
+        srshr           v16.8h,  v20.8h,  #4 // same value for rows 0/1 (v16) and rows 2/3 (v18)
+        ld1             {v1.d}[1], [x0], x1
+        srshr           v18.8h,  v20.8h,  #4
+        movi            v30.8h,  #0 // lower clamp bound expected at L(itx_4x4_end)
+        b               L(itx_4x4_end)
+1:
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x4_neon // first pass, 32-bit lanes
+        movrel          x5,  X(inv_\txfm2\()_4h_x4_neon) // second pass, 16-bit lanes; referenced through X(), presumably defined in another file
+        b               inv_txfm_add_4x4_neon
+endfunc
+.endm
+
+def_fn_4x4 dct, dct // the only pair that gets the w3 == 0 shortcut
+def_fn_4x4 identity, identity
+def_fn_4x4 dct, adst
+def_fn_4x4 dct, flipadst
+def_fn_4x4 dct, identity
+def_fn_4x4 adst, dct
+def_fn_4x4 adst, adst
+def_fn_4x4 adst, flipadst
+def_fn_4x4 flipadst, dct
+def_fn_4x4 flipadst, adst
+def_fn_4x4 flipadst, flipadst
+def_fn_4x4 identity, dct
+
+def_fn_4x4 adst, identity
+def_fn_4x4 flipadst, identity
+def_fn_4x4 identity, adst
+def_fn_4x4 identity, flipadst
+
+.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7 // 8-point inverse DCT in place on r0-r7 (32-bit lanes); coefficients in v0 and v1
+        idct_4          \r0, \r2, \r4, \r6 // the even-indexed inputs go through the 4-point transform
+
+        mul_mls         v2,  \r1, \r7, v1.s[0], v1.s[1]  // -> t4a
+        mul_mla         v4,  \r1, \r7, v1.s[1], v1.s[0]  // -> t7a
+        mul_mls         v6,  \r5, \r3, v1.s[2], v1.s[3]  // -> t5a
+        mul_mla         v7,  \r5, \r3, v1.s[3], v1.s[2]  // -> t6a
+        srshr           \r1\().4s, v2.4s,  #12           // t4a
+        srshr           \r7\().4s, v4.4s,  #12           // t7a
+        srshr           \r3\().4s, v6.4s,  #12           // t5a
+        srshr           \r5\().4s, v7.4s,  #12           // t6a
+
+        sqadd           v2.4s,     \r1\().4s,  \r3\().4s // t4
+        sqsub           \r1\().4s, \r1\().4s,  \r3\().4s // t5a
+        sqadd           v3.4s,     \r7\().4s,  \r5\().4s // t7
+        sqsub           \r3\().4s, \r7\().4s,  \r5\().4s // t6a
+
+        mul_mls         v4,  \r3, \r1, v0.s[0], v0.s[0]  // -> t5
+        mul_mla         v6,  \r3, \r1, v0.s[0], v0.s[0]  // -> t6
+        srshr           v4.4s,  v4.4s,  #12              // t5
+        srshr           v5.4s,  v6.4s,  #12              // t6
+
+        sqsub           \r7\().4s,  \r0\().4s,  v3.4s    // out7
+        sqadd           \r0\().4s,  \r0\().4s,  v3.4s    // out0
+        sqadd           \r1\().4s,  \r2\().4s,  v5.4s    // out1
+        sqsub           v6.4s,      \r2\().4s,  v5.4s    // out6, parked in v6 because r6 is still read below
+        sqadd           \r2\().4s,  \r4\().4s,  v4.4s    // out2
+        sqsub           \r5\().4s,  \r4\().4s,  v4.4s    // out5
+        sqadd           \r3\().4s,  \r6\().4s,  v2.4s    // out3
+        sqsub           \r4\().4s,  \r6\().4s,  v2.4s    // out4
+        mov             \r6\().16b, v6.16b               // out6
+.endm
+
+function inv_dct_4s_x8_neon // 8-point inverse DCT on v16-v23 (32-bit lanes), in place
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s, v1.4s}, [x16] // the eight 32-bit coefficients idct_8 reads from v0 and v1
+        idct_8          v16, v17, v18, v19, v20, v21, v22, v23
+        ret
+endfunc
+
+.macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7 // 8-point inverse ADST: inputs are always v16-v23 (32-bit lanes), outputs go to o0-o7
+        movrel          x16, iadst8_coeffs
+        ld1             {v0.4s, v1.4s}, [x16], #32 // first eight coefficients; x16 advances to the next group
+
+        mul_mla         v2,  v23, v16, v0.s[0], v0.s[1]
+        mul_mls         v4,  v23, v16, v0.s[1], v0.s[0]
+        mul_mla         v6,  v21, v18, v0.s[2], v0.s[3]
+        srshr           v16.4s, v2.4s,  #12  // t0a
+        srshr           v23.4s, v4.4s,  #12  // t1a
+        mul_mls         v2,  v21, v18, v0.s[3], v0.s[2]
+        mul_mla         v4,  v19, v20, v1.s[0], v1.s[1]
+        srshr           v18.4s, v6.4s,  #12  // t2a
+        srshr           v21.4s, v2.4s,  #12  // t3a
+        mul_mls         v6,  v19, v20, v1.s[1], v1.s[0]
+        mul_mla         v2,  v17, v22, v1.s[2], v1.s[3]
+        srshr           v20.4s, v4.4s,  #12  // t4a
+        srshr           v19.4s, v6.4s,  #12  // t5a
+        mul_mls         v4,  v17, v22, v1.s[3], v1.s[2]
+        srshr           v22.4s, v2.4s,  #12  // t6a
+        srshr           v17.4s, v4.4s,  #12  // t7a
+
+        ld1             {v0.4s}, [x16] // next four coefficients replace v0
+
+        sqadd           v2.4s,   v16.4s,  v20.4s // t0
+        sqsub           v3.4s,   v16.4s,  v20.4s // t4
+        sqadd           v4.4s,   v23.4s,  v19.4s // t1
+        sqsub           v5.4s,   v23.4s,  v19.4s // t5
+        sqadd           v6.4s,   v18.4s,  v22.4s // t2
+        sqsub           v7.4s,   v18.4s,  v22.4s // t6
+        sqadd           v18.4s,  v21.4s,  v17.4s // t3
+        sqsub           v19.4s,  v21.4s,  v17.4s // t7
+
+        mul_mla         v16, v3,  v5,  v0.s[3], v0.s[2]
+        mul_mls         v20, v3,  v5,  v0.s[2], v0.s[3]
+        mul_mls         v22, v19, v7,  v0.s[3], v0.s[2]
+
+        srshr           v3.4s,  v16.4s, #12  // t4a
+        srshr           v5.4s,  v20.4s, #12  // t5a
+
+        mul_mla         v16, v19, v7,  v0.s[2], v0.s[3]
+
+        srshr           v7.4s,  v22.4s, #12  // t6a
+        srshr           v19.4s, v16.4s, #12  // t7a
+
+        sqadd           \o0\().4s, v2.4s, v6.4s  // out0
+        sqsub           v2.4s,     v2.4s, v6.4s  // t2
+        sqadd           \o7\().4s, v4.4s, v18.4s // out7
+        sqsub           v4.4s,     v4.4s, v18.4s // t3
+        sqneg           \o7\().4s, \o7\().4s     // out7
+
+        sqadd           \o1\().4s, v3.4s, v7.4s  // out1
+        sqsub           v3.4s,     v3.4s, v7.4s  // t6
+        sqadd           \o6\().4s, v5.4s, v19.4s // out6
+        sqsub           v5.4s,     v5.4s, v19.4s // t7
+        sqneg           \o1\().4s, \o1\().4s     // out1
+
+        mul_mla         v18, v2,  v4,  v0.s[0], v0.s[0] // -> out3 (v19 or v20)
+        mul_mls         v6,  v2,  v4,  v0.s[0], v0.s[0] // -> out4 (v20 or v19)
+        mul_mls         v20, v3,  v5,  v0.s[0], v0.s[0] // -> out5 (v21 or v18)
+        srshr           v2.4s,  v18.4s, #12 // out3
+        mul_mla         v18, v3,  v5,  v0.s[0], v0.s[0] // -> out2 (v18 or v21)
+        srshr           v3.4s,  v20.4s, #12 // out5
+        srshr           \o2\().4s, v18.4s, #12 // out2 (v18 or v21)
+        srshr           \o4\().4s, v6.4s,  #12 // out4 (v20 or v19)
+
+        sqneg           \o3\().4s, v2.4s     // out3
+        sqneg           \o5\().4s, v3.4s     // out5
+.endm
+
+function inv_adst_4s_x8_neon // 8-point inverse ADST on v16-v23, outputs in the same register order
+        iadst_8         v16, v17, v18, v19, v20, v21, v22, v23
+        ret
+endfunc
+
+function inv_flipadst_4s_x8_neon // same computation as inv_adst_4s_x8_neon with the output registers reversed
+        iadst_8         v23, v22, v21, v20, v19, v18, v17, v16
+        ret
+endfunc
+
+function inv_identity_4s_x8_neon // double v16-v23 (32-bit lanes) in place, with saturation
+        sqshl           v16.4s,  v16.4s,  #1 // saturating shift left by 1, i.e. times two
+        sqshl           v17.4s,  v17.4s,  #1
+        sqshl           v18.4s,  v18.4s,  #1
+        sqshl           v19.4s,  v19.4s,  #1
+        sqshl           v20.4s,  v20.4s,  #1
+        sqshl           v21.4s,  v21.4s,  #1
+        sqshl           v22.4s,  v22.4s,  #1
+        sqshl           v23.4s,  v23.4s,  #1
+        ret
+endfunc
+
+function inv_txfm_add_8x8_neon // shared 8x8 body: first pass via x4 in two halves, second pass via x5, add to the pixels at x0, return via x15
+        movi            v31.4s,  #0 // zero vector used to clear coefficients as they are read
+
+        cmp             w3,  w13 // w13 holds the eob_half threshold set by the entry point
+        mov             x11, #32 // step between consecutive loads of one half
+        b.lt            1f // below the threshold: skip the first pass for the half at offset 16
+
+        add             x6,  x2,  #16 // this half starts 16 bytes into the coefficient buffer
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},     [x6] // read four coefficients ...
+        st1             {v31.4s}, [x6], x11 // ... clear them and step by 32 bytes
+.endr
+
+        blr             x4 // first pass on 32-bit lanes; clobbers x30, hence the return through x15
+
+        sqrshrn         v24.4h,  v16.4s,  #1 // rounding narrow by 1 into v24-v27, two 32-bit registers per 16-bit register
+        sqrshrn         v25.4h,  v17.4s,  #1
+        sqrshrn         v26.4h,  v18.4s,  #1
+        sqrshrn         v27.4h,  v19.4s,  #1
+        sqrshrn2        v24.8h,  v20.4s,  #1
+        sqrshrn2        v25.8h,  v21.4s,  #1
+        sqrshrn2        v26.8h,  v22.4s,  #1
+        sqrshrn2        v27.8h,  v23.4s,  #1
+
+        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+        movi            \i,  #0 // skipped half: its first-pass output is all zero
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},     [x2] // the half at offset 0 is always processed
+        st1             {v31.4s}, [x2], x11
+.endr
+
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1 // same narrowing as above, this time into v16-v19
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+
+        transpose_4x8h  v16, v17, v18, v19, v20, v21, v22, v23
+
+        mov             v20.16b, v24.16b // bring the other half alongside, so v16-v23 hold the full second-pass input
+        mov             v21.16b, v25.16b
+        mov             v22.16b, v26.16b
+        mov             v23.16b, v27.16b
+
+        blr             x5 // second pass on 16-bit lanes
+
+        load_add_store_8x8 x0, x7 // x7 is used as the read pointer
+        br              x15 // return address saved by the exported entry points
+endfunc
+
+.macro def_fn_8x8 txfm1, txfm2, eob_half // emit the exported 8x8 entry point for one transform pair; eob_half is the threshold compared against w3
+function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_16bpc_neon, export=1
+        mov             x15, x30 // inv_txfm_add_8x8_neon returns with br x15
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         8,   8,   1 // w3 == 0 shortcut; falls through when w3 != 0
+.endif
+        movrel          x5,  X(inv_\txfm2\()_8h_x8_neon) // second pass, 16-bit lanes; referenced through X(), presumably defined in another file
+        mov             w13, #\eob_half
+        adr             x4,  inv_\txfm1\()_4s_x8_neon // first pass, 32-bit lanes
+        b               inv_txfm_add_8x8_neon
+endfunc
+.endm
+
+def_fn_8x8 dct, dct, 10 // the only pair that gets the idct_dc shortcut
+def_fn_8x8 identity, identity, 10
+def_fn_8x8 dct, adst, 10
+def_fn_8x8 dct, flipadst, 10
+def_fn_8x8 dct, identity, 4
+def_fn_8x8 adst, dct, 10
+def_fn_8x8 adst, adst, 10
+def_fn_8x8 adst, flipadst, 10
+def_fn_8x8 flipadst, dct, 10
+def_fn_8x8 flipadst, adst, 10
+def_fn_8x8 flipadst, flipadst, 10
+def_fn_8x8 identity, dct, 4
+def_fn_8x8 adst, identity, 4
+def_fn_8x8 flipadst, identity, 4
+def_fn_8x8 identity, adst, 4
+def_fn_8x8 identity, flipadst, 4
+
+function inv_txfm_add_8x4_neon // shared 8x4 body: scale, first pass via x4, second pass via x5, add to the pixels at x0, return via x15
+        movi            v28.4s,  #0 // four zero vectors used to clear 64 bytes of coefficients per store
+        movi            v29.4s,  #0
+        movi            v30.4s,  #0
+        movi            v31.4s,  #0
+        ld1             {v16.4s,v17.4s,v18.4s,v19.4s}, [x2] // first 64 bytes of coefficients
+        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2], #64
+        movz            w16, #2896*8, lsl #16 // 2896*8 in the upper halfword, the scale_input factor
+        dup             v0.2s,   w16
+        ld1             {v20.4s,v21.4s,v22.4s,v23.4s}, [x2] // second 64 bytes of coefficients
+        st1             {v28.4s,v29.4s,v30.4s,v31.4s}, [x2]
+
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23 // NOTE(review): scale_input is defined outside this section; presumably sqrdmulh by v0.s[0]
+
+        blr             x4 // first pass on 32-bit lanes; clobbers x30, hence the return through x15
+
+        sqxtn           v16.4h,  v16.4s // saturating narrow to 16 bit, no shift
+        sqxtn           v17.4h,  v17.4s
+        sqxtn           v18.4h,  v18.4s
+        sqxtn           v19.4h,  v19.4s
+        sqxtn           v20.4h,  v20.4s
+        sqxtn           v21.4h,  v21.4s
+        sqxtn           v22.4h,  v22.4s
+        sqxtn           v23.4h,  v23.4s
+
+        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+        ins             v16.d[1], v20.d[0] // join the two transposed 4x4 halves into four 8-halfword registers
+        ins             v17.d[1], v21.d[0]
+        ins             v18.d[1], v22.d[0]
+        ins             v19.d[1], v23.d[0]
+
+        blr             x5 // second pass on 16-bit lanes
+
+        load_add_store_8x4 x0, x7 // x7 is used as the read pointer
+        br              x15 // return address saved by the exported entry points
+endfunc
+
+function inv_txfm_add_4x8_neon // shared 4x8 body: first pass via x4 in two halves, second pass via x5, add to the pixels at x0, return via x15
+        movz            w16, #2896*8, lsl #16 // 2896*8 in the upper halfword, the scale_input factor
+        movi            v31.4s,  #0 // zero vector used to clear coefficients as they are read
+        dup             v30.2s,  w16
+
+        cmp             w3,  w13 // w13 holds the eob_half threshold set by the entry point
+        mov             x11, #32 // step between consecutive loads of one half
+        b.lt            1f // below the threshold: skip the first pass for the half at offset 16
+
+        add             x6,  x2,  #16 // this half starts 16 bytes into the coefficient buffer
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},     [x6] // read four coefficients ...
+        st1             {v31.4s}, [x6], x11 // ... clear them and step by 32 bytes
+.endr
+        scale_input     .4s, v30.s[0], v16, v17, v18, v19 // NOTE(review): scale_input is defined outside this section; presumably sqrdmulh by v30.s[0]
+        blr             x4 // first pass on 32-bit lanes; clobbers x30, hence the return through x15
+        sqxtn           v20.4h,  v16.4s // narrow this half into v20-v23
+        sqxtn           v21.4h,  v17.4s
+        sqxtn           v22.4h,  v18.4s
+        sqxtn           v23.4h,  v19.4s
+        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+
+        b               2f
+
+1:
+.irp i, v20, v21, v22, v23
+        movi            \i\().4h, #0 // skipped half: its first-pass output is all zero
+.endr
+
+2:
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},     [x2] // the half at offset 0 is always processed
+        st1             {v31.4s}, [x2], x11
+.endr
+        scale_input     .4s, v30.s[0], v16, v17, v18, v19
+        blr             x4
+        sqxtn           v16.4h,  v16.4s // narrow in place; v16-v23 now hold the full second-pass input
+        sqxtn           v17.4h,  v17.4s
+        sqxtn           v18.4h,  v18.4s
+        sqxtn           v19.4h,  v19.4s
+        transpose_4x4h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+
+        blr             x5 // second pass on 16-bit lanes
+
+        load_add_store_4x8 x0, x7 // x7 is used as the read pointer
+        br              x15 // return address saved by the exported entry points
+endfunc
+
+.macro def_fn_48 w, h, txfm1, txfm2, eob_half // emit the exported 4x8 or 8x4 entry point for one transform pair
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+        mov             x15, x30 // the shared bodies return with br x15
+
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  0 // w3 == 0 shortcut; falls through when w3 != 0
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon // first pass, 32-bit lanes
+.if \w == 4
+        mov             w13, #\eob_half // only inv_txfm_add_4x8_neon compares w3 against w13
+.endif
+        movrel          x5,  X(inv_\txfm2\()_\w\()h_x\h\()_neon) // second pass, 16-bit lanes; referenced through X(), presumably defined in another file
+        b               inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_48 w, h // instantiate def_fn_48 for all sixteen transform pairs of one block size
+def_fn_48 \w, \h, dct, dct, 13 // last argument is eob_half, which def_fn_48 uses only when w == 4
+def_fn_48 \w, \h, identity, identity, 13
+def_fn_48 \w, \h, dct, adst, 13
+def_fn_48 \w, \h, dct, flipadst, 13
+def_fn_48 \w, \h, dct, identity, 4
+def_fn_48 \w, \h, adst, dct, 13
+def_fn_48 \w, \h, adst, adst, 13
+def_fn_48 \w, \h, adst, flipadst, 13
+def_fn_48 \w, \h, flipadst, dct, 13
+def_fn_48 \w, \h, flipadst, adst, 13
+def_fn_48 \w, \h, flipadst, flipadst, 13
+def_fn_48 \w, \h, identity, dct, 16
+def_fn_48 \w, \h, adst, identity, 4
+def_fn_48 \w, \h, flipadst, identity, 4
+def_fn_48 \w, \h, identity, adst, 16
+def_fn_48 \w, \h, identity, flipadst, 16
+.endm
+
+def_fns_48 4, 8 // 4x8 entry points, which branch to inv_txfm_add_4x8_neon
+def_fns_48 8, 4 // 8x4 entry points, which branch to inv_txfm_add_8x4_neon
+
+
+function inv_dct_4s_x16_neon // 16-point inverse DCT over v16-v31 (4 x 32-bit lanes each), out0..out15 are left in v16..v31 in order. Uses x16 and v0-v7 as scratch, plus whatever idct_8 uses.
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s, v1.4s}, [x16], #32 // first 32 bytes of idct_coeffs, x16 advances past them
+
+        idct_8          v16, v18, v20, v22, v24, v26, v28, v30 // even-indexed inputs go through the idct_8 macro (defined elsewhere in this file)
+
+        ld1             {v0.4s, v1.4s}, [x16] // next 32 bytes of coefficients, used for the odd-indexed inputs below
+        sub             x16, x16, #32 // rewind x16 to the start of idct_coeffs
+
+        mul_mls         v2,  v17, v31, v0.s[0], v0.s[1] // -> t8a
+        mul_mla         v4,  v17, v31, v0.s[1], v0.s[0] // -> t15a
+        mul_mls         v6,  v25, v23, v0.s[2], v0.s[3] // -> t9a
+        srshr           v17.4s, v2.4s,  #12             // t8a
+        srshr           v31.4s, v4.4s,  #12             // t15a
+        mul_mla         v2,  v25, v23, v0.s[3], v0.s[2] // -> t14a
+        mul_mls         v4,  v21, v27, v1.s[0], v1.s[1] // -> t10a
+        srshr           v23.4s, v6.4s,  #12             // t9a
+        srshr           v25.4s, v2.4s,  #12             // t14a
+        mul_mla         v6,  v21, v27, v1.s[1], v1.s[0] // -> t13a
+        mul_mls         v2,  v29, v19, v1.s[2], v1.s[3] // -> t11a
+        srshr           v21.4s, v4.4s,  #12             // t10a
+        srshr           v27.4s, v6.4s,  #12             // t13a
+        mul_mla         v4,  v29, v19, v1.s[3], v1.s[2] // -> t12a
+        srshr           v19.4s, v2.4s,  #12             // t11a
+        srshr           v29.4s, v4.4s,  #12             // t12a
+
+        ld1             {v0.4s}, [x16] // reload the first four idct coefficients for the remaining stages
+
+        sqsub           v2.4s,   v17.4s,  v23.4s  // t9
+        sqadd           v17.4s,  v17.4s,  v23.4s  // t8
+        sqsub           v3.4s,   v31.4s,  v25.4s  // t14
+        sqadd           v31.4s,  v31.4s,  v25.4s  // t15
+        sqsub           v23.4s,  v19.4s,  v21.4s  // t10
+        sqadd           v19.4s,  v19.4s,  v21.4s  // t11
+        sqadd           v25.4s,  v29.4s,  v27.4s  // t12
+        sqsub           v29.4s,  v29.4s,  v27.4s  // t13
+
+        mul_mls         v4,  v3,  v2,  v0.s[2], v0.s[3] // -> t9a
+        mul_mla         v6,  v3,  v2,  v0.s[3], v0.s[2] // -> t14a
+        srshr           v21.4s, v4.4s,  #12             // t9a
+        srshr           v27.4s, v6.4s,  #12             // t14a
+
+        mul_mls         v4,  v29, v23, v0.s[2], v0.s[3] // -> t13a
+        mul_mla         v6,  v29, v23, v0.s[3], v0.s[2] // -> t10a
+        srshr           v29.4s, v4.4s,  #12             // t13a
+        neg             v6.4s,   v6.4s // t10a is the negated sum, negate before the rounding shift
+        srshr           v23.4s, v6.4s,  #12             // t10a
+
+        sqsub           v2.4s,   v17.4s,  v19.4s  // t11a
+        sqadd           v17.4s,  v17.4s,  v19.4s  // t8a
+        sqsub           v3.4s,   v31.4s,  v25.4s  // t12a
+        sqadd           v31.4s,  v31.4s,  v25.4s  // t15a
+        sqadd           v19.4s,  v21.4s,  v23.4s  // t9
+        sqsub           v21.4s,  v21.4s,  v23.4s  // t10
+        sqsub           v25.4s,  v27.4s,  v29.4s  // t13
+        sqadd           v27.4s,  v27.4s,  v29.4s  // t14
+
+        mul_mls         v4,  v3,  v2,  v0.s[0], v0.s[0] // -> t11
+        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t12
+        mul_mls         v2,  v25, v21, v0.s[0], v0.s[0] // -> t10a
+
+        srshr           v4.4s,  v4.4s,  #12   // t11
+        srshr           v5.4s,  v6.4s,  #12   // t12
+        mul_mla         v6,  v25, v21, v0.s[0], v0.s[0] // -> t13a
+        srshr           v2.4s,  v2.4s,  #12   // t10a
+        srshr           v3.4s,  v6.4s,  #12   // t13a
+
+        sqadd           v6.4s,   v16.4s,  v31.4s  // out0
+        sqsub           v31.4s,  v16.4s,  v31.4s  // out15
+        mov             v16.16b, v6.16b // out0 was built in v6 because v16 was still an input to out15
+        sqadd           v23.4s,  v30.4s,  v17.4s  // out7
+        sqsub           v7.4s,   v30.4s,  v17.4s  // out8
+        sqadd           v17.4s,  v18.4s,  v27.4s  // out1
+        sqsub           v30.4s,  v18.4s,  v27.4s  // out14
+        sqadd           v18.4s,  v20.4s,  v3.4s   // out2
+        sqsub           v29.4s,  v20.4s,  v3.4s   // out13
+        sqadd           v3.4s,   v28.4s,  v19.4s  // out6
+        sqsub           v25.4s,  v28.4s,  v19.4s  // out9
+        sqadd           v19.4s,  v22.4s,  v5.4s   // out3
+        sqsub           v28.4s,  v22.4s,  v5.4s   // out12
+        sqadd           v20.4s,  v24.4s,  v4.4s   // out4
+        sqsub           v27.4s,  v24.4s,  v4.4s   // out11
+        sqadd           v21.4s,  v26.4s,  v2.4s   // out5
+        sqsub           v26.4s,  v26.4s,  v2.4s   // out10
+        mov             v24.16b, v7.16b // out8 parked in v7 until v24 had been consumed by out4 and out11
+        mov             v22.16b, v3.16b // out6 parked in v3 until v22 had been consumed by out3 and out12
+
+        ret
+endfunc
+
+.macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15 // 16-point inverse ADST on v16-v31 (4 x 32-bit lanes each). o0..o15 name the registers receiving out0..out15. Uses x16 and v0-v7 as scratch.
+        movrel          x16, iadst16_coeffs
+        ld1             {v0.4s, v1.4s}, [x16], #32 // first eight iadst16 coefficients
+
+        mul_mla         v2,  v31, v16, v0.s[0], v0.s[1] // -> t0
+        mul_mls         v4,  v31, v16, v0.s[1], v0.s[0] // -> t1
+        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t2
+        srshr           v16.4s, v2.4s,  #12             // t0
+        srshr           v31.4s, v4.4s,  #12             // t1
+        mul_mls         v2,  v29, v18, v0.s[3], v0.s[2] // -> t3
+        mul_mla         v4,  v27, v20, v1.s[0], v1.s[1] // -> t4
+        srshr           v18.4s, v6.4s,  #12             // t2
+        srshr           v29.4s, v2.4s,  #12             // t3
+        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t5
+        mul_mla         v2,  v25, v22, v1.s[2], v1.s[3] // -> t6
+        srshr           v20.4s, v4.4s,  #12             // t4
+        srshr           v27.4s, v6.4s,  #12             // t5
+        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t7
+        ld1             {v0.4s, v1.4s}, [x16] // remaining eight iadst16 coefficients
+        movrel          x16, idct_coeffs // the later stages take their constants from idct_coeffs
+        mul_mla         v6,  v23, v24, v0.s[0], v0.s[1] // -> t8
+        srshr           v22.4s, v2.4s,  #12             // t6
+        srshr           v25.4s, v4.4s,  #12             // t7
+        mul_mls         v2,  v23, v24, v0.s[1], v0.s[0] // -> t9
+        mul_mla         v4,  v21, v26, v0.s[2], v0.s[3] // -> t10
+        srshr           v23.4s, v6.4s,  #12             // t8
+        srshr           v24.4s, v2.4s,  #12             // t9
+        mul_mls         v6,  v21, v26, v0.s[3], v0.s[2] // -> t11
+        mul_mla         v2,  v19, v28, v1.s[0], v1.s[1] // -> t12
+        srshr           v21.4s, v4.4s,  #12             // t10
+        srshr           v26.4s, v6.4s,  #12             // t11
+        mul_mls         v4,  v19, v28, v1.s[1], v1.s[0] // -> t13
+        mul_mla         v6,  v17, v30, v1.s[2], v1.s[3] // -> t14
+        srshr           v19.4s, v2.4s,  #12             // t12
+        srshr           v28.4s, v4.4s,  #12             // t13
+        mul_mls         v2,  v17, v30, v1.s[3], v1.s[2] // -> t15
+        srshr           v17.4s, v6.4s,  #12             // t14
+        srshr           v30.4s, v2.4s,  #12             // t15
+
+        ld1             {v0.4s, v1.4s}, [x16] // first eight idct coefficients
+
+        sqsub           v2.4s,   v16.4s,  v23.4s // t8a
+        sqadd           v16.4s,  v16.4s,  v23.4s // t0a
+        sqsub           v3.4s,   v31.4s,  v24.4s // t9a
+        sqadd           v31.4s,  v31.4s,  v24.4s // t1a
+        sqadd           v23.4s,  v18.4s,  v21.4s // t2a
+        sqsub           v18.4s,  v18.4s,  v21.4s // t10a
+        sqadd           v24.4s,  v29.4s,  v26.4s // t3a
+        sqsub           v29.4s,  v29.4s,  v26.4s // t11a
+        sqadd           v21.4s,  v20.4s,  v19.4s // t4a
+        sqsub           v20.4s,  v20.4s,  v19.4s // t12a
+        sqadd           v26.4s,  v27.4s,  v28.4s // t5a
+        sqsub           v27.4s,  v27.4s,  v28.4s // t13a
+        sqadd           v19.4s,  v22.4s,  v17.4s // t6a
+        sqsub           v22.4s,  v22.4s,  v17.4s // t14a
+        sqadd           v28.4s,  v25.4s,  v30.4s // t7a
+        sqsub           v25.4s,  v25.4s,  v30.4s // t15a
+
+        mul_mla         v4,  v2,  v3,  v1.s[1], v1.s[0] // -> t8
+        mul_mls         v6,  v2,  v3,  v1.s[0], v1.s[1] // -> t9
+        mul_mla         v2,  v18, v29, v1.s[3], v1.s[2] // -> t10
+        srshr           v17.4s, v4.4s,  #12             // t8
+        srshr           v30.4s, v6.4s,  #12             // t9
+        mul_mls         v4,  v18, v29, v1.s[2], v1.s[3] // -> t11
+        mul_mls         v6,  v27, v20, v1.s[1], v1.s[0] // -> t12
+        srshr           v18.4s, v2.4s,  #12             // t10
+        srshr           v29.4s, v4.4s,  #12             // t11
+        mul_mla         v2,  v27, v20, v1.s[0], v1.s[1] // -> t13
+        mul_mls         v4,  v25, v22, v1.s[3], v1.s[2] // -> t14
+        srshr           v27.4s, v6.4s,  #12             // t12
+        srshr           v20.4s, v2.4s,  #12             // t13
+        mul_mla         v6,  v25, v22, v1.s[2], v1.s[3] // -> t15
+        srshr           v25.4s, v4.4s,  #12             // t14
+        srshr           v22.4s, v6.4s,  #12             // t15
+
+        sqsub           v2.4s,   v16.4s,  v21.4s // t4
+        sqadd           v16.4s,  v16.4s,  v21.4s // t0
+        sqsub           v3.4s,   v31.4s,  v26.4s // t5
+        sqadd           v31.4s,  v31.4s,  v26.4s // t1
+        sqadd           v21.4s,  v23.4s,  v19.4s // t2
+        sqsub           v23.4s,  v23.4s,  v19.4s // t6
+        sqadd           v26.4s,  v24.4s,  v28.4s // t3
+        sqsub           v24.4s,  v24.4s,  v28.4s // t7
+        sqadd           v19.4s,  v17.4s,  v27.4s // t8a
+        sqsub           v17.4s,  v17.4s,  v27.4s // t12a
+        sqadd           v28.4s,  v30.4s,  v20.4s // t9a
+        sqsub           v30.4s,  v30.4s,  v20.4s // t13a
+        sqadd           v27.4s,  v18.4s,  v25.4s // t10a
+        sqsub           v18.4s,  v18.4s,  v25.4s // t14a
+        sqadd           v20.4s,  v29.4s,  v22.4s // t11a
+        sqsub           v29.4s,  v29.4s,  v22.4s // t15a
+
+        mul_mla         v4,  v2,  v3,  v0.s[3], v0.s[2] // -> t4a
+        mul_mls         v6,  v2,  v3,  v0.s[2], v0.s[3] // -> t5a
+        mul_mls         v2,  v24, v23, v0.s[3], v0.s[2] // -> t6a
+        srshr           v22.4s, v4.4s,  #12             // t4a
+        srshr           v25.4s, v6.4s,  #12             // t5a
+        mul_mla         v4,  v24, v23, v0.s[2], v0.s[3] // -> t7a
+        mul_mla         v6,  v17, v30, v0.s[3], v0.s[2] // -> t12
+        srshr           v24.4s, v2.4s,  #12             // t6a
+        srshr           v23.4s, v4.4s,  #12             // t7a
+        mul_mls         v2,  v17, v30, v0.s[2], v0.s[3] // -> t13
+        mul_mls         v4,  v29, v18, v0.s[3], v0.s[2] // -> t14
+        srshr           v17.4s, v6.4s,  #12             // t12
+        mul_mla         v6,  v29, v18, v0.s[2], v0.s[3] // -> t15
+        srshr           v29.4s, v2.4s,  #12             // t13
+        srshr           v30.4s, v4.4s,  #12             // t14
+        srshr           v18.4s, v6.4s,  #12             // t15
+
+        sqsub           v2.4s,   v16.4s,  v21.4s // t2a
+.ifc \o0, v16
+        sqadd           \o0\().4s,  v16.4s,  v21.4s // out0
+        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
+        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
+.else
+        sqadd           v4.4s,      v16.4s,  v21.4s // out0
+        sqsub           v21.4s,     v31.4s,  v26.4s // t3a
+        sqadd           \o15\().4s, v31.4s,  v26.4s // out15
+        mov             \o0\().16b, v4.16b // out0 went through v4 because o0 is v31 in the flipped instantiation, and v31 is read by the two instructions above
+.endif
+        sqneg           \o15\().4s, \o15\().4s      // out15
+
+        sqsub           v3.4s,      v29.4s,  v18.4s // t15a
+        sqadd           \o13\().4s, v29.4s,  v18.4s // out13
+        sqadd           \o2\().4s,  v17.4s,  v30.4s // out2
+        sqsub           v26.4s,     v17.4s,  v30.4s // t14a
+        sqneg           \o13\().4s, \o13\().4s      // out13
+
+        sqadd           \o1\().4s,  v19.4s,  v27.4s // out1
+        sqsub           v27.4s,     v19.4s,  v27.4s // t10
+        sqadd           \o14\().4s, v28.4s,  v20.4s // out14
+        sqsub           v20.4s,     v28.4s,  v20.4s // t11
+        sqneg           \o1\().4s,  \o1\().4s       // out1
+
+        sqadd           \o3\().4s,  v22.4s,  v24.4s // out3
+        sqsub           v22.4s,     v22.4s,  v24.4s // t6
+        sqadd           \o12\().4s, v25.4s,  v23.4s // out12
+        sqsub           v23.4s,     v25.4s,  v23.4s // t7
+        sqneg           \o3\().4s,  \o3\().4s       // out3
+
+        mul_mls         v24, v2,  v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
+        mul_mla         v4,  v2,  v21, v0.s[0], v0.s[0] // -> out7 (v23 or v24)
+        mul_mla         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out5 (v21 or v26)
+
+        srshr           v24.4s, v24.4s, #12             // out8
+        srshr           v4.4s,  v4.4s,  #12             // out7
+        srshr           v5.4s,  v6.4s,  #12             // out5
+        mul_mls         v6,  v26, v3,  v0.s[0], v0.s[0] // -> out10 (v26 or v21)
+        mul_mla         v2,  v22, v23, v0.s[0], v0.s[0] // -> out4 (v20 or v27)
+        srshr           v26.4s, v6.4s,  #12             // out10
+
+        mul_mls         v6,  v22, v23, v0.s[0], v0.s[0] // -> out11 (v27 or v20)
+        mul_mla         v22, v27, v20, v0.s[0], v0.s[0] // -> out6 (v22 or v25)
+        mul_mls         v21, v27, v20, v0.s[0], v0.s[0] // -> out9 (v25 or v22)
+
+        srshr           \o4\().4s,   v2.4s,  #12        // out4
+        srshr           v6.4s,       v6.4s,  #12        // out11
+        srshr           v7.4s,       v21.4s, #12        // out9
+        srshr           \o6\().4s,   v22.4s, #12        // out6
+
+.ifc \o8, v23
+        mov             \o8\().16b,  v24.16b // flipped order only: out8 was computed in v24, move it to o8
+        mov             \o10\().16b, v26.16b // flipped order only: out10 was computed in v26, move it to o10
+.endif
+
+        sqneg           \o7\().4s,   v4.4s // out7
+        sqneg           \o5\().4s,   v5.4s // out5
+        sqneg           \o11\().4s,  v6.4s // out11
+        sqneg           \o9\().4s,   v7.4s // out9
+.endm
+
+function inv_adst_4s_x16_neon // 16-point inverse ADST in place on v16-v31, outputs in ascending register order
+        iadst_16        v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+        ret
+endfunc
+
+function inv_flipadst_4s_x16_neon // same computation as inv_adst_4s_x16_neon, with out0..out15 written to v31..v16 (reversed order)
+        iadst_16        v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16
+        ret
+endfunc
+
+function inv_identity_4s_x16_neon // scales each 32-bit lane of v16-v31 in place by 2*5793/4096 with saturation. Uses w16, v0 and v2 as scratch.
+        movz            w16, #2*(5793-4096)*8, lsl #16 // fractional part of the scale as a sqrdmulh multiplier: sqrdmulh by this yields x*2*(5793-4096)/4096
+        dup             v0.2s,   w16
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        sqrdmulh        v2.4s,      v\i\().4s,  v0.s[0] // fractional contribution
+        sqadd           v\i\().4s,  v\i\().4s,  v\i\().4s // 2*x, saturating
+        sqadd           v\i\().4s,  v\i\().4s,  v2.4s // 2*x + fractional part
+.endr
+        ret
+endfunc
+
+.macro identity_4x16_shift1 c // for each of v16-v31 (4 x 32-bit lanes): x += rounding_shift_right(sqrdmulh(x, c), 1). c is a caller-supplied by-element multiplier operand. Clobbers v3.
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        sqrdmulh        v3.4s,   \i,      \c
+        srshr           v3.4s,   v3.4s,   #1 // halve the product with rounding
+        sqadd           \i,      \i,      v3.4s
+.endr
+.endm
+
+.macro identity_4x16 c // for each of v16-v31 (4 x 32-bit lanes): x = 2*x + sqrdmulh(x, c), all saturating. Same per-register sequence as inv_identity_4s_x16_neon, with the multiplier supplied by the caller. Clobbers v3.
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        sqrdmulh        v3.4s,   \i,      \c // product taken from the original x before it is doubled
+        sqadd           \i,      \i,      \i
+        sqadd           \i,      \i,      v3.4s
+.endr
+.endm
+
+.macro def_horz_16 scale=0, shift=2, suffix // Emits inv_txfm_horz<suffix>_16x4_neon: first pass over 16 input vectors of 4 x 32-bit lanes. In: x7 = input pointer, x8 = input stride in bytes, x4 = transform to call, x6 = output pointer. Out: 8 vectors of 8 x 16-bit written to x6.
+function inv_txfm_horz\suffix\()_16x4_neon
+        mov             x14, x30 // keep the return address, blr x4 below overwrites x30
+        movi            v7.4s,  #0
+.if \scale
+        movz            w16, #2896*8, lsl #16 // multiplier handed to scale_input (2896/4096 is about 1/sqrt(2))
+        dup             v0.2s,   w16
+.endif
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i}, [x7]
+        st1             {v7.4s}, [x7], x8 // zero the input just read, then step by the stride
+.endr
+.if \scale
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        blr             x4 // transform v16-v31 in place
+        sqrshrn         v16.4h,  v16.4s,  #\shift // narrow to 16 bit with rounding and saturation, outputs 0-3 in the low halves
+        sqrshrn         v17.4h,  v17.4s,  #\shift
+        sqrshrn         v18.4h,  v18.4s,  #\shift
+        sqrshrn         v19.4h,  v19.4s,  #\shift
+        sqrshrn2        v16.8h,  v20.4s,  #\shift // outputs 4-7 in the high halves
+        sqrshrn2        v17.8h,  v21.4s,  #\shift
+        sqrshrn2        v18.8h,  v22.4s,  #\shift
+        sqrshrn2        v19.8h,  v23.4s,  #\shift
+        sqrshrn         v20.4h,  v24.4s,  #\shift // outputs 8-11
+        sqrshrn         v21.4h,  v25.4s,  #\shift
+        sqrshrn         v22.4h,  v26.4s,  #\shift
+        sqrshrn         v23.4h,  v27.4s,  #\shift
+        sqrshrn2        v20.8h,  v28.4s,  #\shift // outputs 12-15
+        sqrshrn2        v21.8h,  v29.4s,  #\shift
+        sqrshrn2        v22.8h,  v30.4s,  #\shift
+        sqrshrn2        v23.8h,  v31.4s,  #\shift
+        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+        transpose_4x8h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+
+.irp i, v16.8h, v20.8h, v17.8h, v21.8h, v18.8h, v22.8h, v19.8h, v23.8h
+        st1             {\i}, [x6], #16
+.endr
+
+        ret             x14 // return via the saved link register, ret (not br) marks this as a function return for the branch predictor and needs no BTI landing pad
+endfunc
+.endm
+
+def_horz_16 scale=0, shift=2
+def_horz_16 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_8x16_neon // second pass for an 8-wide, 16-tall strip. In: x7 = intermediate buffer, x8 = its stride in bytes, x5 = transform to call, x6 = pointer handed to load_add_store_8x16 (presumably the destination - confirm).
+        mov             x14, x30 // keep the return address, blr x5 below overwrites x30
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        blr             x5 // transform v16-v31 (8 x 16-bit lanes each)
+        load_add_store_8x16 x6, x7
+        ret             x14 // return via the saved link register, ret (not br) marks this as a function return for the branch predictor and needs no BTI landing pad
+endfunc
+
+function inv_txfm_add_16x16_neon // 16x16 inverse transform + add. In: x0 = destination base, x2 = 32-bit coefficients, w3 = value compared against the thresholds (presumably eob - confirm), x4/x5 = first/second pass transforms, x13 = table of 16-bit thresholds.
+        mov             x15, x30 // keep the return address across the bl calls below
+        sub             sp,  sp,  #512 // scratch for the 16x16 16-bit intermediates
+        ldrh            w12, [x13], #2 // first threshold
+.irp i, 0, 4, 8, 12
+        add             x6,  sp,  #(\i*16*2) // output slot of this group of four in the scratch buffer
+.if \i > 0
+        mov             w8,  #(16 - \i) // remaining count, consumed by the zero-fill loop at 1:
+        cmp             w3,  w12
+        b.lt            1f // below the threshold: skip this and all later groups and zero their scratch instead
+.if \i < 12
+        ldrh            w12, [x13], #2 // threshold for the next group
+.endif
+.endif
+        add             x7,  x2,  #(\i*4) // input start for this group, 4 bytes per coefficient
+        mov             x8,  #16*4 // input stride in bytes
+        bl              inv_txfm_horz_16x4_neon
+.endr
+        b               3f
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4 // each iteration clears one skipped group (128 bytes)
+.rept 2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+3:
+.irp i, 0, 8
+        add             x6,  x0,  #(\i*2) // destination for this 8-wide half, 2 bytes per pixel
+        add             x7,  sp,  #(\i*2) // matching half of the scratch buffer
+        mov             x8,  #32 // scratch stride in bytes
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  sp,  #512
+        ret             x15 // return via the saved link register, ret (not br) marks this as a function return for the branch predictor and needs no BTI landing pad
+endfunc
+
+const eob_16x16 // thresholds read in order by inv_txfm_add_16x16_neon before the 2nd, 3rd and 4th groups of four. That function reads only the first three entries.
+        .short 10, 36, 78, 256
+endconst
+
+const eob_16x16_identity // variant chosen by def_fn_16x16 when exactly one of the two transforms is identity
+        .short 4, 8, 12, 256
+endconst
+
+.macro def_fn_16x16 txfm1, txfm2 // Emits the exported 16x16 16bpc entry point for one txfm1/txfm2 pairing: sets up x4, x5 and x13, then tail-calls inv_txfm_add_16x16_neon
+function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         16,  16,  2
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x16_neon // x4 = first-pass transform (32-bit lanes). NOTE(review): for dct_dct the idct_dc macro (defined elsewhere) runs first, presumably a DC-only shortcut - confirm
+        movrel          x5,  X(inv_\txfm2\()_8h_x16_neon) // x5 = second-pass transform (16-bit lanes), an external symbol
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel          x13, eob_16x16 // both identity: default table
+.else
+        movrel          x13, eob_16x16_identity // identity first pass only
+.endif
+.else
+.ifc \txfm2, identity
+        movrel          x13, eob_16x16_identity // identity second pass only
+.else
+        movrel          x13, eob_16x16 // neither is identity: default table
+.endif
+.endif
+        b               inv_txfm_add_16x16_neon
+endfunc
+.endm
+
+def_fn_16x16 dct, dct
+def_fn_16x16 identity, identity
+def_fn_16x16 dct, adst
+def_fn_16x16 dct, flipadst
+def_fn_16x16 dct, identity
+def_fn_16x16 adst, dct
+def_fn_16x16 adst, adst
+def_fn_16x16 adst, flipadst
+def_fn_16x16 flipadst, dct
+def_fn_16x16 flipadst, adst
+def_fn_16x16 flipadst, flipadst
+def_fn_16x16 identity, dct
+
+function inv_txfm_add_16x4_neon // 16x4 inverse transform + add. In: x0 = base pointer handed to load_add_store_8x4 (presumably the destination - confirm), x2 = 32-bit coefficients (zeroed as they are read), x4/x5 = first/second pass transforms.
+        mov             x15, x30 // keep the return address, the blr calls below overwrite x30
+        movi            v4.4s,  #0
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i},    [x2]
+        st1             {v4.4s}, [x2], #16 // zero the coefficients just read and advance
+.endr
+
+        blr             x4 // first pass on v16-v31 (32-bit lanes)
+
+        sqrshrn         v16.4h,  v16.4s,  #1 // narrow first-pass outputs 0-7 into v16-v19
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        blr             x5 // second pass on the left 8 columns
+        mov             x6,  x0
+        load_add_store_8x4 x6, x7
+
+        sqrshrn         v16.4h,  v24.4s,  #1 // narrow first-pass outputs 8-15, still held in v24-v31
+        sqrshrn         v17.4h,  v25.4s,  #1
+        sqrshrn         v18.4h,  v26.4s,  #1
+        sqrshrn         v19.4h,  v27.4s,  #1
+        sqrshrn2        v16.8h,  v28.4s,  #1
+        sqrshrn2        v17.8h,  v29.4s,  #1
+        sqrshrn2        v18.8h,  v30.4s,  #1
+        sqrshrn2        v19.8h,  v31.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        blr             x5 // second pass on the right 8 columns
+        add             x6,  x0,  #16 // 8 pixels of 2 bytes to the right
+        load_add_store_8x4 x6, x7
+
+        ret             x15 // return via the saved link register, ret (not br) marks this as a function return for the branch predictor and needs no BTI landing pad
+endfunc
+
+function inv_txfm_add_4x16_neon // 4x16 inverse transform + add. The first pass runs x4 on up to four groups of four input vectors, last group first. A group is replaced by zeros when w3 is below its threshold. In: x0, x2 = coefficients, w3, x4/x5 = first/second pass transforms, x13 = table of 16-bit thresholds.
+        ldrh            w12, [x13, #4] // threshold for the last group (offset 48)
+        mov             x15, x30 // keep the return address, the blr calls below overwrite x30
+
+        mov             x11, #64 // coefficient stride in bytes
+
+        cmp             w3,  w12
+        ldrh            w12, [x13, #2] // preload the next threshold, ldrh leaves the flags untouched
+        b.lt            1f
+
+        add             x6,  x2,  #48
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x6]
+        st1             {v2.4s}, [x6], x11 // zero the coefficients just read
+.endr
+        blr             x4
+        rshrn           v28.4h,  v16.4s,  #1 // narrow with rounding into v28-v31
+        rshrn           v29.4h,  v17.4s,  #1
+        rshrn           v30.4h,  v18.4s,  #1
+        rshrn           v31.4h,  v19.4s,  #1
+        transpose_4x4h  v28, v29, v30, v31, v4,  v5,  v6,  v7
+
+        b               2f
+1:
+.irp i, v28.4h, v29.4h, v30.4h, v31.4h
+        movi            \i,  #0
+.endr
+2:
+        cmp             w3,  w12
+        ldrh            w12, [x13, #0] // preload the threshold for the group at offset 16
+        b.lt            1f
+
+        add             x6,  x2,  #32
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x6]
+        st1             {v2.4s}, [x6], x11
+.endr
+        blr             x4
+        rshrn           v24.4h,  v16.4s,  #1 // narrow into v24-v27
+        rshrn           v25.4h,  v17.4s,  #1
+        rshrn           v26.4h,  v18.4s,  #1
+        rshrn           v27.4h,  v19.4s,  #1
+        transpose_4x4h  v24, v25, v26, v27, v4,  v5,  v6,  v7
+
+        b               2f
+1:
+.irp i, v24.4h, v25.4h, v26.4h, v27.4h
+        movi            \i,  #0
+.endr
+2:
+        cmp             w3,  w12
+        b.lt            1f
+
+        add             x6,  x2,  #16
+        movi            v2.4s,   #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x6]
+        st1             {v2.4s}, [x6], x11
+.endr
+        blr             x4
+        rshrn           v20.4h,  v16.4s,  #1 // narrow into v20-v23
+        rshrn           v21.4h,  v17.4s,  #1
+        rshrn           v22.4h,  v18.4s,  #1
+        rshrn           v23.4h,  v19.4s,  #1
+        transpose_4x4h  v20, v21, v22, v23, v4,  v5,  v6,  v7
+
+        b               2f
+1:
+.irp i, v20.4h, v21.4h, v22.4h, v23.4h
+        movi            \i,  #0
+.endr
+2:
+
+        movi            v2.4s,   #0 // the first group (offset 0) is always processed, no threshold check
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s
+        ld1             {\i},    [x2]
+        st1             {v2.4s}, [x2], x11
+.endr
+        blr             x4
+        rshrn           v16.4h,  v16.4s,  #1
+        rshrn           v17.4h,  v17.4s,  #1
+        rshrn           v18.4h,  v18.4s,  #1
+        rshrn           v19.4h,  v19.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v4,  v5,  v6,  v7
+
+        blr             x5 // second pass on v16-v31 (4 x 16-bit lanes each)
+
+        load_add_store_4x16 x0, x6
+
+        ret             x15 // return via the saved link register, ret (not br) marks this as a function return for the branch predictor and needs no BTI landing pad
+endfunc
+
+const eob_4x16 // thresholds for inv_txfm_add_4x16_neon, which reads the entries at byte offsets 4, 2 and 0 in that order. It does not read the last entry.
+        .short 13, 29, 45, 64
+endconst
+
+const eob_4x16_identity1 // chosen by def_fn_416 when only the first transform is identity
+        .short 16, 32, 48, 64
+endconst
+
+const eob_4x16_identity2 // chosen by def_fn_416 when only the second transform is identity
+        .short 4, 8, 12, 64
+endconst
+
+.macro def_fn_416 w, h, txfm1, txfm2 // Emits the exported w x h 16bpc entry point (4x16 or 16x4) for one txfm1/txfm2 pairing: sets up x4, x5 and, for width 4, x13, then tail-calls the shared worker
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1
+.endif
+.if \w == 4
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon // x4 = first-pass transform (32-bit lanes)
+        movrel          x5,  X(inv_\txfm2\()_4h_x\h\()_neon) // x5 = second-pass transform on 4 x 16-bit lanes (external symbol)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel          x13, eob_4x16 // both identity: default table
+.else
+        movrel          x13, eob_4x16_identity1 // identity first pass only
+.endif
+.else
+.ifc \txfm2, identity
+        movrel          x13, eob_4x16_identity2 // identity second pass only
+.else
+        movrel          x13, eob_4x16 // neither is identity: default table
+.endif
+.endif
+.else
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon // width 16: no x13 table, inv_txfm_add_16x4_neon does not read x13
+        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon) // second pass on 8 x 16-bit lanes here
+.endif
+        b               inv_txfm_add_\w\()x\h\()_neon
+endfunc
+.endm
+
+.macro def_fns_416 w, h // Instantiate def_fn_416 for all 16 txfm1/txfm2 pairings at size w x h
+def_fn_416 \w, \h, dct, dct
+def_fn_416 \w, \h, identity, identity
+def_fn_416 \w, \h, dct, adst
+def_fn_416 \w, \h, dct, flipadst
+def_fn_416 \w, \h, dct, identity
+def_fn_416 \w, \h, adst, dct
+def_fn_416 \w, \h, adst, adst
+def_fn_416 \w, \h, adst, flipadst
+def_fn_416 \w, \h, flipadst, dct
+def_fn_416 \w, \h, flipadst, adst
+def_fn_416 \w, \h, flipadst, flipadst
+def_fn_416 \w, \h, identity, dct
+def_fn_416 \w, \h, adst, identity
+def_fn_416 \w, \h, flipadst, identity
+def_fn_416 \w, \h, identity, adst
+def_fn_416 \w, \h, identity, flipadst
+.endm
+
+def_fns_416 4, 16
+def_fns_416 16, 4
+
+
+function inv_txfm_add_16x8_neon // 16x8 inverse transform + add. In: x0, x2 = 32-bit coefficients (zeroed as they are read), w3 and w13 = the two values compared to decide whether the second input half is processed (w13 is set by the caller, not visible here), x4/x5 = first/second pass transforms.
+        mov             x15, x30 // keep the return address, the blr calls below overwrite x30
+        stp             d8,  d9,  [sp, #-0x40]! // v8-v15 are used as temporaries below, so preserve d8-d15
+        stp             d10, d11, [sp, #0x10]
+        stp             d12, d13, [sp, #0x20]
+        stp             d14, d15, [sp, #0x30]
+
+        cmp             w3,  w13
+        mov             x11, #32 // coefficient stride in bytes
+        b.lt            1f // w3 below w13: treat the half starting at offset 16 as all zero
+
+        movi            v4.4s,  #0
+        movz            w16, #2896*8, lsl #16 // multiplier handed to scale_input (2896/4096 is about 1/sqrt(2))
+        dup             v0.2s,   w16
+
+        add             x6,  x2,  #16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11 // zero the coefficients just read
+.endr
+
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+        blr             x4
+
+        sqrshrn         v8.4h,   v16.4s,  #1 // narrow this half into v8-v15, so v16-v31 are free for the other half
+        sqrshrn         v9.4h,   v17.4s,  #1
+        sqrshrn         v10.4h,  v18.4s,  #1
+        sqrshrn         v11.4h,  v19.4s,  #1
+        sqrshrn2        v8.8h,   v20.4s,  #1
+        sqrshrn2        v9.8h,   v21.4s,  #1
+        sqrshrn2        v10.8h,  v22.4s,  #1
+        sqrshrn2        v11.8h,  v23.4s,  #1
+        sqrshrn         v12.4h,  v24.4s,  #1
+        sqrshrn         v13.4h,  v25.4s,  #1
+        sqrshrn         v14.4h,  v26.4s,  #1
+        sqrshrn         v15.4h,  v27.4s,  #1
+        sqrshrn2        v12.8h,  v28.4s,  #1
+        sqrshrn2        v13.8h,  v29.4s,  #1
+        sqrshrn2        v14.8h,  v30.4s,  #1
+        sqrshrn2        v15.8h,  v31.4s,  #1
+
+        transpose_4x8h  v8,  v9,  v10, v11, v2,  v3,  v4,  v5
+        transpose_4x8h  v12, v13, v14, v15, v2,  v3,  v4,  v5
+
+        b               2f
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h, v12.8h, v13.8h, v14.8h, v15.8h
+        movi            \i,  #0
+.endr
+2:
+        movz            w16, #2896*8, lsl #16 // set up the multiplier again for this half
+        dup             v0.2s,   w16
+
+        movi            v4.4s,  #0
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i},    [x2]
+        st1             {v4.4s}, [x2], x11
+.endr
+
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+
+        mov             v20.16b, v8.16b // v20-v23 have been consumed above, move the saved v8-v11 in next to v16-v19
+        mov             v21.16b, v9.16b
+        mov             v22.16b, v10.16b
+        mov             v23.16b, v11.16b
+
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+
+        sqrshrn         v8.4h,   v24.4s,  #1 // v8-v11 are free again, reuse them for first-pass outputs 8-15 of this half
+        sqrshrn         v9.4h,   v25.4s,  #1
+        sqrshrn         v10.4h,  v26.4s,  #1
+        sqrshrn         v11.4h,  v27.4s,  #1
+        sqrshrn2        v8.8h,   v28.4s,  #1
+        sqrshrn2        v9.8h,   v29.4s,  #1
+        sqrshrn2        v10.8h,  v30.4s,  #1
+        sqrshrn2        v11.8h,  v31.4s,  #1
+
+        transpose_4x8h  v8,  v9, v10, v11, v2,  v3,  v4,  v5
+
+        blr             x5 // second pass on v16-v23
+
+        mov             x6,  x0
+        load_add_store_8x8 x6, x7
+
+        mov             v16.16b, v8.16b // bring v8-v15 into v16-v23 for the second call to x5
+        mov             v17.16b, v9.16b
+        mov             v18.16b, v10.16b
+        mov             v19.16b, v11.16b
+        mov             v20.16b, v12.16b
+        mov             v21.16b, v13.16b
+        mov             v22.16b, v14.16b
+        mov             v23.16b, v15.16b
+
+        blr             x5
+
+        add             x0,  x0,  #16 // 8 pixels of 2 bytes to the right
+        load_add_store_8x8 x0, x7
+
+        ldp             d14, d15, [sp, #0x30]
+        ldp             d12, d13, [sp, #0x20]
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8,  d9,  [sp], 0x40
+        ret             x15 // return via the saved link register, ret (not br) marks this as a function return for the branch predictor and needs no BTI landing pad
+endfunc
+
+function inv_txfm_add_8x16_neon // 8x16 inverse transform + add. The first pass runs x4 on up to four groups of eight input vectors, last group first. A group is replaced by zeros when w3 is below its threshold. In: x0, x2 = 32-bit coefficients, w3, x4/x5 = first/second pass transforms, x13 = table of 16-bit thresholds.
+        mov             x15, x30 // keep the return address, the blr calls below overwrite x30
+        stp             d8,  d9,  [sp, #-0x20]! // v8-v11 are used as temporaries below, so preserve d8-d11
+        stp             d10, d11, [sp, #0x10]
+        ldrh            w12, [x13, #4] // threshold for the last group (offset 48)
+
+        mov             x11, #64 // coefficient stride in bytes
+
+        cmp             w3,  w12
+        ldrh            w12, [x13, #2] // preload the next threshold, ldrh leaves the flags untouched
+        b.lt            1f
+
+        add             x6,  x2,  #48
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16 // multiplier handed to scale_input (2896/4096 is about 1/sqrt(2))
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11 // zero the coefficients just read
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v28.4h,  v16.4s,  #1 // narrow into v28-v31
+        sqrshrn         v29.4h,  v17.4s,  #1
+        sqrshrn         v30.4h,  v18.4s,  #1
+        sqrshrn         v31.4h,  v19.4s,  #1
+        sqrshrn2        v28.8h,  v20.4s,  #1
+        sqrshrn2        v29.8h,  v21.4s,  #1
+        sqrshrn2        v30.8h,  v22.4s,  #1
+        sqrshrn2        v31.8h,  v23.4s,  #1
+        transpose_4x8h  v28, v29, v30, v31, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v28.8h, v29.8h, v30.8h, v31.8h
+        movi            \i,  #0
+.endr
+
+2:
+        cmp             w3,  w12
+        ldrh            w12, [x13, #0] // preload the threshold for the group at offset 16
+        b.lt            1f
+
+        add             x6,  x2,  #32
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v24.4h,  v16.4s,  #1 // narrow into v24-v27
+        sqrshrn         v25.4h,  v17.4s,  #1
+        sqrshrn         v26.4h,  v18.4s,  #1
+        sqrshrn         v27.4h,  v19.4s,  #1
+        sqrshrn2        v24.8h,  v20.4s,  #1
+        sqrshrn2        v25.8h,  v21.4s,  #1
+        sqrshrn2        v26.8h,  v22.4s,  #1
+        sqrshrn2        v27.8h,  v23.4s,  #1
+        transpose_4x8h  v24, v25, v26, v27, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v24.8h, v25.8h, v26.8h, v27.8h
+        movi            \i,  #0
+.endr
+
+2:
+        cmp             w3,  w12
+        b.lt            1f
+
+        add             x6,  x2,  #16
+        movi            v4.4s,   #0
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x6]
+        st1             {v4.4s}, [x6], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v8.4h,   v16.4s,  #1 // parked in v8-v11 because v20-v23 are still needed as load targets for the final group
+        sqrshrn         v9.4h,   v17.4s,  #1
+        sqrshrn         v10.4h,  v18.4s,  #1
+        sqrshrn         v11.4h,  v19.4s,  #1
+        sqrshrn2        v8.8h,   v20.4s,  #1
+        sqrshrn2        v9.8h,   v21.4s,  #1
+        sqrshrn2        v10.8h,  v22.4s,  #1
+        sqrshrn2        v11.8h,  v23.4s,  #1
+        transpose_4x8h  v8,  v9,  v10, v11, v2, v3, v4, v5
+
+        b               2f
+
+1:
+.irp i, v8.8h, v9.8h, v10.8h, v11.8h
+        movi            \i,  #0
+.endr
+
+2:
+        movi            v4.4s,   #0 // the first group (offset 0) is always processed, no threshold check
+        movz            w16, #2896*8, lsl #16
+        dup             v0.2s,   w16
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+        ld1             {\i},    [x2]
+        st1             {v4.4s}, [x2], x11
+.endr
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        blr             x4
+
+        sqrshrn         v16.4h,  v16.4s,  #1
+        sqrshrn         v17.4h,  v17.4s,  #1
+        sqrshrn         v18.4h,  v18.4s,  #1
+        sqrshrn         v19.4h,  v19.4s,  #1
+        sqrshrn2        v16.8h,  v20.4s,  #1
+        sqrshrn2        v17.8h,  v21.4s,  #1
+        sqrshrn2        v18.8h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+        transpose_4x8h  v16, v17, v18, v19, v2, v3, v4, v5
+
+        mov             v20.16b, v8.16b // move the parked group into place so v16-v31 hold all 16 vectors
+        mov             v21.16b, v9.16b
+        mov             v22.16b, v10.16b
+        mov             v23.16b, v11.16b
+
+        blr             x5 // second pass on v16-v31 (8 x 16-bit lanes each)
+
+        load_add_store_8x16 x0, x6
+
+        ldp             d10, d11, [sp, #0x10]
+        ldp             d8,  d9,  [sp], 0x20
+
+        ret             x15 // return via the saved link register, ret (not br) marks this as a function return for the branch predictor and needs no BTI landing pad
+endfunc
+
+const eob_8x16
+        .short 10, 43, 75, 128  // table picked by def_fn_816 when both or neither of the two transforms is identity
+endconst
+
+const eob_8x16_identity1
+        .short 4, 64, 96, 128  // table picked by def_fn_816 when only the first transform (txfm1) is identity
+endconst
+
+const eob_8x16_identity2
+        .short 4, 8, 12, 128  // table picked by def_fn_816 when only the second transform (txfm2) is identity
+endconst
+
+.macro def_fn_816 w, h, txfm1, txfm2
+function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_16bpc_neon, export=1  // exported entry for one transform pair: loads x4, x5, x13 and jumps to the shared body
+.ifc \txfm1\()_\txfm2, dct_dct
+        idct_dc         \w,  \h,  1  // dct_dct only; idct_dc is defined outside this chunk (presumably a DC-only shortcut, to be confirmed there)
+.endif
+        adr             x4,  inv_\txfm1\()_4s_x\w\()_neon  // x4 = address of the txfm1 routine (the _4s_ variant for width w)
+        movrel          x5,  X(inv_\txfm2\()_8h_x\h\()_neon)  // x5 = address of the txfm2 routine (the _8h_ variant for height h)
+.ifc \txfm1, identity
+.ifc \txfm2, identity
+        movrel          x13, eob_8x16  // identity first and identity second
+.else
+        movrel          x13, eob_8x16_identity1  // identity first, non-identity second
+.endif
+.else
+.ifc \txfm2, identity
+        movrel          x13, eob_8x16_identity2  // non-identity first, identity second
+.else
+        movrel          x13, eob_8x16  // neither transform is identity
+.endif
+.endif
+.if \h == 8
+        ldrh            w13, [x13]  // height 8: replace the table address in x13 by the first table entry
+.endif
+        b               inv_txfm_add_\w\()x\h\()_neon  // plain branch (not bl) into the shared body for this size; control does not come back here
+endfunc
+.endm
+// def_fns_816: instantiate def_fn_816 for all 16 (txfm1, txfm2) pairs of one block size.
+.macro def_fns_816 w, h
+def_fn_816 \w, \h, dct, dct
+def_fn_816 \w, \h, identity, identity
+def_fn_816 \w, \h, dct, adst
+def_fn_816 \w, \h, dct, flipadst
+def_fn_816 \w, \h, dct, identity
+def_fn_816 \w, \h, adst, dct
+def_fn_816 \w, \h, adst, adst
+def_fn_816 \w, \h, adst, flipadst
+def_fn_816 \w, \h, flipadst, dct
+def_fn_816 \w, \h, flipadst, adst
+def_fn_816 \w, \h, flipadst, flipadst
+def_fn_816 \w, \h, identity, dct
+def_fn_816 \w, \h, adst, identity
+def_fn_816 \w, \h, flipadst, identity
+def_fn_816 \w, \h, identity, adst
+def_fn_816 \w, \h, identity, flipadst
+.endm
+// Emit the entry points for the 8x16 and 16x8 block sizes.
+def_fns_816 8, 16
+def_fns_816 16, 8
+
+function inv_dct32_odd_4s_x16_neon  // odd half of the 32-point inverse DCT on 4 lanes of 32-bit data; in/out: v16-v31; overwrites v0-v4, v6 and x16
+        movrel          x16, idct_coeffs, 4*16  // x16 = idct_coeffs + 4*16 bytes (third movrel argument taken to be a byte offset)
+        ld1             {v0.4s, v1.4s}, [x16], #32  // coefficients for the first four rotations; x16 advances by 32
+
+        mul_mls         v2,  v16, v31, v0.s[0], v0.s[1] // -> t16a
+        mul_mla         v4,  v16, v31, v0.s[1], v0.s[0] // -> t31a
+        mul_mls         v6,  v24, v23, v0.s[2], v0.s[3] // -> t17a
+        srshr           v16.4s, v2.4s,  #12             // t16a
+        srshr           v31.4s, v4.4s,  #12             // t31a
+        mul_mla         v2,  v24, v23, v0.s[3], v0.s[2] // -> t30a
+        mul_mls         v4,  v20, v27, v1.s[0], v1.s[1] // -> t18a
+        srshr           v24.4s, v6.4s,  #12             // t17a
+        srshr           v23.4s, v2.4s,  #12             // t30a
+        mul_mla         v6,  v20, v27, v1.s[1], v1.s[0] // -> t29a
+        mul_mls         v2,  v28, v19, v1.s[2], v1.s[3] // -> t19a
+        srshr           v20.4s, v4.4s,  #12             // t18a
+        srshr           v27.4s, v6.4s,  #12             // t29a
+        mul_mla         v4,  v28, v19, v1.s[3], v1.s[2] // -> t28a
+        ld1             {v0.4s, v1.4s}, [x16]  // coefficients for the remaining four rotations
+        sub             x16, x16, #4*24  // 4*16 + 32 - 4*24 = 0: x16 is back at the start of idct_coeffs
+        mul_mls         v6,  v18, v29, v0.s[0], v0.s[1] // -> t20a
+        srshr           v28.4s, v2.4s,  #12             // t19a
+        srshr           v19.4s, v4.4s,  #12             // t28a
+        mul_mla         v2,  v18, v29, v0.s[1], v0.s[0] // -> t27a
+        mul_mls         v4,  v26, v21, v0.s[2], v0.s[3] // -> t21a
+        srshr           v18.4s, v6.4s,  #12             // t20a
+        srshr           v29.4s, v2.4s,  #12             // t27a
+        mul_mla         v6,  v26, v21, v0.s[3], v0.s[2] // -> t26a
+        mul_mls         v2,  v22, v25, v1.s[0], v1.s[1] // -> t22a
+        srshr           v26.4s, v4.4s,  #12             // t21a
+        srshr           v21.4s, v6.4s,  #12             // t26a
+        mul_mla         v4,  v22, v25, v1.s[1], v1.s[0] // -> t25a
+        mul_mls         v6,  v30, v17, v1.s[2], v1.s[3] // -> t23a
+        srshr           v22.4s, v2.4s,  #12             // t22a
+        srshr           v25.4s, v4.4s,  #12             // t25a
+        mul_mla         v2,  v30, v17, v1.s[3], v1.s[2] // -> t24a
+        srshr           v30.4s, v6.4s,  #12             // t23a
+        srshr           v17.4s, v2.4s,  #12             // t24a
+
+        ld1             {v0.4s, v1.4s}, [x16]  // first eight entries of idct_coeffs, used by all remaining stages
+
+        sqsub           v2.4s,   v16.4s,  v24.4s // t17
+        sqadd           v16.4s,  v16.4s,  v24.4s // t16
+        sqsub           v3.4s,   v31.4s,  v23.4s // t30
+        sqadd           v31.4s,  v31.4s,  v23.4s // t31
+        sqsub           v24.4s,  v28.4s,  v20.4s // t18
+        sqadd           v28.4s,  v28.4s,  v20.4s // t19
+        sqadd           v23.4s,  v18.4s,  v26.4s // t20
+        sqsub           v18.4s,  v18.4s,  v26.4s // t21
+        sqsub           v20.4s,  v30.4s,  v22.4s // t22
+        sqadd           v30.4s,  v30.4s,  v22.4s // t23
+        sqadd           v26.4s,  v17.4s,  v25.4s // t24
+        sqsub           v17.4s,  v17.4s,  v25.4s // t25
+        sqsub           v22.4s,  v29.4s,  v21.4s // t26
+        sqadd           v29.4s,  v29.4s,  v21.4s // t27
+        sqadd           v25.4s,  v19.4s,  v27.4s // t28
+        sqsub           v19.4s,  v19.4s,  v27.4s // t29
+
+        mul_mls         v4,  v3,  v2,  v1.s[0], v1.s[1] // -> t17a
+        mul_mla         v6,  v3,  v2,  v1.s[1], v1.s[0] // -> t30a
+        mul_mla         v2,  v19, v24, v1.s[1], v1.s[0] // -> t18a
+        srshr           v21.4s, v4.4s,  #12             // t17a
+        srshr           v27.4s, v6.4s,  #12             // t30a
+        neg             v2.4s,   v2.4s                  // -> t18a
+        mul_mls         v4,  v19, v24, v1.s[0], v1.s[1] // -> t29a
+        mul_mls         v6,  v22, v18, v1.s[2], v1.s[3] // -> t21a
+        srshr           v19.4s, v2.4s,  #12             // t18a
+        srshr           v24.4s, v4.4s,  #12             // t29a
+        mul_mla         v2,  v22, v18, v1.s[3], v1.s[2] // -> t26a
+        mul_mla         v4,  v17, v20, v1.s[3], v1.s[2] // -> t22a
+        srshr           v22.4s, v6.4s,  #12             // t21a
+        srshr           v18.4s, v2.4s,  #12             // t26a
+        neg             v4.4s,   v4.4s                  // -> t22a
+        mul_mls         v6,  v17, v20, v1.s[2], v1.s[3] // -> t25a
+        srshr           v17.4s, v4.4s,  #12             // t22a
+        srshr           v20.4s, v6.4s,  #12             // t25a
+
+        sqsub           v2.4s,   v27.4s,  v24.4s // t29
+        sqadd           v27.4s,  v27.4s,  v24.4s // t30
+        sqsub           v3.4s,   v21.4s,  v19.4s // t18
+        sqadd           v21.4s,  v21.4s,  v19.4s // t17
+        sqsub           v24.4s,  v16.4s,  v28.4s // t19a
+        sqadd           v16.4s,  v16.4s,  v28.4s // t16a
+        sqsub           v19.4s,  v30.4s,  v23.4s // t20a
+        sqadd           v30.4s,  v30.4s,  v23.4s // t23a
+        sqsub           v28.4s,  v17.4s,  v22.4s // t21
+        sqadd           v17.4s,  v17.4s,  v22.4s // t22
+        sqadd           v23.4s,  v26.4s,  v29.4s // t24a
+        sqsub           v26.4s,  v26.4s,  v29.4s // t27a
+        sqadd           v22.4s,  v20.4s,  v18.4s // t25
+        sqsub           v20.4s,  v20.4s,  v18.4s // t26
+        sqsub           v29.4s,  v31.4s,  v25.4s // t28a
+        sqadd           v31.4s,  v31.4s,  v25.4s // t31a
+
+        mul_mls         v4,  v2,  v3,  v0.s[2], v0.s[3] // -> t18a
+        mul_mla         v6,  v2,  v3,  v0.s[3], v0.s[2] // -> t29a
+        mul_mls         v2,  v29, v24, v0.s[2], v0.s[3] // -> t19
+        srshr           v18.4s, v4.4s,  #12             // t18a
+        srshr           v25.4s, v6.4s,  #12             // t29a
+        mul_mla         v4,  v29, v24, v0.s[3], v0.s[2] // -> t28
+        mul_mla         v6,  v26, v19, v0.s[3], v0.s[2] // -> t20
+        srshr           v29.4s, v2.4s,  #12             // t19
+        srshr           v24.4s, v4.4s,  #12             // t28
+        neg             v6.4s,   v6.4s                  // -> t20
+        mul_mls         v2,  v26, v19, v0.s[2], v0.s[3] // -> t27
+        mul_mla         v4,  v20, v28, v0.s[3], v0.s[2] // -> t21a
+        srshr           v26.4s, v6.4s,  #12             // t20
+        srshr           v19.4s, v2.4s,  #12             // t27
+        neg             v4.4s,   v4.4s                  // -> t21a
+        mul_mls         v6,  v20, v28, v0.s[2], v0.s[3] // -> t26a
+        srshr           v20.4s, v4.4s,  #12             // t21a
+        srshr           v28.4s, v6.4s,  #12             // t26a
+
+        sqsub           v2.4s,   v16.4s,  v30.4s // t23
+        sqadd           v16.4s,  v16.4s,  v30.4s // t16 = out16
+        sqsub           v3.4s,   v31.4s,  v23.4s // t24
+        sqadd           v31.4s,  v31.4s,  v23.4s // t31 = out31
+        sqsub           v23.4s,  v21.4s,  v17.4s // t22a
+        sqadd           v17.4s,  v21.4s,  v17.4s // t17a = out17
+        sqadd           v30.4s,  v27.4s,  v22.4s // t30a = out30
+        sqsub           v21.4s,  v27.4s,  v22.4s // t25a
+        sqsub           v27.4s,  v18.4s,  v20.4s // t21
+        sqadd           v18.4s,  v18.4s,  v20.4s // t18 = out18
+        sqadd           v4.4s,   v29.4s,  v26.4s // t19a = out19
+        sqsub           v26.4s,  v29.4s,  v26.4s // t20a
+        sqadd           v29.4s,  v25.4s,  v28.4s // t29 = out29
+        sqsub           v25.4s,  v25.4s,  v28.4s // t26
+        sqadd           v28.4s,  v24.4s,  v19.4s // t28a = out28
+        sqsub           v24.4s,  v24.4s,  v19.4s // t27a
+        mov             v19.16b, v4.16b          // out19
+
+        mul_mls         v4,  v24, v26, v0.s[0], v0.s[0] // -> t20
+        mul_mla         v6,  v24, v26, v0.s[0], v0.s[0] // -> t27
+        srshr           v20.4s, v4.4s,  #12             // t20
+        srshr           v22.4s, v6.4s,  #12             // t27
+
+        mul_mla         v4,  v25, v27, v0.s[0], v0.s[0] // -> t26a
+        mul_mls         v6,  v25, v27, v0.s[0], v0.s[0] // -> t21a
+        mov             v27.16b,  v22.16b               // t27
+        srshr           v26.4s, v4.4s,  #12             // t26a
+
+        mul_mls         v24, v21, v23, v0.s[0], v0.s[0] // -> t22
+        mul_mla         v4,  v21, v23, v0.s[0], v0.s[0] // -> t25
+        srshr           v21.4s, v6.4s,  #12             // t21a
+        srshr           v22.4s, v24.4s, #12             // t22
+        srshr           v25.4s, v4.4s,  #12             // t25
+
+        mul_mls         v4,  v3,  v2,  v0.s[0], v0.s[0] // -> t23a
+        mul_mla         v6,  v3,  v2,  v0.s[0], v0.s[0] // -> t24a
+        srshr           v23.4s, v4.4s,  #12             // t23a
+        srshr           v24.4s, v6.4s,  #12             // t24a
+
+        ret  // v16-v19 = out16-out19, v20-v27 = the final t20..t27 values, v28-v31 = out28-out31
+endfunc
+
+.macro def_horz_32 scale=0, shift=2, suffix
+function inv_txfm_horz\suffix\()_dct_32x4_neon  // 32-point DCT pass over 32 input vectors of 4 lanes: x7 = coefficients, x8 = their stride, x6 = 16-bit output
+        mov             x14, x30  // keep the return address: the bl calls below overwrite x30
+        movi            v7.4s,  #0  // zero vector for clearing coefficients once they are read
+        lsl             x8,  x8,  #1  // double the stride so that the first pass visits every other input vector
+.if \scale
+        movz            w16, #2896*8, lsl #16  // w16 = 2896 << 19, i.e. 2896/4096 (about 1/sqrt(2)) as a Q31 fraction
+        dup             v0.2s,   w16  // copy it into v0.s[0] and v0.s[1]
+.endif
+
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i}, [x7]  // read four 32-bit coefficients
+        st1             {v7.4s}, [x7], x8  // clear them in the source buffer and step by the doubled stride
+.endr
+        sub             x7,  x7,  x8, lsl #4  // undo the 16 steps
+        add             x7,  x7,  x8, lsr #1  // plus one original stride: x7 now addresses the odd-indexed inputs
+.if \scale
+        scale_input     .4s, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        bl              inv_dct_4s_x16_neon  // 16-point routine on the even-indexed inputs (defined outside this chunk)
+        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
+        transpose_4x4s  v24, v25, v26, v27, v2,  v3,  v4,  v5
+        transpose_4x4s  v28, v29, v30, v31, v2,  v3,  v4,  v5
+
+.macro store1 r0, r1, r2, r3
+        st1             {\r0}, [x6], #16
+        st1             {\r1}, [x6], #16
+        st1             {\r2}, [x6], #16
+        st1             {\r3}, [x6], #16
+.endm
+        store1          v16.4s,  v20.4s,  v24.4s,  v28.4s  // park the transposed 16-point result (still 32 bit) in the output buffer
+        store1          v17.4s,  v21.4s,  v25.4s,  v29.4s
+        store1          v18.4s,  v22.4s,  v26.4s,  v30.4s
+        store1          v19.4s,  v23.4s,  v27.4s,  v31.4s
+.purgem store1
+        sub             x6,  x6,  #64*4  // back to the start of the 4 * 64 bytes just written
+
+        movi            v7.4s,  #0  // set up the zero vector again after the call above
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        ld1             {\i}, [x7]  // odd-indexed inputs this time
+        st1             {v7.4s}, [x7], x8  // clear and step by the doubled stride
+.endr
+.if \scale
+        // This relies on the fact that the idct also leaves the right coeff in v0.s[1]
+        scale_input     .4s, v0.s[1], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v0.s[1], v24, v25, v26, v27, v28, v29, v30, v31
+.endif
+        bl              inv_dct32_odd_4s_x16_neon  // odd half of the 32-point transform
+        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
+        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5
+        transpose_4x4s  v23, v22, v21, v20, v2,  v3,  v4,  v5
+        transpose_4x4s  v19, v18, v17, v16, v2,  v3,  v4,  v5
+.macro store2 r0, r1, r2, r3, shift
+        ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]  // reload 16 values of the parked 16-point result
+        sqsub           v4.4s,   v0.4s,   \r0  // difference with the odd half
+        sqadd           v0.4s,   v0.4s,   \r0  // sum with the odd half
+        sqsub           v5.4s,   v1.4s,   \r1
+        sqadd           v1.4s,   v1.4s,   \r1
+        sqsub           v6.4s,   v2.4s,   \r2
+        sqadd           v2.4s,   v2.4s,   \r2
+        sqsub           v7.4s,   v3.4s,   \r3
+        sqadd           v3.4s,   v3.4s,   \r3
+        sqrshrn         v0.4h,   v0.4s,   #\shift  // sums: rounding shift and saturating narrow to 16 bit
+        sqrshrn2        v0.8h,   v1.4s,   #\shift
+        sqrshrn         v1.4h,   v2.4s,   #\shift
+        sqrshrn2        v1.8h,   v3.4s,   #\shift
+        sqrshrn         v2.4h,   v7.4s,   #\shift  // differences are packed with the vector order reversed (v7, v6, v5, v4)
+        sqrshrn2        v2.8h,   v6.4s,   #\shift
+        sqrshrn         v3.4h,   v5.4s,   #\shift
+        sqrshrn2        v3.8h,   v4.4s,   #\shift
+        st1             {v0.8h, v1.8h}, [x6], #32  // first 16 outputs of the row (the sums), written over the parked data
+        rev64           v2.8h,   v2.8h  // reverse the four halfwords in each 64-bit half, completing the mirror
+        rev64           v3.8h,   v3.8h
+        st1             {v2.8h, v3.8h}, [x6], #32  // last 16 outputs of the row (the differences, mirrored)
+.endm
+
+        store2          v31.4s,  v27.4s,  v23.4s,  v19.4s,  \shift
+        store2          v30.4s,  v26.4s,  v22.4s,  v18.4s,  \shift
+        store2          v29.4s,  v25.4s,  v21.4s,  v17.4s,  \shift
+        store2          v28.4s,  v24.4s,  v20.4s,  v16.4s,  \shift
+.purgem store2
+        br              x14  // return through the address saved at entry
+endfunc
+.endm
+// Two variants: the plain one (final shift 2) and _scale (inputs pre-scaled, final shift 1).
+def_horz_32 scale=0, shift=2
+def_horz_32 scale=1, shift=1, suffix=_scale
+
+function inv_txfm_add_vert_dct_8x32_neon  // 32-point DCT down 8 columns of 16-bit data, then add to destination: x7 = input, x8 = its stride, x6 = dst, x1 = dst stride
+        mov             x14, x30  // keep the return address: the bl calls below overwrite x30
+        lsl             x8,  x8,  #1  // doubled stride: visit every other row first
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8  // rows 0, 2, ..., 30
+.endr
+        sub             x7,  x7,  x8, lsl #4  // back to row 0
+
+        bl              X(inv_dct_8h_x16_neon)  // 16-point routine on the even rows (defined outside this chunk)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        st1             {v\i\().8h}, [x7], x8  // write the result back over the even rows
+.endr
+        sub             x7,  x7,  x8, lsl #4  // back to row 0
+        add             x7,  x7,  x8, lsr #1  // one original stride further: row 1
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        ld1             {v\i\().8h}, [x7], x8  // rows 1, 3, ..., 31
+.endr
+        sub             x7,  x7,  x8, lsl #4  // undo the 16 steps
+        sub             x7,  x7,  x8, lsr #1  // and the one-row offset: x7 is at row 0 again
+        bl              X(inv_dct32_odd_8h_x16_neon)  // odd half, .8h variant (defined outside this chunk)
+
+        neg             x9,  x8  // negative doubled stride for the backward walk below
+        mov             x10, x6  // x10 reads destination rows while x6 writes them
+        movi            v0.8h,   #0  // lower clamp bound
+        mvni            v1.8h,   #0xfc, lsl #8 // 0x3ff
+.macro combine r0, r1, r2, r3, op, stride
+        ld1             {v5.8h}, [x7],    \stride  // stored result of the 16-point pass
+        ld1             {v2.8h}, [x10],   x1  // destination row
+        ld1             {v6.8h}, [x7],    \stride
+        ld1             {v3.8h}, [x10],   x1
+        \op             v5.8h,   v5.8h,   \r0  // combine with the odd half (sqadd or sqsub)
+        ld1             {v7.8h}, [x7],    \stride
+        ld1             {v4.8h}, [x10],   x1
+        srshr           v5.8h,   v5.8h,   #4  // rounding shift right by 4
+        \op             v6.8h,   v6.8h,   \r1
+        sqadd           v5.8h,   v5.8h,   v2.8h  // add the destination pixels
+        srshr           v6.8h,   v6.8h,   #4
+        \op             v7.8h,   v7.8h,   \r2
+        smax            v2.8h,   v5.8h,   v0.8h  // clamp below at 0
+        ld1             {v5.8h}, [x7],    \stride
+        sqadd           v6.8h,   v6.8h,   v3.8h
+        smin            v2.8h,   v2.8h,   v1.8h  // clamp above at the value in v1
+        srshr           v7.8h,   v7.8h,   #4
+        \op             v5.8h,   v5.8h,   \r3
+        st1             {v2.8h}, [x6],    x1  // store the finished row
+        ld1             {v2.8h}, [x10],   x1
+        smax            v3.8h,   v6.8h,   v0.8h
+        sqadd           v7.8h,   v7.8h,   v4.8h
+        smin            v3.8h,   v3.8h,   v1.8h
+        srshr           v5.8h,   v5.8h,   #4
+        st1             {v3.8h}, [x6],    x1
+        smax            v4.8h,   v7.8h,   v0.8h
+        sqadd           v5.8h,   v5.8h,   v2.8h
+        smin            v4.8h,   v4.8h,   v1.8h
+        st1             {v4.8h}, [x6],    x1
+        smax            v2.8h,   v5.8h,   v0.8h
+        smin            v2.8h,   v2.8h,   v1.8h
+        st1             {v2.8h}, [x6],    x1
+.endm
+        combine         v31.8h, v30.8h, v29.8h, v28.8h, sqadd, x8  // first 16 output rows: stored rows walked forward, odd half added
+        combine         v27.8h, v26.8h, v25.8h, v24.8h, sqadd, x8
+        combine         v23.8h, v22.8h, v21.8h, v20.8h, sqadd, x8
+        combine         v19.8h, v18.8h, v17.8h, v16.8h, sqadd, x8
+        sub             x7,  x7,  x8  // step back onto the last stored row
+        combine         v16.8h, v17.8h, v18.8h, v19.8h, sqsub, x9  // last 16 output rows: stored rows walked backward, odd half subtracted
+        combine         v20.8h, v21.8h, v22.8h, v23.8h, sqsub, x9
+        combine         v24.8h, v25.8h, v26.8h, v27.8h, sqsub, x9
+        combine         v28.8h, v29.8h, v30.8h, v31.8h, sqsub, x9
+.purgem combine
+
+        br              x14  // return through the address saved at entry
+endfunc
+
+const eob_32x32
+        .short 10, 36, 78, 136, 210, 300, 406, 1024  // thresholds compared against w3; read in order by dct_dct_32x32, every other entry (from the second) by identity_identity_32x32
+endconst
+
+const eob_16x32
+        .short 10, 36, 78, 151, 215, 279, 343, 512  // thresholds compared against w3; read in order by dct_dct_16x32 and dct_dct_32x16, every other entry by def_identity_1632
+endconst
+
+const eob_16x32_shortside
+        .short 10, 36, 78, 512  // used by def_identity_1632 for the 16-wide or 16-high direction; read from the second entry in steps of two
+endconst
+
+const eob_8x32
+        .short 10, 43, 75, 107, 139, 171, 203, 256  // thresholds compared against w3; read in order by dct_dct_8x32, every other entry by def_identity_832
+endconst
+
+function inv_txfm_add_identity_identity_32x32_16bpc_neon, export=1  // 32x32 identity/identity: 32-bit coefficients at x2 are narrowed, transposed and added at x0 in 8x8 tiles; w3 is checked against eob_32x32 to stop early
+        movi            v0.8h,  #0  // v0/v1: zeros for clearing the coefficient buffer
+        movi            v1.8h,  #0
+        movrel          x13, eob_32x32, 2  // outer-loop thresholds, starting at the second table entry
+
+        mov             x8,  #4*32  // coefficient stride in bytes: 32 entries of 4 bytes
+1:
+        mov             w9,  #0  // w9 accumulates 8 per tile handled by the inner loop
+        movrel          x12, eob_32x32, 2  // inner-loop thresholds, restarted on every outer iteration
+2:
+        add             w9,  w9,  #8
+        ld1             {v16.4s, v17.4s}, [x2]  // eight 32-bit coefficients
+        st1             {v0.4s, v1.4s},   [x2], x8  // clear them and step by one stride
+        ld1             {v18.4s, v19.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v20.4s, v21.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v22.4s, v23.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v24.4s, v25.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v26.4s, v27.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v28.4s, v29.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v30.4s, v31.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        sqxtn           v16.4h,  v16.4s  // saturating narrow to 16 bit: v16-v31 (.4s) become v16-v23 (.8h)
+        sqxtn2          v16.8h,  v17.4s
+        sqxtn           v17.4h,  v18.4s
+        sqxtn2          v17.8h,  v19.4s
+        sqxtn           v18.4h,  v20.4s
+        sqxtn2          v18.8h,  v21.4s
+        sqxtn           v19.4h,  v22.4s
+        sqxtn2          v19.8h,  v23.4s
+        sqxtn           v20.4h,  v24.4s
+        sqxtn2          v20.8h,  v25.4s
+        sqxtn           v21.4h,  v26.4s
+        sqxtn2          v21.8h,  v27.4s
+        sqxtn           v22.4h,  v28.4s
+        sqxtn2          v22.8h,  v29.4s
+        sqxtn           v23.4h,  v30.4s
+        sqxtn2          v23.8h,  v31.4s
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+        load_add_store_8x8 x0, x7, shiftbits=2
+        ldrh            w11, [x12], #4  // next threshold; the step of 4 bytes skips every other entry
+        sub             x0,  x0,  x1, lsl #3  // 8 destination rows back up (presumably undoing the advance made by load_add_store_8x8)
+        add             x0,  x0,  #2*8  // 8 pixels of 2 bytes to the right
+        cmp             w3,  w11
+        b.ge            2b  // continue along this band of 8 rows while w3 >= threshold
+
+        ldrh            w11, [x13], #4  // threshold deciding whether another band follows
+        cmp             w3,  w11
+        b.lt            9f  // w3 below it: done
+
+        sub             x0,  x0,  w9, uxtw #1  // move back left by w9 pixels of 2 bytes
+        add             x0,  x0,  x1, lsl #3  // and down 8 rows
+        msub            x2,  x8,  x9,  x2  // x2 -= x8 * x9: rewind the coefficient pointer by w9 strides
+        add             x2,  x2,  #4*8  // then advance by 8 coefficients of 4 bytes
+        b               1b
+9:
+        ret
+endfunc
+
+.macro shift_16_regs op, shift
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        \op             \i,  \i,  #\shift  // apply the given shift instruction in place to each of v16.4s .. v31.4s
+.endr
+.endm
+
+.macro def_identity_1632 w, h, wshort, hshort
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1  // 16x32 / 32x16 identity/identity: scale, narrow, transpose and add 8x8 tiles; w3 is checked against the eob_16x32 tables to stop early
+        movz            w16, #2896*8, lsl #16  // w16 = 2896 << 19, i.e. 2896/4096 (about 1/sqrt(2)) as a Q31 fraction
+        movz            w17, #2*(5793-4096)*8, lsl #16  // w17 = (5793-4096) << 20, i.e. 2*(5793-4096)/4096 as a Q31 fraction; 5793/4096 is about sqrt(2)
+        movi            v0.4s,   #0  // v0/v1: zeros for clearing the coefficient buffer
+        movi            v1.4s,   #0
+        movrel          x13, eob_16x32\hshort, 2  // outer-loop thresholds, starting at the second table entry
+
+        mov             x8,  #4*\h  // coefficient stride in bytes
+1:
+        mov             w9,  #0  // w9 accumulates 8 per tile handled by the inner loop
+        movrel          x12, eob_16x32\wshort, 2  // inner-loop thresholds, restarted on every outer iteration
+2:
+        add             w9,  w9,  #8
+        ld1             {v16.4s, v17.4s}, [x2]  // eight 32-bit coefficients
+        st1             {v0.4s, v1.4s},   [x2], x8  // clear them and step by one stride
+        dup             v2.2s,   w16  // v2.s[0] = factor for scale_input
+        ld1             {v18.4s, v19.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        mov             v2.s[1], w17  // v2.s[1] = factor handed to the identity_4x16 macros
+        ld1             {v20.4s, v21.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v22.4s, v23.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v24.4s, v25.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v26.4s, v27.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v28.4s, v29.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v30.4s, v31.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        scale_input     .4s, v2.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+        scale_input     .4s, v2.s[0], v24, v25, v26, v27, v28, v29, v30, v31
+
+.if \w == 16
+        // 16x32
+        identity_4x16_shift1 v2.s[1]
+.else
+        // 32x16
+        shift_16_regs   sqshl, 1  // saturating left shift by 1 of v16-v31 before the identity step
+        identity_4x16   v2.s[1]
+.endif
+        sqxtn           v16.4h,  v16.4s  // saturating narrow to 16 bit: v16-v31 (.4s) become v16-v23 (.8h)
+        sqxtn2          v16.8h,  v17.4s
+        sqxtn           v17.4h,  v18.4s
+        sqxtn2          v17.8h,  v19.4s
+        sqxtn           v18.4h,  v20.4s
+        sqxtn2          v18.8h,  v21.4s
+        sqxtn           v19.4h,  v22.4s
+        sqxtn2          v19.8h,  v23.4s
+        sqxtn           v20.4h,  v24.4s
+        sqxtn2          v20.8h,  v25.4s
+        sqxtn           v21.4h,  v26.4s
+        sqxtn2          v21.8h,  v27.4s
+        sqxtn           v22.4h,  v28.4s
+        sqxtn2          v22.8h,  v29.4s
+        sqxtn           v23.4h,  v30.4s
+        sqxtn2          v23.8h,  v31.4s
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+.if \w == 16
+        load_add_store_8x8 x0, x7, shiftbits=2
+.else
+        load_add_store_8x8 x0, x7, shiftbits=4
+.endif
+        ldrh            w11, [x12], #4  // next threshold; the step of 4 bytes skips every other entry
+        sub             x0,  x0,  x1, lsl #3  // 8 destination rows back up (presumably undoing the advance made by load_add_store_8x8)
+        add             x0,  x0,  #16  // 8 pixels of 2 bytes to the right
+        cmp             w3,  w11
+        b.ge            2b  // continue along this band of 8 rows while w3 >= threshold
+
+        ldrh            w11, [x13], #4  // threshold deciding whether another band follows
+        cmp             w3,  w11
+        b.lt            9f  // w3 below it: done
+
+        sub             x0,  x0,  w9, uxtw #1  // move back left by w9 pixels of 2 bytes
+        add             x0,  x0,  x1, lsl #3  // and down 8 rows
+        msub            x2,  x8,  x9,  x2  // x2 -= x8 * x9: rewind the coefficient pointer by w9 strides
+        add             x2,  x2,  #4*8  // then advance by 8 coefficients of 4 bytes
+        b               1b
+9:
+        ret
+endfunc
+.endm
+// The last two arguments pick the _shortside table for the across loop (16x32) or the downward loop (32x16).
+def_identity_1632 16, 32, _shortside,
+def_identity_1632 32, 16, , _shortside
+
+.macro def_identity_832 w, h
+function inv_txfm_add_identity_identity_\w\()x\h\()_16bpc_neon, export=1  // 8x32 / 32x8 identity/identity: narrow, transpose and add one 8x8 tile per iteration; w3 is checked against eob_8x32 to stop early
+        movi            v0.4s,  #0  // v0/v1: zeros for clearing the coefficient buffer
+        movi            v1.4s,  #0
+        // Working on 8x8 blocks, read every other entry from eob_8x32
+        movrel          x13, eob_8x32, 2
+
+        mov             w8,  #4*\h  // stride in bytes; writing w8 also clears the upper half of x8, which is the post-index register below
+1:
+        // Working on 8x8 blocks, read every other entry from eob_8x32
+        ldrh            w12, [x13], #4  // threshold tested after this tile
+        ld1             {v16.4s, v17.4s}, [x2]  // eight 32-bit coefficients
+        st1             {v0.4s, v1.4s},   [x2], x8  // clear them and step by one stride
+        ld1             {v18.4s, v19.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v20.4s, v21.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v22.4s, v23.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v24.4s, v25.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v26.4s, v27.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v28.4s, v29.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+        ld1             {v30.4s, v31.4s}, [x2]
+        st1             {v0.4s, v1.4s},   [x2], x8
+
+.if \w == 8
+        sqrshrn         v16.4h,  v16.4s,  #1  // 8x32: rounding shift right by 1 while narrowing to 16 bit
+        sqrshrn2        v16.8h,  v17.4s,  #1
+        sqrshrn         v17.4h,  v18.4s,  #1
+        sqrshrn2        v17.8h,  v19.4s,  #1
+        sqrshrn         v18.4h,  v20.4s,  #1
+        sqrshrn2        v18.8h,  v21.4s,  #1
+        sqrshrn         v19.4h,  v22.4s,  #1
+        sqrshrn2        v19.8h,  v23.4s,  #1
+        sqrshrn         v20.4h,  v24.4s,  #1
+        sqrshrn2        v20.8h,  v25.4s,  #1
+        sqrshrn         v21.4h,  v26.4s,  #1
+        sqrshrn2        v21.8h,  v27.4s,  #1
+        sqrshrn         v22.4h,  v28.4s,  #1
+        sqrshrn2        v22.8h,  v29.4s,  #1
+        sqrshrn         v23.4h,  v30.4s,  #1
+        sqrshrn2        v23.8h,  v31.4s,  #1
+.else
+        sqxtn           v16.4h,  v16.4s  // 32x8: plain saturating narrow to 16 bit
+        sqxtn2          v16.8h,  v17.4s
+        sqxtn           v17.4h,  v18.4s
+        sqxtn2          v17.8h,  v19.4s
+        sqxtn           v18.4h,  v20.4s
+        sqxtn2          v18.8h,  v21.4s
+        sqxtn           v19.4h,  v22.4s
+        sqxtn2          v19.8h,  v23.4s
+        sqxtn           v20.4h,  v24.4s
+        sqxtn2          v20.8h,  v25.4s
+        sqxtn           v21.4h,  v26.4s
+        sqxtn2          v21.8h,  v27.4s
+        sqxtn           v22.4h,  v28.4s
+        sqxtn2          v22.8h,  v29.4s
+        sqxtn           v23.4h,  v30.4s
+        sqxtn2          v23.8h,  v31.4s
+.endif
+
+        transpose_8x8h  v16, v17, v18, v19, v20, v21, v22, v23, v4, v5
+
+
+        cmp             w3,  w12  // flags are consumed by the b.lt below, so load_add_store_8x8 must leave them untouched
+.if \w == 8
+        load_add_store_8x8 x0, x7, shiftbits=2
+.else
+        load_add_store_8x8 x0, x7, shiftbits=3
+.endif
+
+        b.lt            9f  // w3 below the threshold: no more tiles
+.if \w == 8
+        sub             x2,  x2,  x8, lsl #3  // 8x32: rewind the 8 strides
+        add             x2,  x2,  #4*8  // and move on by 8 coefficients of 4 bytes
+.else
+        sub             x0,  x0,  x1, lsl #3  // 32x8: 8 destination rows back up
+        add             x0,  x0,  #2*8  // and 8 pixels of 2 bytes to the right
+.endif
+        b               1b
+
+9:
+        ret
+endfunc
+.endm
+
+def_identity_832 8, 32
+def_identity_832 32, 8
+
+function inv_txfm_add_dct_dct_32x32_16bpc_neon, export=1  // 32x32 dct/dct: first pass in strips of 4 into a stack buffer, second pass in slices of 8 columns added at x0
+        idct_dc         32,  32,  2  // macro defined outside this chunk
+
+        mov             x15, x30  // keep the return address: the bl calls below overwrite x30
+        sub             sp,  sp,  #2048  // intermediate buffer: 32*32 values of 2 bytes
+        movrel          x13, eob_32x32
+        ldrh            w12, [x13], #2  // first threshold
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  sp,  #(\i*32*2)  // buffer position for this strip (64 bytes per buffer row)
+.if \i > 0
+        mov             w8,  #(32 - \i)  // buffer rows still unwritten, used by the zero fill at 1 below
+        cmp             w3,  w12
+        b.lt            1f  // w3 below the threshold: skip this and all later strips
+.if \i < 28
+        ldrh            w12, [x13], #2  // threshold for the next strip
+.endif
+.endif
+        add             x7,  x2,  #(\i*4)  // input for this strip: offset in 4-byte coefficients
+        mov             x8,  #32*4  // input stride in bytes (replaces the row count, which is no longer needed)
+        bl              inv_txfm_horz_dct_32x4_neon
+.endr
+        b               3f  // all eight strips were transformed
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4  // four buffer rows per iteration
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64  // 64 bytes of zeros = one buffer row
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x6,  x0,  #(\i*2)  // destination column offset, 2 bytes per pixel
+        add             x7,  sp,  #(\i*2)  // same column offset in the intermediate buffer
+        mov             x8,  #32*2  // buffer stride in bytes
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  sp,  #2048  // release the buffer
+        br              x15  // return through the address saved at entry
+endfunc
+
+function inv_txfm_add_dct_dct_16x32_16bpc_neon, export=1  // 16x32 dct/dct: first pass in strips of 4 into a stack buffer, second pass in slices of 8 columns added at x0
+        idct_dc         16,  32,  1  // macro defined outside this chunk
+
+        mov             x15, x30  // keep the return address: the bl calls below overwrite x30
+        sub             sp,  sp,  #1024  // intermediate buffer: 16*32 values of 2 bytes
+        movrel          x13, eob_16x32
+        ldrh            w12, [x13], #2  // first threshold
+        adr             x4,  inv_dct_4s_x16_neon  // x4 = transform address, presumably called by inv_txfm_horz_scale_16x4_neon (defined outside this chunk)
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  sp,  #(\i*16*2)  // buffer position for this strip (32 bytes per buffer row)
+        add             x7,  x2,  #(\i*4)  // input for this strip: offset in 4-byte coefficients
+.if \i > 0
+        mov             w8,  #(32 - \i)  // buffer rows still unwritten, used by the zero fill at 1 below
+        cmp             w3,  w12
+        b.lt            1f  // w3 below the threshold: skip this and all later strips
+.if \i < 28
+        ldrh            w12, [x13], #2  // threshold for the next strip
+.endif
+.endif
+        mov             x8,  #4*32  // input stride in bytes (replaces the row count, which is no longer needed)
+        bl              inv_txfm_horz_scale_16x4_neon
+.endr
+        b               3f  // all eight strips were transformed
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4  // four buffer rows per iteration
+.rept 2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64  // 64 bytes of zeros = two buffer rows
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8
+        add             x6,  x0,  #(\i*2)  // destination column offset, 2 bytes per pixel
+        add             x7,  sp,  #(\i*2)  // same column offset in the intermediate buffer
+        mov             x8,  #16*2  // buffer stride in bytes
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  sp,  #1024  // release the buffer
+        br              x15  // return through the address saved at entry
+endfunc
+
+function inv_txfm_add_dct_dct_32x16_16bpc_neon, export=1  // 32x16 dct/dct: first pass in strips of 4 into a stack buffer, second pass in slices of 8 columns added at x0
+        idct_dc         32,  16,  1  // macro defined outside this chunk
+
+        mov             x15, x30  // keep the return address: the bl calls below overwrite x30
+        sub             sp,  sp,  #1024  // intermediate buffer: 32*16 values of 2 bytes
+
+        movrel          x13, eob_16x32
+        movrel          x5,  X(inv_dct_8h_x16_neon)  // x5 = transform address, presumably called by inv_txfm_add_vert_8x16_neon (defined outside this chunk)
+        ldrh            w12, [x13], #2  // first threshold
+
+.irp i, 0, 4, 8, 12
+        add             x6,  sp,  #(\i*32*2)  // buffer position for this strip (64 bytes per buffer row)
+        add             x7,  x2,  #(\i*4)  // input for this strip: offset in 4-byte coefficients
+.if \i > 0
+        mov             w8,  #(16 - \i)  // buffer rows still unwritten, used by the zero fill at 1 below
+        cmp             w3,  w12
+        b.lt            1f  // w3 below the threshold: skip this and all later strips
+        ldrh            w12, [x13], #2  // next threshold (the value loaded after the last strip is not used)
+.endif
+        mov             x8,  #4*16  // input stride in bytes (replaces the row count, which is no longer needed)
+        bl              inv_txfm_horz_scale_dct_32x4_neon
+.endr
+        b               3f  // all four strips were transformed
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4  // four buffer rows per iteration
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64  // 64 bytes of zeros = one buffer row
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x6,  x0,  #(\i*2)  // destination column offset, 2 bytes per pixel
+        add             x7,  sp,  #(\i*2)  // same column offset in the intermediate buffer
+        mov             x8,  #32*2  // buffer stride in bytes
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  sp,  #1024  // release the buffer
+        br              x15  // return through the address saved at entry
+endfunc
+
+function inv_txfm_add_dct_dct_8x32_16bpc_neon, export=1
+        idct_dc         8,   32, 2
+        // First pass fills a 512-byte stack buffer with 16-bit data; inv_txfm_add_vert_dct_8x32_neon then reads it.
+        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
+        sub             sp,  sp,  #512
+
+        movrel          x13, eob_8x32
+
+        movi            v28.4s,  #0 // zero vector, stored over each input vector right after it is loaded
+        mov             x8,  #4*32 // byte step between successive loads
+        mov             w9,  #32 // remaining count, reduced by 4 per loop iteration
+        mov             x6,  sp // output cursor into the stack buffer
+        mov             x7,  x2 // input cursor
+1:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().4s}, [x7]
+        st1             {v28.4s}, [x7], x8
+.endr
+        ldrh            w12, [x13], #2 // next 16-bit threshold from eob_8x32
+        sub             w9,  w9,  #4
+        sub             x7,  x7,  x8, lsl #3 // undo the eight post-increments above
+        add             x7,  x7,  #4*4 // then step 16 bytes forward for the next iteration
+
+        bl              inv_dct_4s_x8_neon
+
+        sqrshrn         v16.4h,  v16.4s,  #2 // saturating rounding narrow (>> 2) of v16-v19 into the low halves
+        sqrshrn         v17.4h,  v17.4s,  #2
+        sqrshrn         v18.4h,  v18.4s,  #2
+        sqrshrn         v19.4h,  v19.4s,  #2
+        sqrshrn2        v16.8h,  v20.4s,  #2 // v20-v23 are narrowed into the high halves of v16-v19
+        sqrshrn2        v17.8h,  v21.4s,  #2
+        sqrshrn2        v18.8h,  v22.4s,  #2
+        sqrshrn2        v19.8h,  v23.4s,  #2
+
+        transpose_4x8h  v16, v17, v18, v19, v2,  v3,  v4,  v5
+
+        cmp             w3,  w12 // NOTE(review): w3 is presumably the eob argument - confirm
+        st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x6], #64 // st1 leaves the flags from cmp intact
+
+        b.ge            1b // keep transforming while w3 >= threshold
+        cbz             w9,  3f // nothing left to zero-fill
+
+        movi            v29.8h,  #0 // v28 is not re-zeroed here; presumably still zero after the calls above - TODO confirm
+        movi            v30.8h,  #0
+        movi            v31.8h,  #0
+2:
+        subs            w9,  w9,  #4 // 64 zero bytes are written per 4 counts
+        st1             {v28.8h,v29.8h,v30.8h,v31.8h}, [x6], #64
+        b.gt            2b
+
+3:
+        mov             x6,  x0
+        mov             x7,  sp
+        mov             x8,  #8*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+
+        add             sp,  sp,  #512 // release the stack buffer
+        br              x15 // return through the address saved on entry
+endfunc
+
+function inv_txfm_add_dct_dct_32x8_16bpc_neon, export=1
+        idct_dc         32,  8,   2
+        // Up to two horizontal 32x4 strips go to a 512-byte stack buffer, then four 8x8 blocks are transformed and added.
+        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
+        sub             sp,  sp,  #512
+
+.irp i, 0, 4
+        add             x6,  sp,  #(\i*32*2) // x6 = position of strip i in the stack buffer
+        add             x7,  x2,  #(\i*4) // x7 = x2 advanced by i*4 bytes
+.if \i > 0
+        cmp             w3,  #10 // NOTE(review): w3 is presumably the eob argument - confirm
+        b.lt            1f // below 10: zero-fill the second strip instead of transforming it
+.endif
+        mov             x8,  #8*4
+        bl              inv_txfm_horz_dct_32x4_neon
+.endr
+        b               2f
+
+1:
+        movi            v4.8h,   #0
+        movi            v5.8h,   #0
+        movi            v6.8h,   #0
+        movi            v7.8h,   #0
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+
+2:
+        mov             x8,  #2*32 // byte step between the eight loads below
+        mov             w9,  #0 // element offset, advanced by 8 per iteration up to 32
+1:
+        add             x6,  x0,  x9, lsl #1
+        add             x7,  sp,  x9, lsl #1 // #(\i*2)
+
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x7], x8
+.endr
+        add             w9,  w9,  #8
+
+        bl              X(inv_dct_8h_x8_neon)
+
+        cmp             w9,  #32 // flags are consumed by b.lt after the macro; presumably the macro leaves flags alone
+
+        load_add_store_8x8 x6, x7
+
+        b.lt            1b
+
+        add             sp,  sp,  #512 // release the stack buffer
+        br              x15 // return through the address saved on entry
+endfunc
+
+function inv_dct64_step1_neon
+        // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
+        // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
+        // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
+        // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
+        // Inputs are v16-v19; constants are read from x17 (advanced 48 bytes); eight result vectors go to x6 (advanced 128 bytes).
+        ld1             {v0.4s, v1.4s}, [x17], #32 // eight 32-bit multipliers for the sqrdmulh group
+
+        sqrdmulh        v23.4s,  v16.4s,  v0.s[1]       // t63a
+        sqrdmulh        v16.4s,  v16.4s,  v0.s[0]       // t32a
+        sqrdmulh        v22.4s,  v17.4s,  v0.s[2]       // t62a
+        sqrdmulh        v17.4s,  v17.4s,  v0.s[3]       // t33a
+        sqrdmulh        v21.4s,  v18.4s,  v1.s[1]       // t61a
+        sqrdmulh        v18.4s,  v18.4s,  v1.s[0]       // t34a
+        sqrdmulh        v20.4s,  v19.4s,  v1.s[2]       // t60a
+        sqrdmulh        v19.4s,  v19.4s,  v1.s[3]       // t35a
+
+        ld1             {v0.4s}, [x17], #16 // four more multipliers, used by the mul_mla/mul_mls pairs below
+
+        sqadd           v24.4s,  v16.4s,  v17.4s        // t32
+        sqsub           v25.4s,  v16.4s,  v17.4s        // t33
+        sqsub           v26.4s,  v19.4s,  v18.4s        // t34
+        sqadd           v27.4s,  v19.4s,  v18.4s        // t35
+        sqadd           v28.4s,  v20.4s,  v21.4s        // t60
+        sqsub           v29.4s,  v20.4s,  v21.4s        // t61
+        sqsub           v30.4s,  v23.4s,  v22.4s        // t62
+        sqadd           v31.4s,  v23.4s,  v22.4s        // t63
+
+        mul_mla         v2,  v29, v26, v0.s[0], v0.s[1] // -> t34a
+        mul_mls         v4,  v29, v26, v0.s[1], v0.s[0] // -> t61a
+        neg             v2.4s,   v2.4s                  // t34a
+        mul_mls         v6,  v30, v25, v0.s[1], v0.s[0] // -> t33a
+        srshr           v26.4s, v2.4s,  #12             // t34a
+        mul_mla         v2,  v30, v25, v0.s[0], v0.s[1] // -> t62a
+        srshr           v29.4s, v4.4s,  #12             // t61a
+        srshr           v25.4s, v6.4s,  #12             // t33a
+        srshr           v30.4s, v2.4s,  #12             // t62a
+
+        sqadd           v16.4s,  v24.4s,  v27.4s        // t32a
+        sqsub           v19.4s,  v24.4s,  v27.4s        // t35a
+        sqadd           v17.4s,  v25.4s,  v26.4s        // t33
+        sqsub           v18.4s,  v25.4s,  v26.4s        // t34
+        sqsub           v20.4s,  v31.4s,  v28.4s        // t60a
+        sqadd           v23.4s,  v31.4s,  v28.4s        // t63a
+        sqsub           v21.4s,  v30.4s,  v29.4s        // t61
+        sqadd           v22.4s,  v30.4s,  v29.4s        // t62
+
+        mul_mla         v2,  v21, v18, v0.s[2], v0.s[3] // -> t61a
+        mul_mls         v4,  v21, v18, v0.s[3], v0.s[2] // -> t34a
+        mul_mla         v6,  v20, v19, v0.s[2], v0.s[3] // -> t60
+        srshr           v21.4s, v2.4s,  #12             // t61a
+        srshr           v18.4s, v4.4s,  #12             // t34a
+        mul_mls         v2,  v20, v19, v0.s[3], v0.s[2] // -> t35
+        srshr           v20.4s, v6.4s,  #12             // t60
+        srshr           v19.4s, v2.4s,  #12             // t35
+
+        st1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x6], #64 // first four results
+        st1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x6], #64 // last four results
+
+        ret
+endfunc
+
+function inv_dct64_step2_neon
+        movrel          x16, idct_coeffs
+        ld1             {v0.4s}, [x16] // first four 32-bit entries of idct_coeffs
+1:
+        // t32a/33/34a/35/60/61a/62/63a
+        // t56a/57/58a/59/36/37a/38/39a
+        // t40a/41/42a/43/52/53a/54/55a
+        // t48a/49/50a/51/44/45a/46/47a
+        ldr             q16, [x6, #4*4*0]  // t32a
+        ldr             q17, [x9, #4*4*8]  // t39a
+        ldr             q18, [x9, #4*4*0]  // t63a
+        ldr             q19, [x6, #4*4*8]  // t56a
+        ldr             q20, [x6, #4*4*16] // t40a
+        ldr             q21, [x9, #4*4*24] // t47a
+        ldr             q22, [x9, #4*4*16] // t55a
+        ldr             q23, [x6, #4*4*24] // t48a
+
+        sqadd           v24.4s,  v16.4s, v17.4s         // t32
+        sqsub           v25.4s,  v16.4s, v17.4s         // t39
+        sqadd           v26.4s,  v18.4s, v19.4s         // t63
+        sqsub           v27.4s,  v18.4s, v19.4s         // t56
+        sqsub           v28.4s,  v21.4s, v20.4s         // t40
+        sqadd           v29.4s,  v21.4s, v20.4s         // t47
+        sqadd           v30.4s,  v23.4s, v22.4s         // t48
+        sqsub           v31.4s,  v23.4s, v22.4s         // t55
+
+        mul_mla         v2,  v27, v25, v0.s[3], v0.s[2] // -> t56a
+        mul_mls         v4,  v27, v25, v0.s[2], v0.s[3] // -> t39a
+        mul_mla         v6,  v31, v28, v0.s[3], v0.s[2] // -> t40a
+        srshr           v25.4s, v2.4s,  #12             // t56a
+        srshr           v27.4s, v4.4s,  #12             // t39a
+        neg             v6.4s,   v6.4s                  // t40a
+        mul_mls         v2,  v31, v28, v0.s[2], v0.s[3] // -> t55a
+        srshr           v31.4s, v6.4s,  #12             // t40a
+        srshr           v28.4s, v2.4s,  #12             // t55a
+
+        sqadd           v16.4s,  v24.4s,  v29.4s        // t32a
+        sqsub           v19.4s,  v24.4s,  v29.4s        // t47a
+        sqadd           v17.4s,  v27.4s,  v31.4s        // t39
+        sqsub           v18.4s,  v27.4s,  v31.4s        // t40
+        sqsub           v20.4s,  v26.4s,  v30.4s        // t48a
+        sqadd           v23.4s,  v26.4s,  v30.4s        // t63a
+        sqsub           v21.4s,  v25.4s,  v28.4s        // t55
+        sqadd           v22.4s,  v25.4s,  v28.4s        // t56
+
+        mul_mls         v2,  v21, v18, v0.s[0], v0.s[0] // -> t40a
+        mul_mla         v4,  v21, v18, v0.s[0], v0.s[0] // -> t55a
+        mul_mls         v6,  v20, v19, v0.s[0], v0.s[0] // -> t47
+        srshr           v18.4s, v2.4s,  #12             // t40a
+        srshr           v21.4s, v4.4s,  #12             // t55a
+        mul_mla         v2,  v20, v19, v0.s[0], v0.s[0] // -> t48
+        srshr           v19.4s, v6.4s,  #12             // t47
+        srshr           v20.4s, v2.4s,  #12             // t48
+
+        str             q16, [x6, #4*4*0]  // t32a
+        str             q17, [x9, #4*4*0]  // t39
+        str             q18, [x6, #4*4*8]  // t40a
+        str             q19, [x9, #4*4*8]  // t47
+        str             q20, [x6, #4*4*16] // t48
+        str             q21, [x9, #4*4*16] // t55a
+        str             q22, [x6, #4*4*24] // t56
+        str             q23, [x9, #4*4*24] // t63a
+
+        add             x6,  x6,  #4*4 // x6 walks up one vector per iteration
+        sub             x9,  x9,  #4*4 // x9 walks down one vector per iteration
+        cmp             x6,  x9
+        b.lt            1b // results are written in place; stop once the two cursors meet or cross
+        ret
+endfunc
+
+.macro load8 src, strd, zero, clear
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s
+.if \clear
+        ld1             {\i}, [\src] // load one vector into each of v16-v23 in turn
+        st1             {\zero}, [\src], \strd // overwrite what was just read with the zero register, then step by strd
+.else
+        ld1             {\i}, [\src], \strd // plain load with post-increment by strd
+.endif
+.endr
+.endm
+
+.macro store16 dst
+.irp i, v16.4s, v17.4s, v18.4s, v19.4s, v20.4s, v21.4s, v22.4s, v23.4s, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        st1             {\i}, [\dst], #16 // store v16-v31 back to back; dst advances 256 bytes in total
+.endr
+.endm
+
+.macro clear_upper8
+.irp i, v24.4s, v25.4s, v26.4s, v27.4s, v28.4s, v29.4s, v30.4s, v31.4s
+        movi            \i,  #0 // zero each of v24-v31
+.endr
+.endm
+
+.macro movi_if reg, val, cond
+.if \cond
+        movi            \reg, \val // emitted only when cond is non-zero at assembly time
+.endif
+.endm
+
+.macro movz16dup_if reg, gpr, val, cond
+.if \cond
+        movz            \gpr, \val, lsl #16 // gpr = val << 16; emitted only when cond is non-zero
+        dup             \reg, \gpr // broadcast gpr to every lane of reg
+.endif
+.endm
+
+.macro st1_if regs, dst, cond
+.if \cond
+        st1             \regs, \dst // emitted only when cond is non-zero at assembly time
+.endif
+.endm
+
+.macro str_if reg, dst, cond
+.if \cond
+        str             \reg, \dst // emitted only when cond is non-zero at assembly time
+.endif
+.endm
+
+.macro stroff_if reg, dst, dstoff, cond
+.if \cond
+        str             \reg, \dst, \dstoff // dst and dstoff are the two comma-separated halves of one [base, offset] operand
+.endif
+.endm
+
+.macro scale_if cond, c, r0, r1, r2, r3, r4, r5, r6, r7
+.if \cond
+        scale_input     .4s, \c, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7 // forwarded to scale_input only when cond is non-zero
+.endif
+.endm
+
+.macro def_dct64_func suffix, clear=0, scale=0
+function inv_txfm_dct\suffix\()_4s_x64_neon
+        mov             x14, x30 // keep the return address; the bl calls below overwrite x30
+        mov             x6,  sp // results are written starting at the current stack pointer
+        lsl             x8,  x8,  #2 // work with four times the stride passed in x8
+
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        load8           x7,  x8,  v7.4s, \clear
+        clear_upper8
+        sub             x7,  x7,  x8, lsl #3 // undo the eight post-increments done by load8
+        add             x7,  x7,  x8, lsr #1 // then advance by half of the scaled stride
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        bl              inv_dct_4s_x16_neon
+
+        store16         x6
+
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.8h,  #0, \clear // .8h here, .4s elsewhere; an all-zero register either way
+        load8           x7,  x8,  v7.4s, \clear
+        clear_upper8
+        sub             x7,  x7,  x8, lsl #3 // undo the eight post-increments done by load8
+        lsr             x8,  x8,  #1 // halve the stride for the remaining loads
+        sub             x7,  x7,  x8, lsr #1
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19, v20, v21, v22, v23
+
+        bl              inv_dct32_odd_4s_x16_neon
+
+        add             x10, x6,  #16*15 // x10 = last of the next 16 vector slots, walked downwards
+        sub             x6,  x6,  #16*16 // x6 = back at the 16 vectors written by store16
+
+        mov             x9,  #-16 // post-increment used for the descending stores
+
+.macro store_addsub r0, r1, r2, r3
+        ld1             {v2.4s}, [x6], #16
+        ld1             {v3.4s}, [x6], #16
+        sqadd           v6.4s,  v2.4s,  \r0
+        sqsub           \r0,    v2.4s,  \r0
+        ld1             {v4.4s}, [x6], #16
+        sqadd           v7.4s,  v3.4s,  \r1
+        sqsub           \r1,    v3.4s,  \r1
+        ld1             {v5.4s}, [x6], #16
+        sqadd           v2.4s,  v4.4s,  \r2
+        sub             x6,  x6,  #16*4 // rewind so the sums overwrite the four vectors just read
+        sqsub           \r2,    v4.4s,  \r2
+        st1             {v6.4s}, [x6], #16
+        st1             {\r0},   [x10], x9 // differences are written from x10 downwards
+        sqadd           v3.4s,  v5.4s,  \r3
+        sqsub           \r3,    v5.4s,  \r3
+        st1             {v7.4s}, [x6], #16
+        st1             {\r1},   [x10], x9
+        st1             {v2.4s}, [x6], #16
+        st1             {\r2},   [x10], x9
+        st1             {v3.4s}, [x6], #16
+        st1             {\r3},   [x10], x9
+.endm
+        store_addsub    v31.4s, v30.4s, v29.4s, v28.4s
+        store_addsub    v27.4s, v26.4s, v25.4s, v24.4s
+        store_addsub    v23.4s, v22.4s, v21.4s, v20.4s
+        store_addsub    v19.4s, v18.4s, v17.4s, v16.4s
+.purgem store_addsub
+
+        add             x6,  x6,  #4*4*16 // skip the 16 difference vectors; step1 output starts here
+
+        movrel          x17, idct64_coeffs
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        add             x9,  x7,  x8, lsl #4 // offset 16
+        add             x10, x7,  x8, lsl #3 // offset 8
+        sub             x9,  x9,  x8         // offset 15
+        sub             x11, x10, x8         // offset 7
+        ld1             {v16.4s}, [x7]  // in1  (offset 0)
+        ld1             {v17.4s}, [x9]  // in31 (offset 15)
+        ld1             {v18.4s}, [x10] // in17 (offset 8)
+        ld1             {v19.4s}, [x11] // in15 (offset 7)
+        st1_if          {v7.4s}, [x7],  \clear
+        st1_if          {v7.4s}, [x9],  \clear
+        st1_if          {v7.4s}, [x10], \clear
+        st1_if          {v7.4s}, [x11], \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        add             x7,  x7,  x8, lsl #2 // offset 4
+        sub             x9,  x9,  x8, lsl #2 // offset 11
+        sub             x10, x7,  x8         // offset 3
+        add             x11, x9,  x8         // offset 12
+        ld1             {v16.4s}, [x10] // in7  (offset 3)
+        ld1             {v17.4s}, [x11] // in25 (offset 12)
+        ld1             {v18.4s}, [x9]  // in23 (offset 11)
+        ld1             {v19.4s}, [x7]  // in9  (offset 4)
+        st1_if          {v7.4s}, [x7],  \clear
+        st1_if          {v7.4s}, [x9],  \clear
+        st1_if          {v7.4s}, [x10], \clear
+        st1_if          {v7.4s}, [x11], \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        sub             x10, x10, x8, lsl #1 // offset 1
+        sub             x9,  x9,  x8, lsl #1 // offset 9
+        add             x7,  x7,  x8         // offset 5
+        add             x11, x11, x8         // offset 13
+        ldr             q16, [x10, x8] // in5  (offset 2)
+        ldr             q17, [x11]     // in27 (offset 13)
+        ldr             q18, [x9,  x8] // in21 (offset 10)
+        ldr             q19, [x7]      // in11 (offset 5)
+        stroff_if       q7,  [x10, x8], \clear
+        str_if          q7,  [x11],     \clear
+        stroff_if       q7,  [x9,  x8], \clear
+        str_if          q7,  [x7],      \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+        movz16dup_if    v0.2s, w16, #2896*8, \scale
+        movi_if         v7.4s,  #0, \clear
+        ldr             q16, [x10]     // in3  (offset 1)
+        ldr             q17, [x11, x8] // in29 (offset 14)
+        ldr             q18, [x9]      // in19 (offset 9)
+        ldr             q19, [x7,  x8] // in13 (offset 6)
+        str_if          q7,  [x10],     \clear
+        stroff_if       q7,  [x11, x8], \clear
+        str_if          q7,  [x9],      \clear
+        stroff_if       q7,  [x7,  x8], \clear
+        scale_if        \scale, v0.s[0], v16, v17, v18, v19
+        bl              inv_dct64_step1_neon
+
+        sub             x6,  x6,  #4*4*32 // back to the start of the 32 vectors written by the four step1 calls
+        add             x9,  x6,  #4*4*7 // descending cursor for step2, seven vectors above x6
+
+        bl              inv_dct64_step2_neon
+
+        br              x14 // return through the address saved on entry
+endfunc
+.endm
+
+def_dct64_func _clear, clear=1
+def_dct64_func _clear_scale, clear=1, scale=1
+
+
+function inv_txfm_horz_dct_64x4_neon
+        mov             x14, x30 // keep the return address for the final br
+        // Reads 32-bit vectors from the stack at both ends, writes sums forward from x6 and reversed differences backward from x6 + 2*56.
+        mov             x7,  sp // ascending read cursor
+        add             x8,  sp,  #4*4*(64 - 4) // descending read cursor, starting at the last group of four vectors
+        add             x9,  x6,  #2*56 // output cursor for the mirrored half
+        mov             x10, #2*64 // byte step between the four stores made through each output cursor
+        mov             x11, #-4*4*4 // step of the descending read cursor
+
+        dup             v7.4s,  w12 // per-lane shift for srshl; callers in this file pass -1 or -2, i.e. a rounding right shift
+1:
+        ld1             {v16.4s, v17.4s, v18.4s, v19.4s}, [x7], #64
+        ld1             {v28.4s, v29.4s, v30.4s, v31.4s}, [x8], x11
+        ld1             {v20.4s, v21.4s, v22.4s, v23.4s}, [x7], #64
+        ld1             {v24.4s, v25.4s, v26.4s, v27.4s}, [x8], x11
+        transpose_4x4s  v16, v17, v18, v19, v2,  v3,  v4,  v5
+        transpose_4x4s  v20, v21, v22, v23, v2,  v3,  v4,  v5
+        transpose_4x4s  v31, v30, v29, v28, v2,  v3,  v4,  v5
+        transpose_4x4s  v27, v26, v25, v24, v2,  v3,  v4,  v5
+
+.macro store_addsub src0, src1, src2, src3
+        sqsub           v1.4s,   \src0,   \src1
+        sqadd           v0.4s,   \src0,   \src1
+        sqsub           v3.4s,   \src2,   \src3
+        srshl           v1.4s,   v1.4s,   v7.4s
+        sqadd           v2.4s,   \src2,   \src3
+        srshl           v3.4s,   v3.4s,   v7.4s
+        srshl           v0.4s,   v0.4s,   v7.4s
+        srshl           v2.4s,   v2.4s,   v7.4s
+        sqxtn           v3.4h,   v3.4s // differences are packed in swapped order before the rev64 below
+        sqxtn2          v3.8h,   v1.4s
+        sqxtn           v0.4h,   v0.4s // sums are narrowed to 16 bits with saturation
+        sqxtn2          v0.8h,   v2.4s
+        rev64           v3.8h,   v3.8h // reverse the 16-bit lanes within each 64-bit half
+        st1             {v0.8h},  [x6], x10
+        st1             {v3.8h},  [x9], x10
+.endm
+        store_addsub    v16.4s,  v31.4s,  v20.4s,  v27.4s
+        store_addsub    v17.4s,  v30.4s,  v21.4s,  v26.4s
+        store_addsub    v18.4s,  v29.4s,  v22.4s,  v25.4s
+        store_addsub    v19.4s,  v28.4s,  v23.4s,  v24.4s
+.purgem store_addsub
+        sub             x6,  x6,  x10, lsl #2 // undo the four post-increments of the forward cursor
+        sub             x9,  x9,  x10, lsl #2 // undo the four post-increments of the mirrored cursor
+        add             x6,  x6,  #16 // forward cursor moves 16 bytes right
+        sub             x9,  x9,  #16 // mirrored cursor moves 16 bytes left
+
+        cmp             x7,  x8
+        b.lt            1b // loop until the two read cursors meet or cross
+        br              x14
+endfunc
+
+function inv_txfm_add_vert_dct_8x64_neon
+        mov             x14, x30 // keep the return address for the final br
+        lsl             x8,  x8,  #1 // NOTE(review): x8 is overwritten two instructions below before any read in this function
+        // Reads 16-bit vectors from the stack at both ends, adds them to rows at x6 (top down) and x9 (bottom up), clamps and stores.
+        mov             x7,  sp // ascending read cursor
+        add             x8,  sp,  #2*8*(64 - 4) // descending read cursor, starting at the last group of four vectors
+        add             x9,  x6,  x1, lsl #6 // x6 + 64*x1 ...
+        sub             x9,  x9,  x1 // ... minus x1: x9 = x6 + 63*x1
+        neg             x10, x1 // step for walking x9 backwards
+        mov             x11, #-2*8*4 // step of the descending read cursor
+
+1:
+        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x7], #64
+        ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x11
+        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
+        ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x8], x11
+
+        movi            v6.8h,   #0 // lower clamp bound
+        mvni            v7.8h,   #0xfc, lsl #8 // 0x3ff
+.macro add_dest_addsub src0, src1, src2, src3
+        ld1             {v0.8h}, [x6], x1
+        ld1             {v1.8h}, [x9], x10
+        sqadd           v4.8h,   \src0,   \src1
+        ld1             {v2.8h}, [x6]
+        sqsub           \src0,   \src0,   \src1
+        ld1             {v3.8h}, [x9]
+        sqadd           v5.8h,   \src2,   \src3
+        sqsub           \src2,   \src2,   \src3
+        sub             x6,  x6,  x1 // step back to the first of the two rows just read
+        sub             x9,  x9,  x10 // same for the bottom-up cursor
+        srshr           v4.8h,   v4.8h,   #4
+        srshr           v5.8h,   v5.8h,   #4
+        srshr           \src0,   \src0,   #4
+        sqadd           v0.8h,   v0.8h,   v4.8h
+        srshr           \src2,   \src2,   #4
+        sqadd           v1.8h,   v1.8h,   \src0
+        sqadd           v2.8h,   v2.8h,   v5.8h
+        smax            v0.8h,   v0.8h,   v6.8h
+        sqadd           v3.8h,   v3.8h,   \src2
+        smax            v1.8h,   v1.8h,   v6.8h
+        smin            v0.8h,   v0.8h,   v7.8h
+        smax            v2.8h,   v2.8h,   v6.8h
+        smin            v1.8h,   v1.8h,   v7.8h
+        st1             {v0.8h}, [x6], x1
+        smax            v3.8h,   v3.8h,   v6.8h
+        smin            v2.8h,   v2.8h,   v7.8h
+        st1             {v1.8h}, [x9], x10
+        smin            v3.8h,   v3.8h,   v7.8h
+        st1             {v2.8h}, [x6], x1
+        st1             {v3.8h}, [x9], x10
+.endm
+        add_dest_addsub v16.8h,  v31.8h,  v17.8h,  v30.8h
+        add_dest_addsub v18.8h,  v29.8h,  v19.8h,  v28.8h
+        add_dest_addsub v20.8h,  v27.8h,  v21.8h,  v26.8h
+        add_dest_addsub v22.8h,  v25.8h,  v23.8h,  v24.8h
+.purgem add_dest_addsub
+        cmp             x7,  x8
+        b.lt            1b // loop until the two read cursors meet or cross
+
+        br              x14
+endfunc
+
+.macro sub_sp space
+#ifdef _WIN32
+.if (\space) > 8192
+        // Here, we'd need to touch two (or more) pages while decrementing
+        // the stack pointer. The argument is parenthesized wherever it is used in an expression.
+        .error          "sub_sp doesn't support values over 8K at the moment"
+.elseif (\space) > 4096
+        sub             x16, sp,  #4096
+        ldr             xzr, [x16] // read from the intermediate address before sp moves below it
+        sub             sp,  x16, #((\space) - 4096)
+.else
+        sub             sp,  sp,  #(\space)
+.endif
+#else
+.if (\space) >= 4096
+        sub             sp,  sp,  #(\space)/4096*4096 // whole multiples of 4096 first
+.endif
+.if ((\space) % 4096) != 0
+        sub             sp,  sp,  #(\space)%4096 // then the remainder below 4096
+.endif
+#endif
+.endm
+
+function inv_txfm_add_dct_dct_64x64_16bpc_neon, export=1
+        idct_dc         64,  64,  2
+        // Horizontal pass in 64x4 strips into a stack buffer at x5, then 64-point vertical pass and add in 8-column slices.
+        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
+
+        sub_sp          64*32*2+64*4*4
+        add             x5,  sp, #64*4*4 // x5 = intermediate buffer; the 64*4*4 bytes below it at sp are scratch for the helpers
+
+        movrel          x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*64*2) // x6 = position of strip i in the intermediate buffer
+.if \i > 0
+        mov             w8,  #(32 - \i) // count used by the zero-fill loop if the branch below is taken
+        cmp             w3,  w12 // NOTE(review): w3 is presumably the eob argument - confirm
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        mov             x12, #-2 // shift
+        bl              inv_txfm_dct_clear_4s_x64_neon
+        add             x6,  x5,  #(\i*64*2) // recomputed: the call above sets x6 to sp
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            w12, [x13], #2 // threshold for the next strip; loaded late because x12 held the shift
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2 // each iteration writes 4*64 = 256 zero bytes at x6
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x7,  x5,  #(\i*2) // x7 = intermediate buffer + i*2 bytes
+        mov             x8,  #64*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2) // x6 = x0 + i*2 bytes
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #64*32*2 // x5 + 64*32*2 equals the stack pointer on entry
+        br              x15 // return through the address saved on entry
+endfunc
+
+function inv_txfm_add_dct_dct_64x32_16bpc_neon, export=1
+        idct_dc         64,  32,  1
+        // Horizontal pass in 64x4 strips (scaled variant) into a stack buffer at x5, then 32-point vertical add in 8-column slices.
+        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
+
+        sub_sp          64*32*2+64*4*4
+        add             x5,  sp, #64*4*4 // x5 = intermediate buffer; the 64*4*4 bytes below it at sp are scratch for the helpers
+
+        movrel          x13, eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*64*2) // x6 = position of strip i in the intermediate buffer
+.if \i > 0
+        mov             w8,  #(32 - \i) // count used by the zero-fill loop if the branch below is taken
+        cmp             w3,  w12 // NOTE(review): w3 is presumably the eob argument - confirm
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        mov             x12, #-1 // shift
+        bl              inv_txfm_dct_clear_scale_4s_x64_neon
+        add             x6,  x5,  #(\i*64*2) // recomputed: the call above sets x6 to sp
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 28
+        ldrh            w12, [x13], #2 // threshold for the next strip; loaded late because x12 held the shift
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2 // each iteration writes 4*64 = 256 zero bytes at x6
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x6,  x0,  #(\i*2) // x6 = x0 + i*2 bytes
+        add             x7,  x5,  #(\i*2) // x7 = intermediate buffer + i*2 bytes
+        mov             x8,  #64*2
+        bl              inv_txfm_add_vert_dct_8x32_neon
+.endr
+
+        add             sp,  x5,  #64*32*2 // x5 + 64*32*2 equals the stack pointer on entry
+        br              x15 // return through the address saved on entry
+endfunc
+
+function inv_txfm_add_dct_dct_32x64_16bpc_neon, export=1
+        idct_dc         32,  64,  1
+        // Horizontal pass in 32x4 strips into a stack buffer at x5, then 64-point vertical pass and add in 8-column slices.
+        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
+
+        sub_sp          32*32*2+64*8*2
+        add             x5,  sp, #64*8*2 // x5 = intermediate buffer; the 64*8*2 bytes below it at sp are read by the vertical helper
+
+        movrel          x13, eob_32x32
+        ldrh            w12, [x13], #2 // first 16-bit threshold from eob_32x32
+
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*32*2) // x6 = position of strip i in the intermediate buffer
+.if \i > 0
+        mov             w8,  #(32 - \i) // count used by the zero-fill loop if the branch below is taken
+        cmp             w3,  w12 // NOTE(review): w3 is presumably the eob argument - confirm
+        b.lt            1f
+        ldrh            w12, [x13], #2 // threshold for the next strip
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        bl              inv_txfm_horz_scale_dct_32x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4 // each iteration writes 4*64 = 256 zero bytes at x6
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8, 16, 24
+        add             x7,  x5,  #(\i*2) // x7 = intermediate buffer + i*2 bytes
+        mov             x8,  #32*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2) // x6 = x0 + i*2 bytes
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #32*32*2 // x5 + 32*32*2 equals the stack pointer on entry
+        br              x15 // return through the address saved on entry
+endfunc
+
+function inv_txfm_add_dct_dct_64x16_16bpc_neon, export=1
+        idct_dc         64,  16,  2
+        // Horizontal pass in 64x4 strips into a stack buffer at x4, then 16-point vertical add in 8-column slices.
+        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
+
+        sub_sp          64*16*2+64*4*4
+        add             x4,  sp, #64*4*4 // x4 = intermediate buffer; x5 is needed for the function pointer loaded at 3:
+
+        movrel          x13, eob_16x32
+
+.irp i, 0, 4, 8, 12
+        add             x6,  x4,  #(\i*64*2) // x6 = position of strip i in the intermediate buffer
+.if \i > 0
+        mov             w8,  #(16 - \i) // count used by the zero-fill loop if the branch below is taken
+        cmp             w3,  w12 // NOTE(review): w3 is presumably the eob argument - confirm
+        b.lt            1f
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #16*4
+        mov             x12, #-2 // shift
+        bl              inv_txfm_dct_clear_4s_x64_neon
+        add             x6,  x4,  #(\i*64*2) // recomputed: the call above sets x6 to sp
+        bl              inv_txfm_horz_dct_64x4_neon
+.if \i < 12
+        ldrh            w12, [x13], #2 // threshold for the next strip; loaded late because x12 held the shift
+.endif
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #2 // each iteration writes 4*64 = 256 zero bytes at x6
+.rept 4
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+        movrel          x5,  X(inv_dct_8h_x16_neon) // presumably read by inv_txfm_add_vert_8x16_neon - TODO confirm
+.irp i, 0, 8, 16, 24, 32, 40, 48, 56
+        add             x6,  x0,  #(\i*2) // x6 = x0 + i*2 bytes
+        add             x7,  x4,  #(\i*2) // x7 = intermediate buffer + i*2 bytes
+        mov             x8,  #64*2
+        bl              inv_txfm_add_vert_8x16_neon
+.endr
+
+        add             sp,  x4,  #64*16*2 // x4 + 64*16*2 equals the stack pointer on entry
+        br              x15 // return through the address saved on entry
+endfunc
+
+function inv_txfm_add_dct_dct_16x64_16bpc_neon, export=1
+        idct_dc         16,  64,  2
+        // Horizontal pass in 16x4 strips into a stack buffer at x5, then 64-point vertical pass and add in 8-column slices.
+        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
+
+        sub_sp          16*32*2+64*8*2
+        add             x5,  sp, #64*8*2 // x5 = intermediate buffer; the 64*8*2 bytes below it at sp are read by the vertical helper
+
+        movrel          x13, eob_16x32
+        ldrh            w12, [x13], #2 // first 16-bit threshold from eob_16x32
+
+        adr             x4,  inv_dct_4s_x16_neon // presumably read by inv_txfm_horz_16x4_neon - TODO confirm
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             x6,  x5,  #(\i*16*2) // x6 = position of strip i in the intermediate buffer
+.if \i > 0
+        mov             w8,  #(32 - \i) // count used by the zero-fill loop if the branch below is taken
+        cmp             w3,  w12 // NOTE(review): w3 is presumably the eob argument - confirm
+        b.lt            1f
+        ldrh            w12, [x13], #2 // threshold for the next strip
+.endif
+        add             x7,  x2,  #(\i*4)
+        mov             x8,  #32*4
+        bl              inv_txfm_horz_16x4_neon
+.endr
+        b               3f
+
+1:
+        movi            v4.8h,  #0
+        movi            v5.8h,  #0
+        movi            v6.8h,  #0
+        movi            v7.8h,  #0
+2:
+        subs            w8,  w8,  #4 // each iteration writes 2*64 = 128 zero bytes at x6
+.rept 2
+        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], #64
+.endr
+        b.gt            2b
+
+3:
+.irp i, 0, 8
+        add             x7,  x5,  #(\i*2) // x7 = intermediate buffer + i*2 bytes
+        mov             x8,  #16*2
+        bl              X(inv_txfm_dct_8h_x64_neon)
+        add             x6,  x0,  #(\i*2) // x6 = x0 + i*2 bytes
+        bl              inv_txfm_add_vert_dct_8x64_neon
+.endr
+
+        add             sp,  x5,  #16*32*2 // x5 + 16*32*2 equals the stack pointer on entry
+        br              x15 // return through the address saved on entry
+endfunc
diff --git a/ffmpeg/JNI/dav1d/src/arm/64/mc.S b/ffmpeg/JNI/dav1d/src/arm/64/mc.S
index 92aa8aa81..f6970de3c 100644
--- a/ffmpeg/JNI/dav1d/src/arm/64/mc.S
+++ b/ffmpeg/JNI/dav1d/src/arm/64/mc.S
@@ -3089,3 +3089,161 @@ endfunc
 
 warp  , 11
 warp t, 7
+
+// Fill the bw x bh block at dst from ref, replicating edge pixels where the block lies outside the iw x ih area.
+// void dav1d_emu_edge_8bpc_neon(const intptr_t bw, const intptr_t bh,
+//         const intptr_t iw, const intptr_t ih,
+//         const intptr_t x, const intptr_t y,
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_8bpc_neon, export=1
+        ldp             x8,  x9,  [sp]         // x8 = ref, x9 = ref_stride (9th and 10th arguments, read from the stack)
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             x12, x3,  #1           // ih - 1
+        cmp             x5,  x3
+        sub             x13, x2,  #1           // iw - 1
+        csel            x12, x12, x5,  ge      // min(y, ih - 1)
+        cmp             x4,  x2
+        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+        csel            x13, x13, x4,  ge      // min(x, iw - 1)
+        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
+        add             x8,  x8,  x13          // ref += iclip()
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext = iclip(-y, 0, bh - 1)
+        add             x10, x5,  x1           // y + bh
+        neg             x5,  x5                // -y
+        sub             x10, x10, x3           // y + bh - ih
+        sub             x12, x1,  #1           // bh - 1
+        cmp             x10, x1
+        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
+        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
+        cmp             x5,  x1
+        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext = iclip(-x, 0, bw - 1)
+        add             x11, x4,  x0           // x + bw
+        neg             x4,  x4                // -x
+        sub             x11, x11, x2           // x + bw - iw
+        sub             x13, x0,  #1           // bw - 1
+        cmp             x11, x0
+        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
+        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
+        cmp             x4,  x0
+        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             x1,  x1,  x5           // bh - top_ext
+        madd            x6,  x5,  x7,  x6      // dst += top_ext * dst_stride
+        sub             x2,  x0,  x4           // bw - left_ext
+        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
+        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
+
+        mov             x14, x6                // backup of dst
+
+.macro v_loop need_left, need_right
+0:                                             // one pass per center row; x1 = rows left, x6 = dst row, x8 = ref row
+.if \need_left
+        ld1r            {v0.16b}, [x8]         // replicate in[0] across the vector
+        mov             x12, x6                // out = dst
+        mov             x3,  x4                // x3 = left_ext
+1:
+        subs            x3,  x3,  #16          // 16 pixels per store; overshoot is rewritten by the center copy below
+        st1             {v0.16b}, [x12], #16
+        b.gt            1b
+.endif
+        mov             x13, x8                // in = ref
+        add             x12, x6,  x4           // out = dst + left_ext
+        mov             x3,  x2                // x3 = center_w
+1:
+        ld1             {v0.16b, v1.16b}, [x13], #32
+        subs            x3,  x3,  #32          // 32 pixels per iteration, so the copy is rounded up to a multiple of 32
+        st1             {v0.16b, v1.16b}, [x12], #32
+        b.gt            1b
+.if \need_right
+        add             x3,  x8,  x2           // in + center_w
+        sub             x3,  x3,  #1           // in + center_w - 1
+        add             x12, x6,  x4           // dst + left_ext
+        ld1r            {v0.16b}, [x3]         // replicate in[center_w - 1] across the vector
+        add             x12, x12, x2           // out = dst + left_ext + center_w
+        mov             x3,  x11               // x3 = right_ext
+1:
+        subs            x3,  x3,  #16          // 16 pixels per store
+        st1             {v0.16b}, [x12], #16
+        b.gt            1b
+.endif
+
+        subs            x1,  x1,  #1           // center_h--
+        add             x6,  x6,  x7           // dst += dst_stride
+        add             x8,  x8,  x9           // ref += ref_stride
+        b.gt            0b
+.endm
+
+        cbz             x4,  2f                // left_ext == 0?
+        // need_left
+        cbz             x11, 3f                // right_ext == 0?
+        // need_left + need_right
+        v_loop          1,   1
+        b               5f
+
+2:
+        // !need_left
+        cbz             x11, 4f                // right_ext == 0?
+        // !need_left + need_right
+        v_loop          0,   1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1,   0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0,   0
+
+5:
+
+        cbz             x10, 3f                // nothing to do when bottom_ext == 0
+        // need_bottom
+        sub             x8,  x6,  x7           // ref = dst - stride (the last row written by v_loop)
+        mov             x4,  x0                // x4 = bw, used as column counter (left_ext is no longer needed)
+1:
+        ld1             {v0.16b, v1.16b}, [x8], #32
+        mov             x3,  x10               // x3 = bottom_ext, row counter
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.16b, v1.16b}, [x6], x7
+        b.gt            2b
+        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
+        subs            x4,  x4,  #32          // bw -= 32
+        add             x6,  x6,  #32          // dst += 32
+        b.gt            1b
+
+3:
+        cbz             x5,  3f                // nothing to do when top_ext == 0
+        // need_top
+        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
+1:
+        ld1             {v0.16b, v1.16b}, [x14], #32 // 32 pixels of the first center row (x14 = saved dst)
+        mov             x3,  x5                // x3 = top_ext, row counter
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.16b, v1.16b}, [x6], x7
+        b.gt            2b
+        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
+        subs            x0,  x0,  #32          // bw -= 32
+        add             x6,  x6,  #32          // dst += 32
+        b.gt            1b
+
+3:
+        ret
+endfunc
diff --git a/ffmpeg/JNI/dav1d/src/arm/64/mc16.S b/ffmpeg/JNI/dav1d/src/arm/64/mc16.S
index 5fbc3989c..7ac186302 100644
--- a/ffmpeg/JNI/dav1d/src/arm/64/mc16.S
+++ b/ffmpeg/JNI/dav1d/src/arm/64/mc16.S
@@ -3407,3 +3407,163 @@ endfunc
 
 warp
 warp t
+
+// Fill the bw x bh block at dst from ref, replicating edge pixels where the block lies outside the iw x ih area.
+// void dav1d_emu_edge_16bpc_neon(const intptr_t bw, const intptr_t bh,
+//         const intptr_t iw, const intptr_t ih,
+//         const intptr_t x, const intptr_t y,
+//         pixel *dst, const ptrdiff_t dst_stride,
+//         const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_16bpc_neon, export=1
+        ldp             x8,  x9,  [sp]         // x8 = ref, x9 = ref_stride (9th and 10th arguments, read from the stack)
+
+        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+        // ref += iclip(x, 0, iw - 1)
+        sub             x12, x3,  #1           // ih - 1
+        cmp             x5,  x3
+        sub             x13, x2,  #1           // iw - 1
+        csel            x12, x12, x5,  ge      // min(y, ih - 1)
+        cmp             x4,  x2
+        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+        csel            x13, x13, x4,  ge      // min(x, iw - 1)
+        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+        madd            x8,  x12, x9,  x8      // ref += iclip() * stride (stride is applied as a byte offset)
+        add             x8,  x8,  x13, lsl #1  // ref += iclip() (2 bytes per pixel)
+
+        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+        // top_ext = iclip(-y, 0, bh - 1)
+        add             x10, x5,  x1           // y + bh
+        neg             x5,  x5                // -y
+        sub             x10, x10, x3           // y + bh - ih
+        sub             x12, x1,  #1           // bh - 1
+        cmp             x10, x1
+        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
+        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
+        cmp             x5,  x1
+        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
+
+        // right_ext = iclip(x + bw - iw, 0, bw - 1)
+        // left_ext = iclip(-x, 0, bw - 1)
+        add             x11, x4,  x0           // x + bw
+        neg             x4,  x4                // -x
+        sub             x11, x11, x2           // x + bw - iw
+        sub             x13, x0,  #1           // bw - 1
+        cmp             x11, x0
+        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
+        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
+        cmp             x4,  x0
+        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
+
+        // center_h = bh - top_ext - bottom_ext
+        // dst += top_ext * PXSTRIDE(dst_stride)
+        // center_w = bw - left_ext - right_ext
+        sub             x1,  x1,  x5           // bh - top_ext
+        madd            x6,  x5,  x7,  x6      // dst += top_ext * dst_stride
+        sub             x2,  x0,  x4           // bw - left_ext
+        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
+        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
+
+        mov             x14, x6                // backup of dst
+
+.macro v_loop need_left, need_right
+0:                                             // one pass per center row; x1 = rows left, x6 = dst row, x8 = ref row
+.if \need_left
+        ld1r            {v0.8h}, [x8]          // replicate in[0] across the vector
+        mov             x12, x6                // out = dst
+        mov             x3,  x4                // x3 = left_ext
+        mov             v1.16b,  v0.16b        // second copy so one st1 writes 16 pixels
+1:
+        subs            x3,  x3,  #16          // 16 pixels (32 bytes) per store; overshoot is rewritten by the center copy
+        st1             {v0.8h, v1.8h}, [x12], #32
+        b.gt            1b
+.endif
+        mov             x13, x8                // in = ref
+        add             x12, x6,  x4, lsl #1   // out = dst + left_ext
+        mov             x3,  x2                // x3 = center_w
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
+        subs            x3,  x3,  #32          // 32 pixels (64 bytes) per iteration, so the copy is rounded up to 32 pixels
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
+        b.gt            1b
+.if \need_right
+        add             x3,  x8,  x2, lsl #1   // in + center_w
+        sub             x3,  x3,  #2           // in + center_w - 1
+        add             x12, x6,  x4, lsl #1   // dst + left_ext
+        ld1r            {v0.8h}, [x3]          // replicate in[center_w - 1] across the vector
+        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
+        mov             x3,  x11               // x3 = right_ext
+        mov             v1.16b,  v0.16b        // second copy so one st1 writes 16 pixels
+1:
+        subs            x3,  x3,  #16          // 16 pixels (32 bytes) per store
+        st1             {v0.8h, v1.8h}, [x12], #32
+        b.gt            1b
+.endif
+
+        subs            x1,  x1,  #1           // center_h--
+        add             x6,  x6,  x7           // dst += dst_stride
+        add             x8,  x8,  x9           // ref += ref_stride
+        b.gt            0b
+.endm
+
+        cbz             x4,  2f                // left_ext == 0?
+        // need_left
+        cbz             x11, 3f                // right_ext == 0?
+        // need_left + need_right
+        v_loop          1,   1
+        b               5f
+
+2:
+        // !need_left
+        cbz             x11, 4f                // right_ext == 0?
+        // !need_left + need_right
+        v_loop          0,   1
+        b               5f
+
+3:
+        // need_left + !need_right
+        v_loop          1,   0
+        b               5f
+
+4:
+        // !need_left + !need_right
+        v_loop          0,   0
+
+5:
+
+        cbz             x10, 3f                // nothing to do when bottom_ext == 0
+        // need_bottom
+        sub             x8,  x6,  x7           // ref = dst - stride (the last row written by v_loop)
+        mov             x4,  x0                // x4 = bw, used as column counter (left_ext is no longer needed)
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
+        mov             x3,  x10               // x3 = bottom_ext, row counter
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+        b.gt            2b
+        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
+        subs            x4,  x4,  #32          // bw -= 32
+        add             x6,  x6,  #64          // dst += 32 pixels (64 bytes)
+        b.gt            1b
+
+3:
+        cbz             x5,  3f                // nothing to do when top_ext == 0
+        // need_top
+        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
+1:
+        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64 // 32 pixels of the first center row (x14 = saved dst)
+        mov             x3,  x5                // x3 = top_ext, row counter
+2:
+        subs            x3,  x3,  #1
+        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+        b.gt            2b
+        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
+        subs            x0,  x0,  #32          // bw -= 32
+        add             x6,  x6,  #64          // dst += 32 pixels (64 bytes)
+        b.gt            1b
+
+3:
+        ret
+endfunc
diff --git a/ffmpeg/JNI/dav1d/src/arm/64/msac.S b/ffmpeg/JNI/dav1d/src/arm/64/msac.S
index 31cc46f89..3a6cf900a 100644
--- a/ffmpeg/JNI/dav1d/src/arm/64/msac.S
+++ b/ffmpeg/JNI/dav1d/src/arm/64/msac.S
@@ -118,9 +118,9 @@ endconst
 .endm
 
 .macro str_n            idx0, idx1, dstreg, dstoff, n
-        str             q\idx0,  [\dstreg, \dstoff]
+        str             \idx0,  [\dstreg, \dstoff]
 .if \n == 16
-        str             q\idx1,  [\dstreg, \dstoff + 16]
+        str             \idx1,  [\dstreg, \dstoff + 16]
 .endif
 .endm
 
@@ -150,7 +150,7 @@ function msac_decode_symbol_adapt4_neon, export=1
 
         ld1r            {v6.8h},  [x8]                            // dif >> (EC_WIN_SIZE - 16)
         movrel          x8,  bits
-        str_n           4,   5,  sp, #16, \n                      // store v values to allow indexed access
+        str_n           q4,  q5,  sp, #16, \n                     // store v values to allow indexed access
 
         ld1_n           v16, v17, x8,  .8h, \n
 
@@ -185,7 +185,7 @@ function msac_decode_symbol_adapt4_neon, export=1
         sbc             w4,  w4,  w14                             // -((count >> 4) + (n_symbols > 2) + 4)
 .endif
         sub_n           v4,  v5,  v4,  v5,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
-        dup             v6.8h,    w4                              // -rate
+        dup             v6\sz,    w4                              // -rate
 
         sub             w3,  w3,  w3, lsr #5                      // count - (count == 32)
         sub_n           v0,  v1,  v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
@@ -216,7 +216,7 @@ L(renorm2):
         lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
         str             w4,  [x0, #RNG]
         mvn             x7,  x7                // ~dif
-        b.ge            9f
+        b.hs            9f
 
         // refill
         ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
@@ -274,6 +274,128 @@ function msac_decode_symbol_adapt16_neon, export=1
         b               L(renorm)
 endfunc
 
+function msac_decode_hi_tok_neon, export=1
+        ld1             {v0.4h},  [x1]            // cdf (x0 = MsacContext *s, x1 = uint16_t *cdf; result is returned in w0)
+        add             x16, x0,  #RNG            // address of the rng field
+        movi            v31.4h, #0x7f, lsl #8     // 0x7f00
+        movrel          x17, coeffs, 30-2*3
+        mvni            v30.4h, #0x3f             // 0xffc0
+        ldrh            w9,  [x1, #6]             // count = cdf[n_symbols]
+        ld1r            {v3.4h},  [x16]           // rng
+        movrel          x16, bits
+        ld1             {v29.4h}, [x17]           // EC_MIN_PROB * (n_symbols - ret)
+        add             x17, x0,  #DIF + 6        // address of the top halfword of the 64-bit dif (little-endian layout)
+        ld1             {v16.8h}, [x16]           // per-lane mask bits from the bits table
+        mov             w13, #-24                 // biased accumulator; each pass adds 2*ret, carry at 9: ends the loop
+        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
+        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
+        ld1r            {v1.8h},  [x17]           // dif >> (EC_WIN_SIZE - 16)
+        sub             sp,  sp,  #48             // scratch so u/v can be read back by index ([sp, #14] and [sp, #16])
+        ldr             w6,  [x0, #CNT]           // cnt
+        ldr             x7,  [x0, #DIF]           // dif
+1:                                                // decode one symbol per pass
+        and             v7.8b,   v3.8b,   v31.8b  // rng & 0x7f00
+        sqdmulh         v6.4h,   v17.4h,  v7.4h   // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
+        add             v4.4h,   v17.4h,  v29.4h  // v = cdf + EC_MIN_PROB * (n_symbols - ret)
+        add             v4.4h,   v6.4h,   v4.4h   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
+        str             h3,  [sp, #14]            // store original u = s->rng
+        cmhs            v2.8h,   v1.8h,   v4.8h   // c >= v
+        str             q4,  [sp, #16]            // store v values to allow indexed access
+        and             v6.16b,  v2.16b,  v16.16b // One bit per halfword set in the mask
+        addv            h6,  v6.8h                // Aggregate mask bits
+        umov            w3,  v6.h[0]
+        add             w13, w13, #5              // +5 here and (2*ret - 5) at 9: add up to 2*ret per pass
+        rbit            w3,  w3                   // reverse so that clz locates the lowest set mask bit
+        add             x8,  sp,  #16             // x8 = address of the stored v values
+        clz             w15, w3                   // ret
+
+        cbz             w10, 2f                   // skip the cdf adaptation when allow_update_cdf is 0
+        // update_cdf
+        movi            v5.8b, #0xff
+        mov             w4,  #-5
+        urhadd          v4.4h,   v5.4h,   v2.4h   // i >= val ? -1 : 32768
+        sub             w4,  w4,  w9, lsr #4      // -((count >> 4) + 5)
+        sub             v4.4h,   v4.4h,   v0.4h   // (32768 - cdf[i]) or (-1 - cdf[i])
+        dup             v6.4h,    w4              // -rate
+
+        sub             w9,  w9,  w9, lsr #5      // count - (count == 32)
+        sub             v0.4h,   v0.4h,   v2.4h   // cdf + (i >= val ? 1 : 0)
+        sshl            v4.4h,   v4.4h,   v6.4h   // ({32768,-1} - cdf[i]) >> rate
+        add             w9,  w9,  #1              // count + (count < 32)
+        add             v0.4h,   v0.4h,   v4.4h   // cdf + (32768 - cdf[i]) >> rate
+        st1             {v0.4h},  [x1]
+        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
+        strh            w9,  [x1, #6]
+
+2:
+        add             x8,  x8,  w15, uxtw #1 // address of v[ret]
+        ldrh            w3,  [x8]              // v
+        ldurh           w4,  [x8, #-2]         // u
+        sub             w4,  w4,  w3           // rng = u - v
+        clz             w5,  w4                // clz(rng)
+        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
+        mvn             x7,  x7                // ~dif
+        add             x7,  x7,  x3, lsl #48  // ~dif + (v << 48)
+        lsl             w4,  w4,  w5           // rng << d
+        subs            w6,  w6,  w5           // cnt -= d
+        lsl             x7,  x7,  x5           // (~dif + (v << 48)) << d
+        str             w4,  [x0, #RNG]
+        dup             v3.4h,   w4            // rng for the next pass
+        mvn             x7,  x7                // ~dif
+        b.hs            9f                     // no borrow from the subs above: no refill needed
+
+        // refill
+        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
+        add             x5,  x3,  #8
+        cmp             x5,  x4
+        b.gt            2f
+
+        ldr             x3,  [x3]              // next_bits
+        add             w8,  w6,  #23          // shift_bits = cnt + 23
+        add             w6,  w6,  #16          // cnt += 16
+        rev             x3,  x3                // next_bits = bswap(next_bits)
+        sub             x5,  x5,  x8, lsr #3   // buf_pos -= shift_bits >> 3
+        and             w8,  w8,  #24          // shift_bits &= 24
+        lsr             x3,  x3,  x8           // next_bits >>= shift_bits
+        sub             w8,  w8,  w6           // shift_bits -= 16 + cnt
+        str             x5,  [x0, #BUF_POS]
+        lsl             x3,  x3,  x8           // next_bits <<= shift_bits
+        mov             w4,  #48
+        sub             w6,  w4,  w8           // cnt = cnt + 64 - shift_bits
+        eor             x7,  x7,  x3           // dif ^= next_bits
+        b               9f
+
+2:      // refill_eob
+        mov             w14, #40
+        sub             w5,  w14, w6           // c = 40 - cnt
+3:
+        cmp             x3,  x4
+        b.ge            4f
+        ldrb            w8,  [x3], #1
+        lsl             x8,  x8,  x5
+        eor             x7,  x7,  x8
+        subs            w5,  w5,  #8
+        b.ge            3b
+
+4:      // refill_eob_end
+        str             x3,  [x0, #BUF_POS]
+        sub             w6,  w14, w5           // cnt = 40 - c
+
+9:
+        lsl             w15, w15, #1           // 2 * ret
+        sub             w15, w15, #5           // 2 * ret - 5
+        lsr             x12, x7,  #48          // top 16 bits of the updated dif
+        adds            w13, w13, w15          // carry = tok_br < 3 || tok == 15
+        dup             v1.8h,   w12           // c for the next pass
+        b.cc            1b                     // loop if !carry
+        add             w13, w13, #30          // w13 + 30 = 2 * return value
+        str             w6,  [x0, #CNT]
+        add             sp,  sp,  #48          // release the scratch reserved at entry
+        str             x7,  [x0, #DIF]
+        lsr             w0,  w13, #1           // return value
+        ret
+endfunc
+
 function msac_decode_bool_equi_neon, export=1
         ldp             w5,  w6,  [x0, #RNG]   // + CNT
         sub             sp,  sp,  #48
diff --git a/ffmpeg/JNI/dav1d/src/arm/64/util.S b/ffmpeg/JNI/dav1d/src/arm/64/util.S
index 3332c8522..fc0e0d04f 100644
--- a/ffmpeg/JNI/dav1d/src/arm/64/util.S
+++ b/ffmpeg/JNI/dav1d/src/arm/64/util.S
@@ -170,6 +170,18 @@
         trn2            \r3\().2s,  \t5\().2s,  \t7\().2s
 .endm
 
+.macro  transpose_4x4s  r0, r1, r2, r3, t4, t5, t6, t7
+        trn1            \t4\().4s,  \r0\().4s,  \r1\().4s  // 4x4 transpose of 32-bit lanes in r0-r3; t4-t7 are overwritten as scratch
+        trn2            \t5\().4s,  \r0\().4s,  \r1\().4s  // pass 1: interleave the 32-bit lanes of each row pair
+        trn1            \t6\().4s,  \r2\().4s,  \r3\().4s
+        trn2            \t7\().4s,  \r2\().4s,  \r3\().4s
+
+        trn1            \r0\().2d,  \t4\().2d,  \t6\().2d  // pass 2: combine 64-bit halves; r0 = column 0 of the input
+        trn2            \r2\().2d,  \t4\().2d,  \t6\().2d  // r2 = column 2
+        trn1            \r1\().2d,  \t5\().2d,  \t7\().2d  // r1 = column 1
+        trn2            \r3\().2d,  \t5\().2d,  \t7\().2d  // r3 = column 3
+.endm
+
 .macro  transpose_4x8h  r0, r1, r2, r3, t4, t5, t6, t7
         trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
         trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
diff --git a/ffmpeg/JNI/dav1d/src/arm/asm.S b/ffmpeg/JNI/dav1d/src/arm/asm.S
index 6b1d46fcd..1cd0955d4 100644
--- a/ffmpeg/JNI/dav1d/src/arm/asm.S
+++ b/ffmpeg/JNI/dav1d/src/arm/asm.S
@@ -93,6 +93,7 @@
         .global EXTERN\name
 #ifdef __ELF__
         .type   EXTERN\name, %function
+        .hidden EXTERN\name
 #endif
 #if HAVE_AS_FUNC
         .func   EXTERN\name
@@ -109,7 +110,7 @@ EXTERN\name:
 \name:
 .endm
 
-.macro  const   name, align=2
+.macro  const   name, export=0, align=2
     .macro endconst
 #ifdef __ELF__
         .size   \name, . - \name
@@ -124,6 +125,13 @@ EXTERN\name:
         .const_data
 #endif
         .align          \align
+    .if \export
+        .global EXTERN\name
+#ifdef __ELF__
+        .hidden EXTERN\name
+#endif
+EXTERN\name:
+    .endif
 \name:
 .endm
 
@@ -135,4 +143,9 @@ EXTERN\name:
 
 #define X(x) CONCAT(EXTERN, x)
 
+#if ARCH_AARCH64 /* Poison x18/w18: any use expands to an unknown name and fails to assemble. */
+#define x18 do_not_use_x18 /* presumably because x18 is the platform-reserved register in the AArch64 ABI - confirm */
+#define w18 do_not_use_w18
+#endif
+
 #endif /* DAV1D_SRC_ARM_ASM_S */
diff --git a/ffmpeg/JNI/dav1d/src/arm/ipred_init_tmpl.c b/ffmpeg/JNI/dav1d/src/arm/ipred_init_tmpl.c
index 5b3eb07b4..e42ceaf1f 100644
--- a/ffmpeg/JNI/dav1d/src/arm/ipred_init_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/arm/ipred_init_tmpl.c
@@ -27,56 +27,56 @@
 #include "src/cpu.h"
 #include "src/ipred.h"
 
-decl_angular_ipred_fn(dav1d_ipred_dc_neon);
-decl_angular_ipred_fn(dav1d_ipred_dc_128_neon);
-decl_angular_ipred_fn(dav1d_ipred_dc_top_neon);
-decl_angular_ipred_fn(dav1d_ipred_dc_left_neon);
-decl_angular_ipred_fn(dav1d_ipred_h_neon);
-decl_angular_ipred_fn(dav1d_ipred_v_neon);
-decl_angular_ipred_fn(dav1d_ipred_paeth_neon);
-decl_angular_ipred_fn(dav1d_ipred_smooth_neon);
-decl_angular_ipred_fn(dav1d_ipred_smooth_v_neon);
-decl_angular_ipred_fn(dav1d_ipred_smooth_h_neon);
-decl_angular_ipred_fn(dav1d_ipred_filter_neon);
+decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon));
+decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon));
 
-decl_cfl_pred_fn(dav1d_ipred_cfl_neon);
-decl_cfl_pred_fn(dav1d_ipred_cfl_128_neon);
-decl_cfl_pred_fn(dav1d_ipred_cfl_top_neon);
-decl_cfl_pred_fn(dav1d_ipred_cfl_left_neon);
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon));
+decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon));
 
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_neon);
-decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_neon);
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon));
+decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon));
 
-decl_pal_pred_fn(dav1d_pal_pred_neon);
+decl_pal_pred_fn(BF(dav1d_pal_pred, neon));
 
 COLD void bitfn(dav1d_intra_pred_dsp_init_arm)(Dav1dIntraPredDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8
-    c->intra_pred[DC_PRED]       = dav1d_ipred_dc_neon;
-    c->intra_pred[DC_128_PRED]   = dav1d_ipred_dc_128_neon;
-    c->intra_pred[TOP_DC_PRED]   = dav1d_ipred_dc_top_neon;
-    c->intra_pred[LEFT_DC_PRED]  = dav1d_ipred_dc_left_neon;
-    c->intra_pred[HOR_PRED]      = dav1d_ipred_h_neon;
-    c->intra_pred[VERT_PRED]     = dav1d_ipred_v_neon;
+#if BITDEPTH == 8 || ARCH_AARCH64 /* 8 bpc on any ARM target; other bit depths only on AArch64 */
+    c->intra_pred[DC_PRED]       = BF(dav1d_ipred_dc, neon);
+    c->intra_pred[DC_128_PRED]   = BF(dav1d_ipred_dc_128, neon);
+    c->intra_pred[TOP_DC_PRED]   = BF(dav1d_ipred_dc_top, neon);
+    c->intra_pred[LEFT_DC_PRED]  = BF(dav1d_ipred_dc_left, neon);
+    c->intra_pred[HOR_PRED]      = BF(dav1d_ipred_h, neon);
+    c->intra_pred[VERT_PRED]     = BF(dav1d_ipred_v, neon);
 #if ARCH_AARCH64
-    c->intra_pred[PAETH_PRED]    = dav1d_ipred_paeth_neon;
-    c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_neon;
-    c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_neon;
-    c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_neon;
-    c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_neon;
+    c->intra_pred[PAETH_PRED]    = BF(dav1d_ipred_paeth, neon); /* from here on: AArch64-only entries */
+    c->intra_pred[SMOOTH_PRED]   = BF(dav1d_ipred_smooth, neon);
+    c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon);
+    c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon);
+    c->intra_pred[FILTER_PRED]   = BF(dav1d_ipred_filter, neon);
 
-    c->cfl_pred[DC_PRED]         = dav1d_ipred_cfl_neon;
-    c->cfl_pred[DC_128_PRED]     = dav1d_ipred_cfl_128_neon;
-    c->cfl_pred[TOP_DC_PRED]     = dav1d_ipred_cfl_top_neon;
-    c->cfl_pred[LEFT_DC_PRED]    = dav1d_ipred_cfl_left_neon;
+    c->cfl_pred[DC_PRED]         = BF(dav1d_ipred_cfl, neon);
+    c->cfl_pred[DC_128_PRED]     = BF(dav1d_ipred_cfl_128, neon);
+    c->cfl_pred[TOP_DC_PRED]     = BF(dav1d_ipred_cfl_top, neon);
+    c->cfl_pred[LEFT_DC_PRED]    = BF(dav1d_ipred_cfl_left, neon);
 
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_neon;
-    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_neon;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon); /* cfl_ac is indexed by layout - 1 */
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon);
 
-    c->pal_pred                  = dav1d_pal_pred_neon;
+    c->pal_pred                  = BF(dav1d_pal_pred, neon);
 #endif
 #endif
 }
diff --git a/ffmpeg/JNI/dav1d/src/arm/itx_init_tmpl.c b/ffmpeg/JNI/dav1d/src/arm/itx_init_tmpl.c
index f9c68e9eb..ad418f2db 100644
--- a/ffmpeg/JNI/dav1d/src/arm/itx_init_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/arm/itx_init_tmpl.c
@@ -29,32 +29,32 @@
 #include "src/itx.h"
 
 #define decl_itx2_fns(w, h, opt) \
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_identity_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
 
 #define decl_itx12_fns(w, h, opt) \
 decl_itx2_fns(w, h, opt); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_flipadst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_dct_identity_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_dct_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_flipadst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_dct_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_dct_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
 
 #define decl_itx16_fns(w, h, opt) \
 decl_itx12_fns(w, h, opt); \
-decl_itx_fn(dav1d_inv_txfm_add_adst_identity_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_flipadst_identity_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_adst_##w##x##h##_##opt); \
-decl_itx_fn(dav1d_inv_txfm_add_identity_flipadst_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
 
 #define decl_itx17_fns(w, h, opt) \
 decl_itx16_fns(w, h, opt); \
-decl_itx_fn(dav1d_inv_txfm_add_wht_wht_##w##x##h##_##opt)
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
 
 decl_itx17_fns( 4,  4, neon);
 decl_itx16_fns( 4,  8, neon);
@@ -71,16 +71,16 @@ decl_itx2_fns (32,  8, neon);
 decl_itx2_fns (32, 16, neon);
 decl_itx2_fns (32, 32, neon);
 
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_16x64_neon);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_32x64_neon);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x16_neon);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x32_neon);
-decl_itx_fn(dav1d_inv_txfm_add_dct_dct_64x64_neon);
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_16x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_32x64, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x16, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));
 
-COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c) {
+COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c, int bpc) {
 #define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
     c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
-        dav1d_inv_txfm_add_##type##_##w##x##h##_##ext
+        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
 
 #define assign_itx1_fn(pfx, w, h, ext) \
     assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
@@ -117,7 +117,9 @@ COLD void bitfn(dav1d_itx_dsp_init_arm)(Dav1dInvTxfmDSPContext *const c) {
 
     if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
 
-#if BITDEPTH == 8 && ARCH_AARCH64
+    if (bpc > 10) return;
+
+#if ARCH_AARCH64 || BITDEPTH == 8
     assign_itx17_fn( ,  4,  4, neon);
     assign_itx16_fn(R,  4,  8, neon);
     assign_itx16_fn(R,  4, 16, neon);
diff --git a/ffmpeg/JNI/dav1d/src/arm/mc_init_tmpl.c b/ffmpeg/JNI/dav1d/src/arm/mc_init_tmpl.c
index b17b78125..399ad41a4 100644
--- a/ffmpeg/JNI/dav1d/src/arm/mc_init_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/arm/mc_init_tmpl.c
@@ -66,6 +66,8 @@ decl_w_mask_fn(BF(dav1d_w_mask_420, neon));
 decl_warp8x8_fn(BF(dav1d_warp_affine_8x8, neon));
 decl_warp8x8t_fn(BF(dav1d_warp_affine_8x8t, neon));
 
+decl_emu_edge_fn(BF(dav1d_emu_edge, neon));
+
 void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
 #define init_mc_fn(type, name, suffix) \
     c->mc[type] = BF(dav1d_put_##name, suffix)
@@ -109,5 +111,6 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
     c->w_mask[2] = BF(dav1d_w_mask_420, neon);
     c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
     c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
+    c->emu_edge = BF(dav1d_emu_edge, neon);
 #endif
 }
diff --git a/ffmpeg/JNI/dav1d/src/arm/msac.h b/ffmpeg/JNI/dav1d/src/arm/msac.h
index a243a0629..9db0bf86a 100644
--- a/ffmpeg/JNI/dav1d/src/arm/msac.h
+++ b/ffmpeg/JNI/dav1d/src/arm/msac.h
@@ -34,14 +34,16 @@ unsigned dav1d_msac_decode_symbol_adapt8_neon(MsacContext *s, uint16_t *cdf,
                                               size_t n_symbols);
 unsigned dav1d_msac_decode_symbol_adapt16_neon(MsacContext *s, uint16_t *cdf,
                                                size_t n_symbols);
+unsigned dav1d_msac_decode_hi_tok_neon(MsacContext *s, uint16_t *cdf);
 unsigned dav1d_msac_decode_bool_adapt_neon(MsacContext *s, uint16_t *cdf);
 unsigned dav1d_msac_decode_bool_equi_neon(MsacContext *s);
 unsigned dav1d_msac_decode_bool_neon(MsacContext *s, unsigned f);
 
-#if ARCH_AARCH64
+#if ARCH_AARCH64 || defined(__ARM_NEON)
 #define dav1d_msac_decode_symbol_adapt4  dav1d_msac_decode_symbol_adapt4_neon
 #define dav1d_msac_decode_symbol_adapt8  dav1d_msac_decode_symbol_adapt8_neon
 #define dav1d_msac_decode_symbol_adapt16 dav1d_msac_decode_symbol_adapt16_neon
+#define dav1d_msac_decode_hi_tok         dav1d_msac_decode_hi_tok_neon
 #define dav1d_msac_decode_bool_adapt     dav1d_msac_decode_bool_adapt_neon
 #define dav1d_msac_decode_bool_equi      dav1d_msac_decode_bool_equi_neon
 #define dav1d_msac_decode_bool           dav1d_msac_decode_bool_neon
diff --git a/ffmpeg/JNI/dav1d/src/cdef_apply_tmpl.c b/ffmpeg/JNI/dav1d/src/cdef_apply_tmpl.c
index 8ab9738ec..c45c7109d 100644
--- a/ffmpeg/JNI/dav1d/src/cdef_apply_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/cdef_apply_tmpl.c
@@ -111,6 +111,9 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
     const int uv_idx = DAV1D_PIXEL_LAYOUT_I444 - layout;
     const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
     const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+    static const uint8_t uv_dirs[2][8] = { { 0, 1, 2, 3, 4, 5, 6, 7 },
+                                           { 7, 0, 2, 4, 5, 6, 6, 6 } };
+    const uint8_t *uv_dir = uv_dirs[layout == DAV1D_PIXEL_LAYOUT_I422];
 
     for (int bit = 0, by = by_start; by < by_end; by += 2, edges |= CDEF_HAVE_TOP) {
         const int tf = f->lf.top_pre_cdef_toggle;
@@ -199,8 +202,7 @@ void bytefn(dav1d_cdef_brow)(Dav1dFrameContext *const f,
                                     damping, edges HIGHBD_CALL_SUFFIX);
                 if (uv_lvl) {
                     assert(layout != DAV1D_PIXEL_LAYOUT_I400);
-                    const int uvdir = uv_pri_lvl ? layout == DAV1D_PIXEL_LAYOUT_I422 ?
-                        ((const uint8_t[]) { 7, 0, 2, 4, 5, 6, 6, 6 })[dir] : dir : 0;
+                    const int uvdir = uv_pri_lvl ? uv_dir[dir] : 0;
                     for (int pl = 1; pl <= 2; pl++) {
                         dsp->cdef.fb[uv_idx](bptrs[pl], f->cur.stride[1], lr_bak[bit][pl],
                                              &f->lf.cdef_line[tf][pl][bx * 4 >> ss_hor],
diff --git a/ffmpeg/JNI/dav1d/src/cpu.c b/ffmpeg/JNI/dav1d/src/cpu.c
index 35816822c..f8a909f28 100644
--- a/ffmpeg/JNI/dav1d/src/cpu.c
+++ b/ffmpeg/JNI/dav1d/src/cpu.c
@@ -31,7 +31,11 @@
 #include "src/cpu.h"
 
 static unsigned flags = 0;
-#if ARCH_X86
+
+#if __has_feature(memory_sanitizer)
+// memory sanitizer is inherently incompatible with asm
+static unsigned flags_mask = 0;
+#elif ARCH_X86
 /* Disable AVX-512 by default for the time being */
 static unsigned flags_mask = ~DAV1D_X86_CPU_FLAG_AVX512ICL;
 #else
diff --git a/ffmpeg/JNI/dav1d/src/decode.c b/ffmpeg/JNI/dav1d/src/decode.c
index 9fb157166..f6782153c 100644
--- a/ffmpeg/JNI/dav1d/src/decode.c
+++ b/ffmpeg/JNI/dav1d/src/decode.c
@@ -1998,7 +1998,6 @@ static int decode_b(Dav1dTileContext *const t,
     return 0;
 }
 
-#if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
 
 #include <sanitizer/msan_interface.h>
@@ -2051,7 +2050,6 @@ static int checked_decode_b(Dav1dTileContext *const t,
 #define decode_b checked_decode_b
 
-#endif /* defined(__has_feature) */
 #endif /* __has_feature(memory_sanitizer) */
 
 static int decode_sb(Dav1dTileContext *const t, const enum BlockLevel bl,
                      const EdgeNode *const node)
@@ -3304,7 +3302,7 @@ int dav1d_submit_frame(Dav1dContext *const c) {
 #define assign_bitdepth_case(bd) \
             dav1d_cdef_dsp_init_##bd##bpc(&dsp->cdef); \
             dav1d_intra_pred_dsp_init_##bd##bpc(&dsp->ipred); \
-            dav1d_itx_dsp_init_##bd##bpc(&dsp->itx); \
+            dav1d_itx_dsp_init_##bd##bpc(&dsp->itx, bpc); \
             dav1d_loop_filter_dsp_init_##bd##bpc(&dsp->lf); \
             dav1d_loop_restoration_dsp_init_##bd##bpc(&dsp->lr, bpc); \
             dav1d_mc_dsp_init_##bd##bpc(&dsp->mc); \
diff --git a/ffmpeg/JNI/dav1d/src/ext/x86/x86inc.asm b/ffmpeg/JNI/dav1d/src/ext/x86/x86inc.asm
index a6a8fb7c6..c252e5451 100644
--- a/ffmpeg/JNI/dav1d/src/ext/x86/x86inc.asm
+++ b/ffmpeg/JNI/dav1d/src/ext/x86/x86inc.asm
@@ -358,7 +358,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
 %define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
 %define high_mm_regs (16*cpuflag(avx512))
 
-%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
+%macro ALLOC_STACK 0-2 0, 0 ; stack_size, n_xmm_regs (for win64 only)
     %ifnum %1
         %if %1 != 0
             %assign %%pad 0
@@ -403,7 +403,7 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
     %endif
 %endmacro
 
-%macro SETUP_STACK_POINTER 1
+%macro SETUP_STACK_POINTER 0-1 0
     %ifnum %1
         %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
             %if %1 > 0
diff --git a/ffmpeg/JNI/dav1d/src/ipred_tmpl.c b/ffmpeg/JNI/dav1d/src/ipred_tmpl.c
index ef076f657..50c7a3c7b 100644
--- a/ffmpeg/JNI/dav1d/src/ipred_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/ipred_tmpl.c
@@ -133,7 +133,7 @@ static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
                              const int16_t *ac, const int alpha
                              HIGHBD_DECL_SUFFIX)
 {
-    unsigned dc = dc_gen_left(topleft, height);
+    const unsigned dc = dc_gen_left(topleft, height);
     cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
 }
 
@@ -625,16 +625,12 @@ static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
     assert(filt_idx < 5);
 
     const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
-    int x, y;
-    ptrdiff_t left_stride;
-    const pixel *left, *topleft, *top;
-
-    top = &topleft_in[1];
-    for (y = 0; y < height; y += 2) {
-        topleft = &topleft_in[-y];
-        left = &topleft[-1];
-        left_stride = -1;
-        for (x = 0; x < width; x += 4) {
+    const pixel *top = &topleft_in[1];
+    for (int y = 0; y < height; y += 2) {
+        const pixel *topleft = &topleft_in[-y];
+        const pixel *left = &topleft[-1];
+        ptrdiff_t left_stride = -1;
+        for (int x = 0; x < width; x += 4) {
             const int p0 = *topleft;
             const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
             const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
@@ -643,7 +639,7 @@ static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
 
             for (int yy = 0; yy < 2; yy++) {
                 for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) {
-                    int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6);
+                    const int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6);
                     ptr[xx] = iclip_pixel((acc + 8) >> 4);
                 }
                 ptr += PXSTRIDE(stride);
diff --git a/ffmpeg/JNI/dav1d/src/itx.h b/ffmpeg/JNI/dav1d/src/itx.h
index 3befc4209..a299629c5 100644
--- a/ffmpeg/JNI/dav1d/src/itx.h
+++ b/ffmpeg/JNI/dav1d/src/itx.h
@@ -43,8 +43,8 @@ typedef struct Dav1dInvTxfmDSPContext {
     itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL];
 } Dav1dInvTxfmDSPContext;
 
-bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c);
-bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c);
+bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc);
+bitfn_decls(void dav1d_itx_dsp_init_arm, Dav1dInvTxfmDSPContext *c, int bpc);
 bitfn_decls(void dav1d_itx_dsp_init_x86, Dav1dInvTxfmDSPContext *c);
 
 #endif /* DAV1D_SRC_ITX_H */
diff --git a/ffmpeg/JNI/dav1d/src/itx_1d.c b/ffmpeg/JNI/dav1d/src/itx_1d.c
index 87687007d..ca14fc8c4 100644
--- a/ffmpeg/JNI/dav1d/src/itx_1d.c
+++ b/ffmpeg/JNI/dav1d/src/itx_1d.c
@@ -119,13 +119,13 @@ inv_dct8_1d_internal_c(int32_t *const c, const ptrdiff_t stride,
         t7a = ((in1 * (4017 - 4096) + in7 *  799          + 2048) >> 12) + in1;
     }
 
-    int t4  = CLIP(t4a + t5a);
-        t5a = CLIP(t4a - t5a);
-    int t7  = CLIP(t7a + t6a);
-        t6a = CLIP(t7a - t6a);
+    const int t4  = CLIP(t4a + t5a);
+              t5a = CLIP(t4a - t5a);
+    const int t7  = CLIP(t7a + t6a);
+              t6a = CLIP(t7a - t6a);
 
-    int t5  = ((t6a - t5a) * 181 + 128) >> 8;
-    int t6  = ((t6a + t5a) * 181 + 128) >> 8;
+    const int t5  = ((t6a - t5a) * 181 + 128) >> 8;
+    const int t6  = ((t6a + t5a) * 181 + 128) >> 8;
 
     const int t0 = c[0 * stride];
     const int t1 = c[2 * stride];
@@ -812,23 +812,23 @@ inv_adst8_1d_internal_c(const int32_t *const in, const ptrdiff_t in_s,
     const int in4 = in[4 * in_s], in5 = in[5 * in_s];
     const int in6 = in[6 * in_s], in7 = in[7 * in_s];
 
-    int t0a = (((4076 - 4096) * in7 +   401         * in0 + 2048) >> 12) + in7;
-    int t1a = ((  401         * in7 - (4076 - 4096) * in0 + 2048) >> 12) - in0;
-    int t2a = (((3612 - 4096) * in5 +  1931         * in2 + 2048) >> 12) + in5;
-    int t3a = (( 1931         * in5 - (3612 - 4096) * in2 + 2048) >> 12) - in2;
-    int t4a =  ( 1299         * in3 +  1583         * in4 + 1024) >> 11;
-    int t5a =  ( 1583         * in3 -  1299         * in4 + 1024) >> 11;
-    int t6a = (( 1189         * in1 + (3920 - 4096) * in6 + 2048) >> 12) + in6;
-    int t7a = (((3920 - 4096) * in1 -  1189         * in6 + 2048) >> 12) + in1;
-
-    int t0 = CLIP(t0a + t4a);
-    int t1 = CLIP(t1a + t5a);
-    int t2 = CLIP(t2a + t6a);
-    int t3 = CLIP(t3a + t7a);
-    int t4 = CLIP(t0a - t4a);
-    int t5 = CLIP(t1a - t5a);
-    int t6 = CLIP(t2a - t6a);
-    int t7 = CLIP(t3a - t7a);
+    const int t0a = (((4076 - 4096) * in7 +   401         * in0 + 2048) >> 12) + in7;
+    const int t1a = ((  401         * in7 - (4076 - 4096) * in0 + 2048) >> 12) - in0;
+    const int t2a = (((3612 - 4096) * in5 +  1931         * in2 + 2048) >> 12) + in5;
+    const int t3a = (( 1931         * in5 - (3612 - 4096) * in2 + 2048) >> 12) - in2;
+          int t4a =  ( 1299         * in3 +  1583         * in4 + 1024) >> 11;
+          int t5a =  ( 1583         * in3 -  1299         * in4 + 1024) >> 11;
+          int t6a = (( 1189         * in1 + (3920 - 4096) * in6 + 2048) >> 12) + in6;
+          int t7a = (((3920 - 4096) * in1 -  1189         * in6 + 2048) >> 12) + in1;
+
+    const int t0 = CLIP(t0a + t4a);
+    const int t1 = CLIP(t1a + t5a);
+          int t2 = CLIP(t2a + t6a);
+          int t3 = CLIP(t3a + t7a);
+    const int t4 = CLIP(t0a - t4a);
+    const int t5 = CLIP(t1a - t5a);
+          int t6 = CLIP(t2a - t6a);
+          int t7 = CLIP(t3a - t7a);
 
     t4a = (((3784 - 4096) * t4 +  1567         * t5 + 2048) >> 12) + t4;
     t5a = (( 1567         * t4 - (3784 - 4096) * t5 + 2048) >> 12) - t5;
diff --git a/ffmpeg/JNI/dav1d/src/itx_tmpl.c b/ffmpeg/JNI/dav1d/src/itx_tmpl.c
index 02f34e85c..a0e807f95 100644
--- a/ffmpeg/JNI/dav1d/src/itx_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/itx_tmpl.c
@@ -180,7 +180,7 @@ static void inv_txfm_add_wht_wht_4x4_c(pixel *dst, const ptrdiff_t stride,
             dst[x] = iclip_pixel(dst[x] + *c++);
 }
 
-COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
+COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c, int bpc) {
 #define assign_itx_all_fn64(w, h, pfx) \
     c->itxfm_add[pfx##TX_##w##X##h][DCT_DCT  ] = \
         inv_txfm_add_dct_dct_##w##x##h##_c
@@ -224,8 +224,6 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
     c->itxfm_add[pfx##TX_##w##X##h][V_ADST] = \
         inv_txfm_add_identity_adst_##w##x##h##_c; \
 
-    memset(c, 0, sizeof(*c)); /* Zero unused function pointer elements. */
-
     c->itxfm_add[TX_4X4][WHT_WHT] = inv_txfm_add_wht_wht_4x4_c;
     assign_itx_all_fn84( 4,  4, );
     assign_itx_all_fn84( 4,  8, R);
@@ -249,7 +247,7 @@ COLD void bitfn(dav1d_itx_dsp_init)(Dav1dInvTxfmDSPContext *const c) {
 
 #if HAVE_ASM
 #if ARCH_AARCH64 || ARCH_ARM
-    bitfn(dav1d_itx_dsp_init_arm)(c);
+    bitfn(dav1d_itx_dsp_init_arm)(c, bpc);
 #endif
 #if ARCH_X86
     bitfn(dav1d_itx_dsp_init_x86)(c);
diff --git a/ffmpeg/JNI/dav1d/src/lib.c b/ffmpeg/JNI/dav1d/src/lib.c
index cda2e9df0..82af64a53 100644
--- a/ffmpeg/JNI/dav1d/src/lib.c
+++ b/ffmpeg/JNI/dav1d/src/lib.c
@@ -31,7 +31,7 @@
 #include <errno.h>
 #include <string.h>
 
-#ifdef __linux__
+#if defined(__linux__) && defined(HAVE_DLSYM)
 #include <dlfcn.h>
 #endif
 
@@ -81,7 +81,7 @@ static void close_internal(Dav1dContext **const c_out, int flush);
 
 NO_SANITIZE("cfi-icall") // CFI is broken with dlsym()
 static COLD size_t get_stack_size_internal(const pthread_attr_t *const thread_attr) {
-#if defined(__linux__) && defined(HAVE_DLSYM)
+#if defined(__linux__) && defined(HAVE_DLSYM) && defined(__GLIBC__)
     /* glibc has an issue where the size of the TLS is subtracted from the stack
      * size instead of allocated separately. As a result the specified stack
      * size may be insufficient when used in an application with large amounts
diff --git a/ffmpeg/JNI/dav1d/src/log.c b/ffmpeg/JNI/dav1d/src/log.c
index 999e3a2e8..de6776a61 100644
--- a/ffmpeg/JNI/dav1d/src/log.c
+++ b/ffmpeg/JNI/dav1d/src/log.c
@@ -36,13 +36,13 @@
 #include "src/internal.h"
 #include "src/log.h"
 
+#if CONFIG_LOG
 COLD void dav1d_log_default_callback(void *const cookie,
                                      const char *const format, va_list ap)
 {
     vfprintf(stderr, format, ap);
 }
 
-#if CONFIG_LOG
 COLD void dav1d_log(Dav1dContext *const c, const char *const format, ...) {
     validate_input(c != NULL);
 
diff --git a/ffmpeg/JNI/dav1d/src/log.h b/ffmpeg/JNI/dav1d/src/log.h
index 8f6357cb6..df32de7f2 100644
--- a/ffmpeg/JNI/dav1d/src/log.h
+++ b/ffmpeg/JNI/dav1d/src/log.h
@@ -35,12 +35,12 @@
 
 #include "common/attributes.h"
 
-void dav1d_log_default_callback(void *cookie, const char *format, va_list ap);
-
 #if CONFIG_LOG
 #define dav1d_log dav1d_log
+void dav1d_log_default_callback(void *cookie, const char *format, va_list ap);
 void dav1d_log(Dav1dContext *c, const char *format, ...) ATTR_FORMAT_PRINTF(2, 3);
 #else
+#define dav1d_log_default_callback NULL
 #define dav1d_log(...) do { } while(0)
 #endif
 
diff --git a/ffmpeg/JNI/dav1d/src/lr_apply_tmpl.c b/ffmpeg/JNI/dav1d/src/lr_apply_tmpl.c
index 62eee81ed..02413b913 100644
--- a/ffmpeg/JNI/dav1d/src/lr_apply_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/lr_apply_tmpl.c
@@ -73,11 +73,11 @@ static void backup_lpf(const Dav1dFrameContext *const f,
     dst += 4 * PXSTRIDE(dst_stride);
     src += (stripe_h - 2) * PXSTRIDE(src_stride);
 
-    if (f->frame_hdr->super_res.enabled) {
+    if (f->frame_hdr->width[0] != f->frame_hdr->width[1]) {
         while (row + stripe_h <= row_h) {
             const int n_lines = 4 - (row + stripe_h + 1 == h);
             f->dsp->mc.resize(dst, dst_stride, src, src_stride,
-                              dst_w, src_w, n_lines, f->resize_step[ss_hor],
+                              dst_w, n_lines, src_w, f->resize_step[ss_hor],
                               f->resize_start[ss_hor] HIGHBD_CALL_SUFFIX);
             row += stripe_h; // unmodified stripe_h for the 1st stripe
             stripe_h = 64 >> ss_ver;
diff --git a/ffmpeg/JNI/dav1d/src/mc.h b/ffmpeg/JNI/dav1d/src/mc.h
index 33baea6b2..784b58d22 100644
--- a/ffmpeg/JNI/dav1d/src/mc.h
+++ b/ffmpeg/JNI/dav1d/src/mc.h
@@ -110,7 +110,7 @@ typedef decl_emu_edge_fn(*emu_edge_fn);
 #define decl_resize_fn(name) \
 void (name)(pixel *dst, ptrdiff_t dst_stride, \
             const pixel *src, ptrdiff_t src_stride, \
-            int dst_w, int src_w, int h, int dx, int mx HIGHBD_DECL_SUFFIX)
+            int dst_w, int h, int src_w, int dx, int mx HIGHBD_DECL_SUFFIX)
 typedef decl_resize_fn(*resize_fn);
 
 typedef struct Dav1dMCDSPContext {
diff --git a/ffmpeg/JNI/dav1d/src/mc_tmpl.c b/ffmpeg/JNI/dav1d/src/mc_tmpl.c
index 20bef0d7f..c4d9e14eb 100644
--- a/ffmpeg/JNI/dav1d/src/mc_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/mc_tmpl.c
@@ -885,21 +885,21 @@ static void emu_edge_c(const intptr_t bw, const intptr_t bh,
 
 static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
                      const pixel *src, const ptrdiff_t src_stride,
-                     const int dst_w, const int src_w, int h,
+                     const int dst_w, int h, const int src_w,
                      const int dx, const int mx0 HIGHBD_DECL_SUFFIX)
 {
     do {
         int mx = mx0, src_x = -1;
         for (int x = 0; x < dst_w; x++) {
-            const int16_t *const F = dav1d_resize_filter[mx >> 8];
-            dst[x] = iclip_pixel((F[0] * src[iclip(src_x - 3, 0, src_w - 1)] +
-                                  F[1] * src[iclip(src_x - 2, 0, src_w - 1)] +
-                                  F[2] * src[iclip(src_x - 1, 0, src_w - 1)] +
-                                  F[3] * src[iclip(src_x + 0, 0, src_w - 1)] +
-                                  F[4] * src[iclip(src_x + 1, 0, src_w - 1)] +
-                                  F[5] * src[iclip(src_x + 2, 0, src_w - 1)] +
-                                  F[6] * src[iclip(src_x + 3, 0, src_w - 1)] +
-                                  F[7] * src[iclip(src_x + 4, 0, src_w - 1)] +
+            const int8_t *const F = dav1d_resize_filter[mx >> 8];
+            dst[x] = iclip_pixel((-(F[0] * src[iclip(src_x - 3, 0, src_w - 1)] +
+                                    F[1] * src[iclip(src_x - 2, 0, src_w - 1)] +
+                                    F[2] * src[iclip(src_x - 1, 0, src_w - 1)] +
+                                    F[3] * src[iclip(src_x + 0, 0, src_w - 1)] +
+                                    F[4] * src[iclip(src_x + 1, 0, src_w - 1)] +
+                                    F[5] * src[iclip(src_x + 2, 0, src_w - 1)] +
+                                    F[6] * src[iclip(src_x + 3, 0, src_w - 1)] +
+                                    F[7] * src[iclip(src_x + 4, 0, src_w - 1)]) +
                                   64) >> 7);
             mx += dx;
             src_x += mx >> 14;
diff --git a/ffmpeg/JNI/dav1d/src/meson.build b/ffmpeg/JNI/dav1d/src/meson.build
index d4df49308..fd8ad0269 100644
--- a/ffmpeg/JNI/dav1d/src/meson.build
+++ b/ffmpeg/JNI/dav1d/src/meson.build
@@ -102,6 +102,8 @@ if is_asm_enabled
         )
         if host_machine.cpu_family() == 'aarch64'
             libdav1d_sources += files(
+                # itx.S is used for both 8 and 16 bpc.
+                'arm/64/itx.S',
                 'arm/64/looprestoration_common.S',
                 'arm/64/msac.S',
             )
@@ -110,7 +112,6 @@ if is_asm_enabled
                 libdav1d_sources += files(
                     'arm/64/cdef.S',
                     'arm/64/ipred.S',
-                    'arm/64/itx.S',
                     'arm/64/loopfilter.S',
                     'arm/64/looprestoration.S',
                     'arm/64/mc.S',
@@ -120,6 +121,8 @@ if is_asm_enabled
             if dav1d_bitdepths.contains('16')
                 libdav1d_sources += files(
                     'arm/64/cdef16.S',
+                    'arm/64/ipred16.S',
+                    'arm/64/itx16.S',
                     'arm/64/loopfilter16.S',
                     'arm/64/looprestoration16.S',
                     'arm/64/mc16.S',
@@ -127,12 +130,14 @@ if is_asm_enabled
             endif
         elif host_machine.cpu_family().startswith('arm')
             libdav1d_sources += files(
+                'arm/32/msac.S',
             )
 
             if dav1d_bitdepths.contains('8')
                 libdav1d_sources += files(
                     'arm/32/cdef.S',
                     'arm/32/ipred.S',
+                    'arm/32/itx.S',
                     'arm/32/loopfilter.S',
                     'arm/32/looprestoration.S',
                     'arm/32/mc.S',
@@ -148,14 +153,9 @@ if is_asm_enabled
 
         libdav1d_sources += files(
             'x86/cpu.c',
+            'x86/msac_init.c',
         )
 
-        if host_machine.cpu_family() == 'x86_64'
-            libdav1d_sources += files(
-                'x86/msac_init.c',
-            )
-        endif
-
         libdav1d_tmpl_sources += files(
             'x86/cdef_init_tmpl.c',
             'x86/film_grain_init_tmpl.c',
@@ -174,7 +174,8 @@ if is_asm_enabled
 
         if dav1d_bitdepths.contains('8')
             libdav1d_sources_asm += files(
-                'x86/cdef.asm',
+                'x86/cdef_avx512.asm',
+                'x86/cdef_avx2.asm',
                 'x86/film_grain.asm',
                 'x86/ipred.asm',
                 'x86/itx.asm',
@@ -187,7 +188,7 @@ if is_asm_enabled
                 'x86/itx_ssse3.asm',
                 'x86/loopfilter_ssse3.asm',
                 'x86/looprestoration_ssse3.asm',
-                'x86/mc_ssse3.asm',
+                'x86/mc_sse.asm',
             )
         endif
 
diff --git a/ffmpeg/JNI/dav1d/src/msac.c b/ffmpeg/JNI/dav1d/src/msac.c
index 0a0ef04a1..8195977d5 100644
--- a/ffmpeg/JNI/dav1d/src/msac.c
+++ b/ffmpeg/JNI/dav1d/src/msac.c
@@ -38,7 +38,7 @@
 
 #define EC_WIN_SIZE (sizeof(ec_win) << 3)
 
-static inline void ctx_refill(MsacContext *s) {
+static inline void ctx_refill(MsacContext *const s) {
     const uint8_t *buf_pos = s->buf_pos;
     const uint8_t *buf_end = s->buf_end;
     int c = EC_WIN_SIZE - s->cnt - 24;
@@ -57,7 +57,9 @@ static inline void ctx_refill(MsacContext *s) {
  * necessary), and stores them back in the decoder context.
  * dif: The new value of dif.
  * rng: The new value of the range. */
-static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) {
+static inline void ctx_norm(MsacContext *const s, const ec_win dif,
+                            const unsigned rng)
+{
     const int d = 15 ^ (31 ^ clz(rng));
     assert(rng <= 65535U);
     s->cnt -= d;
@@ -68,16 +70,16 @@ static inline void ctx_norm(MsacContext *s, ec_win dif, unsigned rng) {
 }
 
 unsigned dav1d_msac_decode_bool_equi_c(MsacContext *const s) {
-    ec_win vw, dif = s->dif;
-    unsigned ret, v, r = s->rng;
+    const unsigned r = s->rng;
+    ec_win dif = s->dif;
     assert((dif >> (EC_WIN_SIZE - 16)) < r);
     // When the probability is 1/2, f = 16384 >> EC_PROB_SHIFT = 256 and we can
     // replace the multiply with a simple shift.
-    v = ((r >> 8) << 7) + EC_MIN_PROB;
-    vw   = (ec_win)v << (EC_WIN_SIZE - 16);
-    ret  = dif >= vw;
-    dif -= ret*vw;
-    v   += ret*(r - 2*v);
+    unsigned v = ((r >> 8) << 7) + EC_MIN_PROB;
+    const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16);
+    const unsigned ret = dif >= vw;
+    dif -= ret * vw;
+    v += ret * (r - 2 * v);
     ctx_norm(s, dif, v);
     return !ret;
 }
@@ -86,14 +88,14 @@ unsigned dav1d_msac_decode_bool_equi_c(MsacContext *const s) {
  * f: The probability that the bit is one
  * Return: The value decoded (0 or 1). */
 unsigned dav1d_msac_decode_bool_c(MsacContext *const s, const unsigned f) {
-    ec_win vw, dif = s->dif;
-    unsigned ret, v, r = s->rng;
+    const unsigned r = s->rng;
+    ec_win dif = s->dif;
     assert((dif >> (EC_WIN_SIZE - 16)) < r);
-    v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
-    vw   = (ec_win)v << (EC_WIN_SIZE - 16);
-    ret  = dif >= vw;
-    dif -= ret*vw;
-    v   += ret*(r - 2*v);
+    unsigned v = ((r >> 8) * (f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) + EC_MIN_PROB;
+    const ec_win vw = (ec_win)v << (EC_WIN_SIZE - 16);
+    const unsigned ret = dif >= vw;
+    dif -= ret * vw;
+    v += ret * (r - 2 * v);
     ctx_norm(s, dif, v);
     return !ret;
 }
@@ -196,12 +198,11 @@ void dav1d_msac_init(MsacContext *const s, const uint8_t *const data,
     s->rng = 0x8000;
     s->cnt = -15;
     s->allow_update_cdf = !disable_cdf_update_flag;
+    ctx_refill(s);
 
 #if ARCH_X86_64 && HAVE_ASM
     s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt_c;
 
     dav1d_msac_init_x86(s);
 #endif
-
-    ctx_refill(s);
 }
diff --git a/ffmpeg/JNI/dav1d/src/obu.c b/ffmpeg/JNI/dav1d/src/obu.c
index 4406f4bc2..ab9688c25 100644
--- a/ffmpeg/JNI/dav1d/src/obu.c
+++ b/ffmpeg/JNI/dav1d/src/obu.c
@@ -85,7 +85,7 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
             hdr->time_scale = dav1d_get_bits(gb, 32);
             hdr->equal_picture_interval = dav1d_get_bits(gb, 1);
             if (hdr->equal_picture_interval) {
-                unsigned num_ticks_per_picture = dav1d_get_vlc(gb);
+                const unsigned num_ticks_per_picture = dav1d_get_vlc(gb);
                 if (num_ticks_per_picture == 0xFFFFFFFFU)
                     goto error;
                 hdr->num_ticks_per_picture = num_ticks_per_picture + 1;
@@ -111,8 +111,6 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
         for (int i = 0; i < hdr->num_operating_points; i++) {
             struct Dav1dSequenceHeaderOperatingPoint *const op =
                 &hdr->operating_points[i];
-            struct Dav1dSequenceHeaderOperatingParameterInfo *const opi =
-                &hdr->operating_parameter_info[i];
             op->idc = dav1d_get_bits(gb, 12);
             op->major_level = 2 + dav1d_get_bits(gb, 3);
             op->minor_level = dav1d_get_bits(gb, 2);
@@ -120,6 +118,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
             op->decoder_model_param_present =
                 hdr->decoder_model_info_present && dav1d_get_bits(gb, 1);
             if (op->decoder_model_param_present) {
+                struct Dav1dSequenceHeaderOperatingParameterInfo *const opi =
+                    &hdr->operating_parameter_info[i];
                 opi->decoder_buffer_delay =
                     dav1d_get_bits(gb, hdr->encoder_decoder_buffer_delay_length);
                 opi->encoder_buffer_delay =
@@ -132,10 +132,9 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
                 op->initial_display_delay = dav1d_get_bits(gb, 4) + 1;
             }
         }
-        if (c->operating_point < hdr->num_operating_points)
-            c->operating_point_idc = hdr->operating_points[c->operating_point].idc;
-        else
-            c->operating_point_idc = hdr->operating_points[0].idc;
+        const int op_idx =
+            c->operating_point < hdr->num_operating_points ? c->operating_point : 0;
+        c->operating_point_idc = hdr->operating_points[op_idx].idc;
 #if DEBUG_SEQ_HDR
         printf("SEQHDR: post-operating-points: off=%ld\n",
                dav1d_get_bits_pos(gb) - init_bit_pos);
@@ -295,7 +294,7 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
     if (use_ref) {
         for (int i = 0; i < 7; i++) {
             if (dav1d_get_bits(gb, 1)) {
-                Dav1dThreadPicture *const ref =
+                const Dav1dThreadPicture *const ref =
                     &c->refs[c->frame_hdr->refidx[i]].p;
                 if (!ref->p.data[0]) return -1;
                 hdr->width[1] = ref->p.p.w;
@@ -343,7 +342,7 @@ static int read_frame_size(Dav1dContext *const c, GetBits *const gb,
     return 0;
 }
 
-static inline int tile_log2(int sz, int tgt) {
+static inline int tile_log2(const int sz, const int tgt) {
     int k;
     for (k = 0; (sz << k) < tgt; k++) ;
     return k;
@@ -362,7 +361,6 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
 #endif
     const Dav1dSequenceHeader *const seqhdr = c->seq_hdr;
     Dav1dFrameHeader *const hdr = c->frame_hdr;
-    int res;
 
     hdr->show_existing_frame =
         !seqhdr->reduced_still_picture_header && dav1d_get_bits(gb, 1);
@@ -444,7 +442,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
         if (hdr->refresh_frame_flags != 0xff && hdr->error_resilient_mode && seqhdr->order_hint)
             for (int i = 0; i < 8; i++)
                 dav1d_get_bits(gb, seqhdr->order_hint_n_bits);
-        if ((res = read_frame_size(c, gb, 0)) < 0) goto error;
+        if (read_frame_size(c, gb, 0) < 0) goto error;
         hdr->allow_intrabc = hdr->allow_screen_content_tools &&
                              !hdr->super_res.enabled && dav1d_get_bits(gb, 1);
         hdr->use_ref_frame_mvs = 0;
@@ -479,7 +477,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
 
             int latest_frame_offset = -1;
             for (int i = 0; i < 8; i++) {
-                int hint = shifted_frame_offset[i];
+                const int hint = shifted_frame_offset[i];
                 if (!used_frame[i] && hint >= current_frame_offset &&
                     hint >= latest_frame_offset)
                 {
@@ -492,7 +490,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
 
             int earliest_frame_offset = INT_MAX;
             for (int i = 0; i < 8; i++) {
-                int hint = shifted_frame_offset[i];
+                const int hint = shifted_frame_offset[i];
                 if (!used_frame[i] && hint >= current_frame_offset &&
                     hint < earliest_frame_offset)
                 {
@@ -505,7 +503,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
 
             earliest_frame_offset = INT_MAX;
             for (int i = 0; i < 8; i++) {
-                int hint = shifted_frame_offset[i];
+                const int hint = shifted_frame_offset[i];
                 if (!used_frame[i] && hint >= current_frame_offset &&
                     (hint < earliest_frame_offset))
                 {
@@ -520,7 +518,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
                 if (hdr->refidx[i] < 0) {
                     latest_frame_offset = -1;
                     for (int j = 0; j < 8; j++) {
-                        int hint = shifted_frame_offset[j];
+                        const int hint = shifted_frame_offset[j];
                         if (!used_frame[j] && hint < current_frame_offset &&
                             hint >= latest_frame_offset)
                         {
@@ -536,7 +534,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
             earliest_frame_offset = INT_MAX;
             int ref = -1;
             for (int i = 0; i < 8; i++) {
-                int hint = shifted_frame_offset[i];
+                const int hint = shifted_frame_offset[i];
                 if (hint < earliest_frame_offset) {
                     ref = i;
                     earliest_frame_offset = hint;
@@ -555,7 +553,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
         }
         const int use_ref = !hdr->error_resilient_mode &&
                             hdr->frame_size_override;
-        if ((res = read_frame_size(c, gb, use_ref)) < 0) goto error;
+        if (read_frame_size(c, gb, use_ref) < 0) goto error;
         hdr->hp = !hdr->force_integer_mv && dav1d_get_bits(gb, 1);
         hdr->subpel_filter_mode = dav1d_get_bits(gb, 1) ? DAV1D_FILTER_SWITCHABLE :
                                                           dav1d_get_bits(gb, 2);
@@ -579,15 +577,15 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
     // tile data
     hdr->tiling.uniform = dav1d_get_bits(gb, 1);
     const int sbsz_min1 = (64 << seqhdr->sb128) - 1;
-    int sbsz_log2 = 6 + seqhdr->sb128;
-    int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2;
-    int sbh = (hdr->height + sbsz_min1) >> sbsz_log2;
-    int max_tile_width_sb = 4096 >> sbsz_log2;
-    int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2);
+    const int sbsz_log2 = 6 + seqhdr->sb128;
+    const int sbw = (hdr->width[0] + sbsz_min1) >> sbsz_log2;
+    const int sbh = (hdr->height + sbsz_min1) >> sbsz_log2;
+    const int max_tile_width_sb = 4096 >> sbsz_log2;
+    const int max_tile_area_sb = 4096 * 2304 >> (2 * sbsz_log2);
     hdr->tiling.min_log2_cols = tile_log2(max_tile_width_sb, sbw);
     hdr->tiling.max_log2_cols = tile_log2(1, imin(sbw, DAV1D_MAX_TILE_COLS));
     hdr->tiling.max_log2_rows = tile_log2(1, imin(sbh, DAV1D_MAX_TILE_ROWS));
-    int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh),
+    const int min_log2_tiles = imax(tile_log2(max_tile_area_sb, sbw * sbh),
                               hdr->tiling.min_log2_cols);
     if (hdr->tiling.uniform) {
         for (hdr->tiling.log2_cols = hdr->tiling.min_log2_cols;
@@ -621,7 +619,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
         }
         hdr->tiling.log2_cols = tile_log2(1, hdr->tiling.cols);
         if (min_log2_tiles) max_tile_area_sb >>= min_log2_tiles + 1;
-        int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1);
+        const int max_tile_height_sb = imax(max_tile_area_sb / widest_tile, 1);
 
         hdr->tiling.rows = 0;
         for (int sby = 0; sby < sbh && hdr->tiling.rows < DAV1D_MAX_TILE_ROWS; hdr->tiling.rows++) {
@@ -657,7 +655,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
         // If the sequence header says that delta_q might be different
         // for U, V, we must check whether it actually is for this
         // frame.
-        int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 1) : 0;
+        const int diff_uv_delta = seqhdr->separate_uv_delta_q ? dav1d_get_bits(gb, 1) : 0;
         hdr->quant.udc_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
         hdr->quant.uac_delta = dav1d_get_bits(gb, 1) ? dav1d_get_sbits(gb, 6) : 0;
         if (diff_uv_delta) {
@@ -1053,7 +1051,7 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
             for (i = 0; i < 7; i++)
                 if (hdr->refidx[i] == refidx)
                     break;
-            if (i == 7 || !c->refs[refidx].p.p.frame_hdr)  goto error;
+            if (i == 7 || !c->refs[refidx].p.p.frame_hdr) goto error;
             hdr->film_grain.data = c->refs[refidx].p.p.frame_hdr->film_grain.data;
             hdr->film_grain.data.seed = seed;
         } else {
@@ -1133,10 +1131,8 @@ static int parse_frame_hdr(Dav1dContext *const c, GetBits *const gb) {
 }
 
 static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
-    int have_tile_pos = 0;
     const int n_tiles = c->frame_hdr->tiling.cols * c->frame_hdr->tiling.rows;
-    if (n_tiles > 1)
-        have_tile_pos = dav1d_get_bits(gb, 1);
+    const int have_tile_pos = n_tiles > 1 ? dav1d_get_bits(gb, 1) : 0;
 
     if (have_tile_pos) {
         const int n_bits = c->frame_hdr->tiling.log2_cols +
@@ -1151,9 +1147,9 @@ static void parse_tile_hdr(Dav1dContext *const c, GetBits *const gb) {
 
 // Check that we haven't read more than obu_len bytes from the buffer
 // since init_bit_pos.
-static int
-check_for_overrun(Dav1dContext *const c, GetBits *const gb,
-                  unsigned init_bit_pos, unsigned obu_len)
+static int check_for_overrun(Dav1dContext *const c, GetBits *const gb,
+                             const unsigned init_bit_pos,
+                             const unsigned obu_len)
 {
     // Make sure we haven't actually read past the end of the gb buffer
     if (gb->error) {
@@ -1161,7 +1157,7 @@ check_for_overrun(Dav1dContext *const c, GetBits *const gb,
         return 1;
     }
 
-    unsigned pos = dav1d_get_bits_pos(gb);
+    const unsigned pos = dav1d_get_bits_pos(gb);
 
     // We assume that init_bit_pos was the bit position of the buffer
     // at some point in the past, so cannot be smaller than pos.
@@ -1175,7 +1171,7 @@ check_for_overrun(Dav1dContext *const c, GetBits *const gb,
     return 0;
 }
 
-int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
+int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int global) {
     GetBits gb;
     int res;
 
@@ -1196,11 +1192,8 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
     }
 
     // obu length field
-    unsigned len = 0;
-    if (has_length_field)
-        len = dav1d_get_uleb128(&gb);
-    else
-        len = (int) in->sz - 1 - has_extension;
+    const unsigned len = has_length_field ?
+        dav1d_get_uleb128(&gb) : (unsigned) in->sz - 1 - has_extension;
     if (gb.error) goto error;
 
     const unsigned init_bit_pos = dav1d_get_bits_pos(&gb);
@@ -1442,7 +1435,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, int global) {
             payload_size -= meta_type_len;
 
             int country_code_extension_byte = 0;
-            int country_code = dav1d_get_bits(&gb, 8);
+            const int country_code = dav1d_get_bits(&gb, 8);
             payload_size--;
             if (country_code == 0xFF) {
                 country_code_extension_byte = dav1d_get_bits(&gb, 8);
diff --git a/ffmpeg/JNI/dav1d/src/picture.c b/ffmpeg/JNI/dav1d/src/picture.c
index 82197c34d..72af92e94 100644
--- a/ffmpeg/JNI/dav1d/src/picture.c
+++ b/ffmpeg/JNI/dav1d/src/picture.c
@@ -68,7 +68,7 @@ int dav1d_default_picture_alloc(Dav1dPicture *const p, void *const cookie) {
     const size_t y_sz = y_stride * aligned_h;
     const size_t uv_sz = uv_stride * (aligned_h >> ss_ver);
     const size_t pic_size = y_sz + 2 * uv_sz + DAV1D_PICTURE_ALIGNMENT;
-    uint8_t *data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT);
+    uint8_t *const data = dav1d_alloc_aligned(pic_size, DAV1D_PICTURE_ALIGNMENT);
     if (!data) return DAV1D_ERR(ENOMEM);
 
     p->data[0] = data;
@@ -104,14 +104,16 @@ static void free_buffer(const uint8_t *const data, void *const user_data) {
     free(pic_ctx);
 }
 
-static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p,
+static int picture_alloc_with_edges(Dav1dContext *const c,
+                                    Dav1dPicture *const p,
                                     const int w, const int h,
-                                    Dav1dSequenceHeader *seq_hdr, Dav1dRef *seq_hdr_ref,
-                                    Dav1dFrameHeader *frame_hdr,  Dav1dRef *frame_hdr_ref,
-                                    Dav1dContentLightLevel *content_light, Dav1dRef *content_light_ref,
-                                    Dav1dMasteringDisplay *mastering_display, Dav1dRef *mastering_display_ref,
-                                    Dav1dITUTT35 *itut_t35, Dav1dRef *itut_t35_ref,
-                                    const int bpc, const Dav1dDataProps *props,
+                                    Dav1dSequenceHeader *const seq_hdr, Dav1dRef *const seq_hdr_ref,
+                                    Dav1dFrameHeader *const frame_hdr, Dav1dRef *const frame_hdr_ref,
+                                    Dav1dContentLightLevel *const content_light, Dav1dRef *const content_light_ref,
+                                    Dav1dMasteringDisplay *const mastering_display, Dav1dRef *const mastering_display_ref,
+                                    Dav1dITUTT35 *const itut_t35, Dav1dRef *const itut_t35_ref,
+                                    const int bpc,
+                                    const Dav1dDataProps *const props,
                                     Dav1dPicAllocator *const p_allocator,
                                     const size_t extra, void **const extra_ptr)
 {
@@ -122,9 +124,8 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
     assert(bpc > 0 && bpc <= 16);
 
     struct pic_ctx_context *pic_ctx = malloc(extra + sizeof(struct pic_ctx_context));
-    if (pic_ctx == NULL) {
+    if (pic_ctx == NULL)
         return DAV1D_ERR(ENOMEM);
-    }
 
     p->p.w = w;
     p->p.h = h;
@@ -136,7 +137,7 @@ static int picture_alloc_with_edges(Dav1dContext *const c, Dav1dPicture *const p
     p->p.layout = seq_hdr->layout;
     p->p.bpc = bpc;
     dav1d_data_props_set_defaults(&p->m);
-    int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
+    const int res = p_allocator->alloc_picture_callback(p, p_allocator->cookie);
     if (res < 0) {
         free(pic_ctx);
         return res;
@@ -250,8 +251,8 @@ void dav1d_picture_move_ref(Dav1dPicture *const dst, Dav1dPicture *const src) {
     memset(src, 0, sizeof(*src));
 }
 
-void dav1d_thread_picture_ref(Dav1dThreadPicture *dst,
-                              const Dav1dThreadPicture *src)
+void dav1d_thread_picture_ref(Dav1dThreadPicture *const dst,
+                              const Dav1dThreadPicture *const src)
 {
     dav1d_picture_ref(&dst->p, &src->p);
     dst->t = src->t;
diff --git a/ffmpeg/JNI/dav1d/src/recon_tmpl.c b/ffmpeg/JNI/dav1d/src/recon_tmpl.c
index 9feda96a6..8e96f8e16 100644
--- a/ffmpeg/JNI/dav1d/src/recon_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/recon_tmpl.c
@@ -777,8 +777,8 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTileContext *const t,
     const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
 
     for (int init_y = 0; init_y < h4; init_y += 16) {
+        const int sub_h4 = imin(h4, 16 + init_y);
         for (int init_x = 0; init_x < w4; init_x += 16) {
-            const int sub_h4 = imin(h4, 16 + init_y);
             const int sub_w4 = imin(w4, init_x + 16);
             int y_off = !!init_y, y, x;
             for (y = init_y, t->by += init_y; y < sub_h4;
@@ -932,8 +932,8 @@ static int mc(Dav1dTileContext *const t,
     } else {
         assert(refp != &f->sr_cur);
 
-        int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
-        int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
+        const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
+        const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
 #define scale_mv(res, val, scale) do { \
             const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
             res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32;     \
@@ -1071,15 +1071,15 @@ static int warp_affine(Dav1dTileContext *const t,
     const int height = (refp->p.p.h + ss_ver) >> ss_ver;
 
     for (int y = 0; y < b_dim[1] * v_mul; y += 8) {
+        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
+        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
+        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];
         for (int x = 0; x < b_dim[0] * h_mul; x += 8) {
             // calculate transformation relative to center of 8x8 block in
             // luma pixel units
             const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
-            const int src_y = t->by * 4 + ((y + 4) << ss_ver);
-            const int64_t mvx = ((int64_t) mat[2] * src_x +
-                                 (int64_t) mat[3] * src_y + mat[0]) >> ss_hor;
-            const int64_t mvy = ((int64_t) mat[4] * src_x +
-                                 (int64_t) mat[5] * src_y + mat[1]) >> ss_ver;
+            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
+            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
 
             const int dx = (int) (mvx >> 16) - 4;
             const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
@@ -1147,6 +1147,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
     const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
 
     for (int init_y = 0; init_y < h4; init_y += 16) {
+        const int sub_h4 = imin(h4, 16 + init_y);
+        const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
         for (int init_x = 0; init_x < w4; init_x += 16) {
             if (b->pal_sz[0]) {
                 pixel *dst = ((pixel *) f->cur.data[0]) +
@@ -1177,7 +1179,6 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
             const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
                               intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
             int y, x;
-            const int sub_h4 = imin(h4, 16 + init_y);
             const int sub_w4 = imin(w4, init_x + 16);
             for (y = init_y, t->by += init_y; y < sub_h4;
                  y += t_dim->h, t->by += t_dim->h)
@@ -1345,8 +1346,8 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
                     hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
                 }
             } else if (b->pal_sz[1]) {
-                ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
-                                           (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
+                const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
+                                              (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
                 const uint16_t (*pal)[8];
                 const uint8_t *pal_idx;
                 if (f->frame_thread.pass) {
@@ -1384,7 +1385,6 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
             const int uv_sb_has_bl =
                 init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
                 intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
-            const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
             const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
             for (int pl = 0; pl < 2; pl++) {
                 for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
@@ -1520,7 +1520,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTileContext *const t, const enum BlockSize
 }
 
 int bytefn(dav1d_recon_b_inter)(Dav1dTileContext *const t, const enum BlockSize bs,
-                                 const Av1Block *const b)
+                                const Av1Block *const b)
 {
     Dav1dTileState *const ts = t->ts;
     const Dav1dFrameContext *const f = t->f;
@@ -2013,9 +2013,10 @@ void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
             const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
             const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
 
-            f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w, src_w,
-                              imin(img_h, h_end) + h_start, f->resize_step[!!pl],
-                              f->resize_start[!!pl] HIGHBD_CALL_SUFFIX);
+            f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
+                              imin(img_h, h_end) + h_start, src_w,
+                              f->resize_step[!!pl], f->resize_start[!!pl]
+                              HIGHBD_CALL_SUFFIX);
         }
     }
     if (f->lf.restore_planes) {
diff --git a/ffmpeg/JNI/dav1d/src/ref.c b/ffmpeg/JNI/dav1d/src/ref.c
index 89b158047..32cc96f08 100644
--- a/ffmpeg/JNI/dav1d/src/ref.c
+++ b/ffmpeg/JNI/dav1d/src/ref.c
@@ -37,25 +37,21 @@ static void default_free_callback(const uint8_t *const data, void *const user_da
 }
 
 Dav1dRef *dav1d_ref_create(const size_t size) {
-    Dav1dRef *res;
     void *data = dav1d_alloc_aligned(size, 32);
-    if (!data) {
-        return NULL;
-    }
+    if (!data) return NULL;
 
-    res = dav1d_ref_wrap(data, default_free_callback, data);
-    if (!res) {
-        dav1d_free_aligned(data);
-    } else {
+    Dav1dRef *const res = dav1d_ref_wrap(data, default_free_callback, data);
+    if (res)
         res->data = data;
-    }
+    else
+        dav1d_free_aligned(data);
 
     return res;
 }
 
 Dav1dRef *dav1d_ref_wrap(const uint8_t *const ptr,
                          void (*free_callback)(const uint8_t *data, void *user_data),
-                         void *user_data)
+                         void *const user_data)
 {
     Dav1dRef *res = malloc(sizeof(Dav1dRef));
     if (!res) return NULL;
diff --git a/ffmpeg/JNI/dav1d/src/refmvs.c b/ffmpeg/JNI/dav1d/src/refmvs.c
index 2039bed4f..1e113b4ea 100644
--- a/ffmpeg/JNI/dav1d/src/refmvs.c
+++ b/ffmpeg/JNI/dav1d/src/refmvs.c
@@ -182,10 +182,13 @@ static inline union mv mv_projection(const union mv mv, const int num, const int
     };
     assert(den > 0 && den < 32);
     assert(num > -32 && num < 32);
-    const int dm = div_mult[den];
-    const int y = mv.y * num * dm, x = mv.x * num * dm;
-    return (union mv) { .y = (y + 8192 + (y >> 31)) >> 14,
-                        .x = (x + 8192 + (x >> 31)) >> 14 };
+    const int frac = num * div_mult[den];
+    const int y = mv.y * frac, x = mv.x * frac;
+    // Round and clip according to AV1 spec section 7.9.3
+    return (union mv) { // 0x3fff == (1 << 14) - 1
+        .y = iclip((y + 8192 + (y >> 31)) >> 14, -0x3fff, 0x3fff),
+        .x = iclip((x + 8192 + (x >> 31)) >> 14, -0x3fff, 0x3fff)
+    };
 }
 
 static void add_temporal_candidate(const refmvs_frame *const rf,
diff --git a/ffmpeg/JNI/dav1d/src/tables.c b/ffmpeg/JNI/dav1d/src/tables.c
index 629deba8e..30d9fa6ae 100644
--- a/ffmpeg/JNI/dav1d/src/tables.c
+++ b/ffmpeg/JNI/dav1d/src/tables.c
@@ -442,7 +442,7 @@ const uint8_t ALIGN(dav1d_sgr_x_by_x[256], 16) = {
       0
 };
 
-const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
+const int8_t ALIGN(dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8], 8) = {
     [DAV1D_FILTER_8TAP_REGULAR] = {
         {   0,   1,  -3,  63,   4,  -1,   0,   0 },
         {   0,   1,  -5,  61,   9,  -2,   0,   0 },
@@ -524,6 +524,27 @@ const int8_t ALIGN(dav1d_mc_subpel_filters[5][15][8], 8) = {
         {   0,   0,   2,  20,  31,  11,   0,   0 },
         {   0,   0,   2,  18,  31,  13,   0,   0 },
         {   0,   0,   1,  17,  31,  15,   0,   0 }
+#if ARCH_X86_64
+    /* Scaled bilinear prediction is very rarely used, so add a new table
+     * entry and reuse the put/prep_8tap_scaled code, which then acts as
+     * a scaled bilinear filter. */
+    }, [5] = {
+        {   0,   0,   0, 60,   4,   0,   0,   0 },
+        {   0,   0,   0, 56,   8,   0,   0,   0 },
+        {   0,   0,   0, 52,  12,   0,   0,   0 },
+        {   0,   0,   0, 48,  16,   0,   0,   0 },
+        {   0,   0,   0, 44,  20,   0,   0,   0 },
+        {   0,   0,   0, 40,  24,   0,   0,   0 },
+        {   0,   0,   0, 36,  28,   0,   0,   0 },
+        {   0,   0,   0, 32,  32,   0,   0,   0 },
+        {   0,   0,   0, 28,  36,   0,   0,   0 },
+        {   0,   0,   0, 24,  40,   0,   0,   0 },
+        {   0,   0,   0, 20,  44,   0,   0,   0 },
+        {   0,   0,   0, 16,  48,   0,   0,   0 },
+        {   0,   0,   0, 12,  52,   0,   0,   0 },
+        {   0,   0,   0,  8,  56,   0,   0,   0 },
+        {   0,   0,   0,  4,  60,   0,   0,   0 }
+#endif
     }
 };
 
@@ -636,39 +657,39 @@ const int8_t ALIGN(dav1d_mc_warp_filter[193][8], 8) = {
     W( 0, 0, 0,   0,   2, 127, - 1, 0 ),
 };
 
-const int16_t dav1d_resize_filter[64][8] = {
-    {  0, 0,   0, 128,   0,   0, 0,  0 }, {  0, 0,  -1, 128,   2,  -1, 0,  0 },
-    {  0, 1,  -3, 127,   4,  -2, 1,  0 }, {  0, 1,  -4, 127,   6,  -3, 1,  0 },
-    {  0, 2,  -6, 126,   8,  -3, 1,  0 }, {  0, 2,  -7, 125,  11,  -4, 1,  0 },
-    { -1, 2,  -8, 125,  13,  -5, 2,  0 }, { -1, 3,  -9, 124,  15,  -6, 2,  0 },
-    { -1, 3, -10, 123,  18,  -6, 2, -1 }, { -1, 3, -11, 122,  20,  -7, 3, -1 },
-    { -1, 4, -12, 121,  22,  -8, 3, -1 }, { -1, 4, -13, 120,  25,  -9, 3, -1 },
-    { -1, 4, -14, 118,  28,  -9, 3, -1 }, { -1, 4, -15, 117,  30, -10, 4, -1 },
-    { -1, 5, -16, 116,  32, -11, 4, -1 }, { -1, 5, -16, 114,  35, -12, 4, -1 },
-    { -1, 5, -17, 112,  38, -12, 4, -1 }, { -1, 5, -18, 111,  40, -13, 5, -1 },
-    { -1, 5, -18, 109,  43, -14, 5, -1 }, { -1, 6, -19, 107,  45, -14, 5, -1 },
-    { -1, 6, -19, 105,  48, -15, 5, -1 }, { -1, 6, -19, 103,  51, -16, 5, -1 },
-    { -1, 6, -20, 101,  53, -16, 6, -1 }, { -1, 6, -20,  99,  56, -17, 6, -1 },
-    { -1, 6, -20,  97,  58, -17, 6, -1 }, { -1, 6, -20,  95,  61, -18, 6, -1 },
-    { -2, 7, -20,  93,  64, -18, 6, -2 }, { -2, 7, -20,  91,  66, -19, 6, -1 },
-    { -2, 7, -20,  88,  69, -19, 6, -1 }, { -2, 7, -20,  86,  71, -19, 6, -1 },
-    { -2, 7, -20,  84,  74, -20, 7, -2 }, { -2, 7, -20,  81,  76, -20, 7, -1 },
-    { -2, 7, -20,  79,  79, -20, 7, -2 }, { -1, 7, -20,  76,  81, -20, 7, -2 },
-    { -2, 7, -20,  74,  84, -20, 7, -2 }, { -1, 6, -19,  71,  86, -20, 7, -2 },
-    { -1, 6, -19,  69,  88, -20, 7, -2 }, { -1, 6, -19,  66,  91, -20, 7, -2 },
-    { -2, 6, -18,  64,  93, -20, 7, -2 }, { -1, 6, -18,  61,  95, -20, 6, -1 },
-    { -1, 6, -17,  58,  97, -20, 6, -1 }, { -1, 6, -17,  56,  99, -20, 6, -1 },
-    { -1, 6, -16,  53, 101, -20, 6, -1 }, { -1, 5, -16,  51, 103, -19, 6, -1 },
-    { -1, 5, -15,  48, 105, -19, 6, -1 }, { -1, 5, -14,  45, 107, -19, 6, -1 },
-    { -1, 5, -14,  43, 109, -18, 5, -1 }, { -1, 5, -13,  40, 111, -18, 5, -1 },
-    { -1, 4, -12,  38, 112, -17, 5, -1 }, { -1, 4, -12,  35, 114, -16, 5, -1 },
-    { -1, 4, -11,  32, 116, -16, 5, -1 }, { -1, 4, -10,  30, 117, -15, 4, -1 },
-    { -1, 3,  -9,  28, 118, -14, 4, -1 }, { -1, 3,  -9,  25, 120, -13, 4, -1 },
-    { -1, 3,  -8,  22, 121, -12, 4, -1 }, { -1, 3,  -7,  20, 122, -11, 3, -1 },
-    { -1, 2,  -6,  18, 123, -10, 3, -1 }, {  0, 2,  -6,  15, 124,  -9, 3, -1 },
-    {  0, 2,  -5,  13, 125,  -8, 2, -1 }, {  0, 1,  -4,  11, 125,  -7, 2,  0 },
-    {  0, 1,  -3,   8, 126,  -6, 2,  0 }, {  0, 1,  -3,   6, 127,  -4, 1,  0 },
-    {  0, 1,  -2,   4, 127,  -3, 1,  0 }, {  0, 0,  -1,   2, 128,  -1, 0,  0 },
+const int8_t ALIGN(dav1d_resize_filter[64][8], 8) = {
+    { 0,  0,  0, -128,    0,  0,  0, 0 }, { 0,  0,  1, -128,   -2,  1,  0, 0 },
+    { 0, -1,  3, -127,   -4,  2, -1, 0 }, { 0, -1,  4, -127,   -6,  3, -1, 0 },
+    { 0, -2,  6, -126,   -8,  3, -1, 0 }, { 0, -2,  7, -125,  -11,  4, -1, 0 },
+    { 1, -2,  8, -125,  -13,  5, -2, 0 }, { 1, -3,  9, -124,  -15,  6, -2, 0 },
+    { 1, -3, 10, -123,  -18,  6, -2, 1 }, { 1, -3, 11, -122,  -20,  7, -3, 1 },
+    { 1, -4, 12, -121,  -22,  8, -3, 1 }, { 1, -4, 13, -120,  -25,  9, -3, 1 },
+    { 1, -4, 14, -118,  -28,  9, -3, 1 }, { 1, -4, 15, -117,  -30, 10, -4, 1 },
+    { 1, -5, 16, -116,  -32, 11, -4, 1 }, { 1, -5, 16, -114,  -35, 12, -4, 1 },
+    { 1, -5, 17, -112,  -38, 12, -4, 1 }, { 1, -5, 18, -111,  -40, 13, -5, 1 },
+    { 1, -5, 18, -109,  -43, 14, -5, 1 }, { 1, -6, 19, -107,  -45, 14, -5, 1 },
+    { 1, -6, 19, -105,  -48, 15, -5, 1 }, { 1, -6, 19, -103,  -51, 16, -5, 1 },
+    { 1, -6, 20, -101,  -53, 16, -6, 1 }, { 1, -6, 20,  -99,  -56, 17, -6, 1 },
+    { 1, -6, 20,  -97,  -58, 17, -6, 1 }, { 1, -6, 20,  -95,  -61, 18, -6, 1 },
+    { 2, -7, 20,  -93,  -64, 18, -6, 2 }, { 2, -7, 20,  -91,  -66, 19, -6, 1 },
+    { 2, -7, 20,  -88,  -69, 19, -6, 1 }, { 2, -7, 20,  -86,  -71, 19, -6, 1 },
+    { 2, -7, 20,  -84,  -74, 20, -7, 2 }, { 2, -7, 20,  -81,  -76, 20, -7, 1 },
+    { 2, -7, 20,  -79,  -79, 20, -7, 2 }, { 1, -7, 20,  -76,  -81, 20, -7, 2 },
+    { 2, -7, 20,  -74,  -84, 20, -7, 2 }, { 1, -6, 19,  -71,  -86, 20, -7, 2 },
+    { 1, -6, 19,  -69,  -88, 20, -7, 2 }, { 1, -6, 19,  -66,  -91, 20, -7, 2 },
+    { 2, -6, 18,  -64,  -93, 20, -7, 2 }, { 1, -6, 18,  -61,  -95, 20, -6, 1 },
+    { 1, -6, 17,  -58,  -97, 20, -6, 1 }, { 1, -6, 17,  -56,  -99, 20, -6, 1 },
+    { 1, -6, 16,  -53, -101, 20, -6, 1 }, { 1, -5, 16,  -51, -103, 19, -6, 1 },
+    { 1, -5, 15,  -48, -105, 19, -6, 1 }, { 1, -5, 14,  -45, -107, 19, -6, 1 },
+    { 1, -5, 14,  -43, -109, 18, -5, 1 }, { 1, -5, 13,  -40, -111, 18, -5, 1 },
+    { 1, -4, 12,  -38, -112, 17, -5, 1 }, { 1, -4, 12,  -35, -114, 16, -5, 1 },
+    { 1, -4, 11,  -32, -116, 16, -5, 1 }, { 1, -4, 10,  -30, -117, 15, -4, 1 },
+    { 1, -3,  9,  -28, -118, 14, -4, 1 }, { 1, -3,  9,  -25, -120, 13, -4, 1 },
+    { 1, -3,  8,  -22, -121, 12, -4, 1 }, { 1, -3,  7,  -20, -122, 11, -3, 1 },
+    { 1, -2,  6,  -18, -123, 10, -3, 1 }, { 0, -2,  6,  -15, -124,  9, -3, 1 },
+    { 0, -2,  5,  -13, -125,  8, -2, 1 }, { 0, -1,  4,  -11, -125,  7, -2, 0 },
+    { 0, -1,  3,   -8, -126,  6, -2, 0 }, { 0, -1,  3,   -6, -127,  4, -1, 0 },
+    { 0, -1,  2,   -4, -127,  3, -1, 0 }, { 0,  0,  1,   -2, -128,  1,  0, 0 },
 };
 
 const uint8_t dav1d_sm_weights[128] = {
diff --git a/ffmpeg/JNI/dav1d/src/tables.h b/ffmpeg/JNI/dav1d/src/tables.h
index 6f8dfd0e1..abcf26592 100644
--- a/ffmpeg/JNI/dav1d/src/tables.h
+++ b/ffmpeg/JNI/dav1d/src/tables.h
@@ -110,9 +110,9 @@ extern const int8_t dav1d_cdef_directions[12][2];
 extern const int16_t dav1d_sgr_params[16][4];
 extern const uint8_t dav1d_sgr_x_by_x[256];
 
-extern const int8_t dav1d_mc_subpel_filters[5][15][8];
+extern const int8_t dav1d_mc_subpel_filters[5+ARCH_X86_64][15][8];
 extern const int8_t dav1d_mc_warp_filter[193][8];
-extern const int16_t dav1d_resize_filter[64][8];
+extern const int8_t dav1d_resize_filter[64][8];
 
 extern const uint8_t dav1d_sm_weights[128];
 extern const uint16_t dav1d_dr_intra_derivative[44];
diff --git a/ffmpeg/JNI/dav1d/src/thread_task.c b/ffmpeg/JNI/dav1d/src/thread_task.c
index e05a18684..6c1c13907 100644
--- a/ffmpeg/JNI/dav1d/src/thread_task.c
+++ b/ffmpeg/JNI/dav1d/src/thread_task.c
@@ -42,8 +42,7 @@ void *dav1d_frame_task(void *const data) {
         if (f->frame_thread.die) break;
         pthread_mutex_unlock(&f->frame_thread.td.lock);
 
-        const int res = dav1d_decode_frame(f);
-        if (res)
+        if (dav1d_decode_frame(f))
             memset(f->frame_thread.cf, 0,
                    (size_t)f->frame_thread.cf_sz * 128 * 128 / 2);
 
@@ -92,8 +91,8 @@ void *dav1d_tile_task(void *const data) {
             for (t->by = ts->tiling.row_start; t->by < ts->tiling.row_end;
                  t->by += f->sb_step)
             {
-                int error = dav1d_decode_tile_sbrow(t);
-                int progress = error ? TILE_ERROR : 1 + (t->by >> f->sb_shift);
+                const int error = dav1d_decode_tile_sbrow(t);
+                const int progress = error ? TILE_ERROR : 1 + (t->by >> f->sb_shift);
 
                 // signal progress
                 pthread_mutex_lock(&ts->tile_thread.lock);
@@ -128,7 +127,7 @@ void *dav1d_tile_task(void *const data) {
             // waiting for the post-filter to complete
             t->ts = ts;
             t->by = sby << f->sb_shift;
-            int error = dav1d_decode_tile_sbrow(t);
+            const int error = dav1d_decode_tile_sbrow(t);
             progress = error ? TILE_ERROR : 1 + sby;
 
             // signal progress
diff --git a/ffmpeg/JNI/dav1d/src/wedge.c b/ffmpeg/JNI/dav1d/src/wedge.c
index 2c292836e..6b14e9a44 100644
--- a/ffmpeg/JNI/dav1d/src/wedge.c
+++ b/ffmpeg/JNI/dav1d/src/wedge.c
@@ -83,39 +83,39 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = {
     { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
 };
 
-static uint8_t ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 32);
-static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 32);
-static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 *  8], 32);
-static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 32);
-static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 32);
-static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 *  8], 32);
-static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 *  8 * 32], 32);
-static uint8_t ALIGN(wedge_masks_444_8x16 [2 * 16 *  8 * 16], 32);
-static uint8_t ALIGN(wedge_masks_444_8x8  [2 * 16 *  8 *  8], 32);
-
-static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 32);
-static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 32);
-static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 *  8], 32);
-static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 *  8 * 32], 32);
-static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 *  8 * 16], 32);
-static uint8_t ALIGN(wedge_masks_422_8x8  [2 * 16 *  8 *  8], 32);
-static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 *  4 * 32], 32);
-static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 *  4 * 16], 32);
+static uint8_t ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 64);
+static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 64);
+static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 *  8], 64);
+static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 64);
+static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 64);
+static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 *  8], 64);
+static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 *  8 * 32], 64);
+static uint8_t ALIGN(wedge_masks_444_8x16 [2 * 16 *  8 * 16], 64);
+static uint8_t ALIGN(wedge_masks_444_8x8  [2 * 16 *  8 *  8], 64);
+
+static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 64);
+static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 64);
+static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 *  8], 64);
+static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 *  8 * 32], 64);
+static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 *  8 * 16], 64);
+static uint8_t ALIGN(wedge_masks_422_8x8  [2 * 16 *  8 *  8], 64);
+static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 *  4 * 32], 64);
+static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 *  4 * 16], 64);
 static uint8_t ALIGN(wedge_masks_422_4x8  [2 * 16 *  4 *  8], 32);
 
-static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 32);
-static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 *  8], 32);
-static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 *  4], 32);
-static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 *  8 * 16], 32);
-static uint8_t ALIGN(wedge_masks_420_8x8  [2 * 16 *  8 *  8], 32);
-static uint8_t ALIGN(wedge_masks_420_8x4  [2 * 16 *  8 *  4], 32);
-static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 *  4 * 16], 32);
+static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 64);
+static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 *  8], 64);
+static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 *  4], 64);
+static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 *  8 * 16], 64);
+static uint8_t ALIGN(wedge_masks_420_8x8  [2 * 16 *  8 *  8], 64);
+static uint8_t ALIGN(wedge_masks_420_8x4  [2 * 16 *  8 *  4], 64);
+static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 *  4 * 16], 64);
 static uint8_t ALIGN(wedge_masks_420_4x8  [2 * 16 *  4 *  8], 32);
-static uint8_t ALIGN(wedge_masks_420_4x4  [2 * 16 *  4 *  4], 32);
+static uint8_t ALIGN(wedge_masks_420_4x4  [2 * 16 *  4 *  4], 16);
 
 const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3][2][16];
 
-static void insert_border(uint8_t *const dst, const uint8_t *src,
+static void insert_border(uint8_t *const dst, const uint8_t *const src,
                           const int ctr)
 {
     if (ctr > 4) memset(dst, 0, ctr - 4);
@@ -156,7 +156,8 @@ static void copy2d(uint8_t *dst, const uint8_t *src,
 }
 
 static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma,
-                        const int sign, const int w, const int h, const int ss_ver)
+                             const int sign, const int w, const int h,
+                             const int ss_ver)
 {
     for (int y = 0; y < h; y += 1 + ss_ver) {
         for (int x = 0; x < w; x += 2) {
@@ -273,16 +274,16 @@ COLD void dav1d_init_wedge_masks(void) {
 }
 
 #define N_II_PRED_MODES (N_INTER_INTRA_PRED_MODES - 1)
-static uint8_t ALIGN(ii_dc_mask[32 * 32], 32);
-static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 32);
-static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 32);
-static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 32);
-static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 32);
-static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 32);
-static uint8_t ALIGN(ii_nondc_mask_8x8  [N_II_PRED_MODES][ 8 *  8], 32);
-static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 32);
+static uint8_t ALIGN(ii_dc_mask[32 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 64);
+static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 64);
+static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 64);
+static uint8_t ALIGN(ii_nondc_mask_8x8  [N_II_PRED_MODES][ 8 *  8], 64);
+static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 64);
 static uint8_t ALIGN(ii_nondc_mask_4x8  [N_II_PRED_MODES][ 4 *  8], 32);
-static uint8_t ALIGN(ii_nondc_mask_4x4  [N_II_PRED_MODES][ 4 *  4], 32);
+static uint8_t ALIGN(ii_nondc_mask_4x4  [N_II_PRED_MODES][ 4 *  4], 16);
 #undef N_II_PRED_MODES
 
 #define set1(sz) \
diff --git a/ffmpeg/JNI/dav1d/src/x86/cdef.asm b/ffmpeg/JNI/dav1d/src/x86/cdef_avx2.asm
similarity index 88%
rename from ffmpeg/JNI/dav1d/src/x86/cdef.asm
rename to ffmpeg/JNI/dav1d/src/x86/cdef_avx2.asm
index cd632b133..643caa0cf 100644
--- a/ffmpeg/JNI/dav1d/src/x86/cdef.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/cdef_avx2.asm
@@ -27,22 +27,6 @@
 
 %if ARCH_X86_64
 
-%macro DUP4 1-*
-    %rep %0
-        times 4 db %1
-        %rotate 1
-    %endrep
-%endmacro
-
-%macro DIRS 16 ; cdef_directions[]
-    %rep 4 + 16 + 4 ; 6 7   0 1 2 3 4 5 6 7   0 1
-        ; masking away unused bits allows us to use a single vpaddd {1to16}
-        ; instruction instead of having to do vpbroadcastd + paddb
-        db %13 & 0x3f, -%13 & 0x3f
-        %rotate 1
-    %endrep
-%endmacro
-
 %macro JMP_TABLE 2-*
  %xdefine %1_jmptable %%table
  %xdefine %%base mangle(private_prefix %+ _%1_avx2)
@@ -61,30 +45,9 @@ JMP_TABLE cdef_filter_%1, \
     d0k0, d0k1, d1k0, d1k1
 %endmacro
 
-SECTION_RODATA 64
-
-lut_perm_4x4:  db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
-               db 16, 17,  0,  1,  2,  3,  4,  5, 18, 19,  8,  9, 10, 11, 12, 13
-               db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
-               db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
-edge_mask:     dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
-               dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
-               dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
-               dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
-               dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
-               dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
-               dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
-               dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
-px_idx:      DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
-cdef_dirs:   DIRS -7,-14,  1, -6,  1,  2,  1, 10,  9, 18,  8, 17,  8, 16,  8, 15
-gf_shr:        dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
-               dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
-               dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
-               dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
-end_perm:      db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
-pri_tap:       db 64, 64, 32, 32, 48, 48, 48, 48         ; left-shifted by 4
-sec_tap:       db 32, 32, 16, 16
-pd_268435568:  dd 268435568
+SECTION_RODATA 32
+
+pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6
 blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
                dd 0x80, 0x00, 0x00
 blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
@@ -96,7 +59,6 @@ blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
                dd 0x0000, 0x0000
 blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
 blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
-pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6
 div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
 shufw_6543210x:db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
 shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
@@ -497,14 +459,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
     movifnidn     prid, prim
     sub       dampingd, 31
     movifnidn  secdmpd, secdmpm
-    or            prid, 0
+    test          prid, prid
     jz .sec_only
     movd           xm0, prid
     lzcnt      pridmpd, prid
     add        pridmpd, dampingd
     cmovs      pridmpd, zerod
     mov        [rsp+0], pridmpq                 ; pri_shift
-    or         secdmpd, 0
+    test       secdmpd, secdmpd
     jz .pri_only
     movd           xm1, secdmpd
     lzcnt      secdmpd, secdmpd
@@ -725,7 +687,7 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
     movu           xm9, [dstq+strideq*1-1]
     vinserti128     m5, [dstq+strideq*2-1], 1
     vinserti128     m9, [dstq+stride3q -1], 1
-    mova           m10, [blend_8x8_0+16]
+    movu           m10, [blend_8x8_0+16]
     punpcklqdq      m6, m5, m9
     vpblendvb       m6, [rsp+gprsize+80+hq*8+64], m10
     psrldq          m5, 2
@@ -1506,14 +1468,14 @@ cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
     movifnidn     prid, prim
     sub       dampingd, 31
     movifnidn  secdmpd, secdmpm
-    or            prid, 0
+    test          prid, prid
     jz .border_sec_only
     movd           xm0, prid
     lzcnt      pridmpd, prid
     add        pridmpd, dampingd
     cmovs      pridmpd, zerod
     mov        [rsp+0], pridmpq                 ; pri_shift
-    or         secdmpd, 0
+    test       secdmpd, secdmpd
     jz .border_pri_only
     movd           xm1, secdmpd
     lzcnt      secdmpd, secdmpd
@@ -1833,169 +1795,4 @@ cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
     movd        [varq], xm2
     RET
 
-%if WIN64
-DECLARE_REG_TMP 5, 6
-%else
-DECLARE_REG_TMP 8, 5
-%endif
-
-; lut:
-; t0 t1 t2 t3 t4 t5 t6 t7
-; T0 T1 T2 T3 T4 T5 T6 T7
-; L0 L1 00 01 02 03 04 05
-; L2 L3 10 11 12 13 14 15
-; L4 L5 20 21 22 23 24 25
-; L6 L7 30 31 32 33 34 35
-; 4e 4f 40 41 42 43 44 45
-; 5e 5f 50 51 52 53 54 55
-
-INIT_ZMM avx512icl
-cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge
-%define base r7-edge_mask
-    movq         xmm0, [dstq+strideq*0]
-    movhps       xmm0, [dstq+strideq*1]
-    lea            r7, [edge_mask]
-    movq         xmm1, [topq+strideq*0-2]
-    movhps       xmm1, [topq+strideq*1-2]
-    mov           r6d, edgem
-    vinserti32x4  ym0, ymm0, [leftq], 1
-    lea            r2, [strideq*3]
-    vinserti32x4  ym1, ymm1, [dstq+strideq*2], 1
-    mova           m5, [base+lut_perm_4x4]
-    vinserti32x4   m0, [dstq+r2], 2
-    test          r6b, 0x08      ; avoid buffer overread
-    jz .main
-    lea            r3, [dstq+strideq*4-4]
-    vinserti32x4   m1, [r3+strideq*0], 2
-    vinserti32x4   m0, [r3+strideq*1], 3
-.main:
-    movifnidn    prid, prim
-    mov           t0d, dirm
-    mova           m3, [base+px_idx]
-    mov           r3d, dampingm
-    vpermi2b       m5, m0, m1    ; lut
-    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
-    pxor           m7, m7
-    lea            r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
-    vpermb         m6, m3, m5    ; px
-    cmp           r6d, 0x0f
-    jne .mask_edges              ; mask edges only if required
-    test         prid, prid
-    jz .sec_only
-    vpaddd         m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
-    vpermb         m1, m1, m5    ; k0p0 k0p1 k1p0 k1p1
-%macro CDEF_FILTER_4x4_PRI 0
-    vpcmpub        k1, m6, m1, 6 ; px > pN
-    psubb          m2, m1, m6
-    lzcnt         r6d, prid
-    vpsubb     m2{k1}, m6, m1    ; abs(diff)
-    vpbroadcastb   m4, prim
-    and          prid, 1
-    vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
-    movifnidn     t1d, secm
-    vpbroadcastd  m10, [base+pri_tap+priq*4]
-    vpsubb    m10{k1}, m7, m10   ; apply_sign(pri_tap)
-    psubusb        m4, m9        ; imax(0, pri_strength - (abs(diff) >> shift)))
-    pminub         m2, m4
-    vpdpbusd       m0, m2, m10   ; sum
-%endmacro
-    CDEF_FILTER_4x4_PRI
-    test          t1d, t1d       ; sec
-    jz .end_no_clip
-    call .sec
-.end_clip:
-    pminub         m4, m6, m1
-    pmaxub         m1, m6
-    pminub         m5, m2, m3
-    pmaxub         m2, m3
-    pminub         m4, m5
-    pmaxub         m2, m1
-    psrldq         m1, m4, 2
-    psrldq         m3, m2, 2
-    pminub         m1, m4
-    vpcmpw         k1, m0, m7, 1
-    vpshldd        m6, m0, 8
-    pmaxub         m2, m3
-    pslldq         m3, m1, 1
-    psubw          m7, m0
-    paddusw        m0, m6     ; clip >0xff
-    vpsubusw   m0{k1}, m6, m7 ; clip <0x00
-    pslldq         m4, m2, 1
-    pminub         m1, m3
-    pmaxub         m2, m4
-    pmaxub         m0, m1
-    pminub         m0, m2
-    jmp .end
-.sec_only:
-    movifnidn     t1d, secm
-    call .sec
-.end_no_clip:
-    vpshldd        m6, m0, 8  ; (px << 8) + ((sum > -8) << 4)
-    paddw          m0, m6     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
-.end:
-    mova          xm1, [base+end_perm]
-    vpermb         m0, m1, m0 ; output in bits 8-15 of each dword
-    movd   [dstq+strideq*0], xm0
-    pextrd [dstq+strideq*1], xm0, 1
-    pextrd [dstq+strideq*2], xm0, 2
-    pextrd [dstq+r2       ], xm0, 3
-    RET
-.mask_edges_sec_only:
-    movifnidn     t1d, secm
-    call .mask_edges_sec
-    jmp .end_no_clip
-ALIGN function_align
-.mask_edges:
-    vpbroadcastq   m8, [base+edge_mask+r6*8]
-    test         prid, prid
-    jz .mask_edges_sec_only
-    vpaddd         m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
-    vpshufbitqmb   k1, m8, m2 ; index in-range
-    mova           m1, m6
-    vpermb     m1{k1}, m2, m5
-    CDEF_FILTER_4x4_PRI
-    test          t1d, t1d
-    jz .end_no_clip
-    call .mask_edges_sec
-    jmp .end_clip
-.mask_edges_sec:
-    vpaddd         m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
-    vpaddd         m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
-    vpshufbitqmb   k1, m8, m4
-    mova           m2, m6
-    vpermb     m2{k1}, m4, m5
-    vpshufbitqmb   k1, m8, m9
-    mova           m3, m6
-    vpermb     m3{k1}, m9, m5
-    jmp .sec_main
-ALIGN function_align
-.sec:
-    vpaddd         m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
-    vpaddd         m3,     [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
-    vpermb         m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
-    vpermb         m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
-.sec_main:
-    vpbroadcastd   m8, [base+sec_tap]
-    vpcmpub        k1, m6, m2, 6
-    psubb          m4, m2, m6
-    vpbroadcastb  m12, t1d
-    lzcnt         t1d, t1d
-    vpsubb     m4{k1}, m6, m2
-    vpcmpub        k2, m6, m3, 6
-    vpbroadcastq  m11, [r3+t1*8]
-    gf2p8affineqb m10, m4, m11, 0
-    psubb          m5, m3, m6
-    mova           m9, m8
-    vpsubb     m8{k1}, m7, m8
-    psubusb       m10, m12, m10
-    vpsubb     m5{k2}, m6, m3
-    pminub         m4, m10
-    vpdpbusd       m0, m4, m8
-    gf2p8affineqb m11, m5, m11, 0
-    vpsubb     m9{k2}, m7, m9
-    psubusb       m12, m11
-    pminub         m5, m12
-    vpdpbusd       m0, m5, m9
-    ret
-
 %endif ; ARCH_X86_64
diff --git a/ffmpeg/JNI/dav1d/src/x86/cdef_avx512.asm b/ffmpeg/JNI/dav1d/src/x86/cdef_avx512.asm
new file mode 100644
index 000000000..e7eee9ebf
--- /dev/null
+++ b/ffmpeg/JNI/dav1d/src/x86/cdef_avx512.asm
@@ -0,0 +1,867 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+;    list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+;    this list of conditions and the following disclaimer in the documentation
+;    and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if HAVE_AVX512ICL && ARCH_X86_64
+
+%macro DUP4 1-* ; emit every argument as four consecutive identical bytes
+    %rep %0 ; one iteration per argument
+        times 4 db %1
+        %rotate 1 ; bring the next argument into %1
+    %endrep
+%endmacro
+
+%macro DIRS 16 ; cdef_directions[]
+    %rep 4 + 16 + 4 ; 6 7   0 1 2 3 4 5 6 7   0 1
+        ; masking away unused bits allows us to use a single vpaddd {1to16}
+        ; instruction instead of having to do vpbroadcastd + paddb
+        db %13 & 0x3f, -%13 & 0x3f ; the offset and its negation, each kept to 6 bits
+        %rotate 1 ; bring the next argument into %13
+    %endrep
+%endmacro
+
+SECTION_RODATA 64
+
+lut_perm_4x4:  db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+               db 16, 17,  0,  1,  2,  3,  4,  5, 18, 19,  8,  9, 10, 11, 12, 13
+               db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
+               db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
+lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+              db  96, 97,  0,  1,  2,  3,  4,  5, 98, 99,  8,  9, 10, 11, 12, 13
+lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
+              db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
+              db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
+               db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
+pd_01234567:   dd  0,  1,  2,  3,  4,  5,  6,  7
+lut_perm_8x8a: db  0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
+               db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55
+               db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87
+               db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119
+lut_perm_8x8b: db  4,  5,  6,  7,  8,  9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27
+               db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
+               db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91
+              db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123
+edge_mask:     dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
+               dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
+               dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
+               dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
+               dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
+               dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
+               dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
+               dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
+px_idx:      DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
+cdef_dirs:   DIRS -7,-14,  1, -6,  1,  2,  1, 10,  9, 18,  8, 17,  8, 16,  8, 15
+gf_shr:        dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
+               dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
+               dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
+               dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
+      times 16 db  0 ; realign (introduced by cdef_dirs)
+end_perm_w8clip:db 0, 4,  8, 12,  2,  6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
+               db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
+               db  1,  5,  9, 13,  3,  7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
+               db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
+end_perm:      db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+               db  3,  7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+pri_tap:       db 64, 64, 32, 32, 48, 48, 48, 48         ; left-shifted by 4
+sec_tap:       db 32, 32, 16, 16
+pd_268435568:  dd 268435568 ; (1 << 28) + (7 << 4), loaded as the initial value of the sum registers
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 5, 6
+%else
+DECLARE_REG_TMP 8, 5
+%endif
+
+; lut:
+; t0 t1 t2 t3 t4 t5 t6 t7
+; T0 T1 T2 T3 T4 T5 T6 T7
+; L0 L1 00 01 02 03 04 05
+; L2 L3 10 11 12 13 14 15
+; L4 L5 20 21 22 23 24 25
+; L6 L7 30 31 32 33 34 35
+; 4e 4f 40 41 42 43 44 45
+; 5e 5f 50 51 52 53 54 55
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge ; filters one 4x4 block in place
+%define base r7-edge_mask
+    movq         xmm0, [dstq+strideq*0]   ; dst row 0
+    movhps       xmm0, [dstq+strideq*1]   ; dst row 1
+    lea            r7, [edge_mask]        ; r7 anchors the base-relative addressing
+    movq         xmm1, [topq+strideq*0-2] ; top row 0, from 2 bytes left of the block
+    movhps       xmm1, [topq+strideq*1-2] ; top row 1
+    mov           r6d, edgem              ; r6 = edge flags
+    vinserti32x4  ym0, ymm0, [leftq], 1   ; 16 bytes from left into lane 1
+    lea            r2, [strideq*3]        ; r2 = stride * 3
+    vinserti32x4  ym1, ymm1, [dstq+strideq*2], 1 ; dst row 2
+    mova           m5, [base+lut_perm_4x4] ; byte permutation that assembles the lut
+    vinserti32x4   m0, [dstq+r2], 2       ; dst row 3
+    test          r6b, 0x08      ; avoid buffer overread
+    jz .main                              ; bit 3 clear: do not read the rows below
+    lea            r3, [dstq+strideq*4-4] ; r3 = 4 bytes left of row 4
+    vinserti32x4   m1, [r3+strideq*0], 2  ; row 4
+    vinserti32x4   m0, [r3+strideq*1], 3  ; row 5
+.main:
+    movifnidn    prid, prim
+    mov           t0d, dirm               ; t0 = dir
+    mova           m3, [base+px_idx]      ; lut indices of the block pixels
+    mov           r3d, dampingm           ; r3 = damping
+    vpermi2b       m5, m0, m1    ; lut
+    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+    pxor           m7, m7                 ; m7 = 0
+    lea            r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
+    vpermb         m6, m3, m5    ; px
+    cmp           r6d, 0x0f               ; are all four edge bits set?
+    jne .mask_edges              ; mask edges only if required
+    test         prid, prid
+    jz .sec_only                          ; pri == 0: secondary taps only
+    vpaddd         m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+    vpermb         m1, m1, m5    ; k0p0 k0p1 k1p0 k1p1
+%macro CDEF_FILTER_4x4_PRI 0
+    vpcmpub        k1, m6, m1, 6 ; px > pN
+    psubb          m2, m1, m6    ; pN - px
+    lzcnt         r6d, prid      ; lzcnt(pri) indexes the shift table below
+    vpsubb     m2{k1}, m6, m1    ; abs(diff)
+    vpbroadcastb   m4, prid      ; pri_strength in every byte
+    and          prid, 1         ; low bit of pri selects the pri_tap entry
+    vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
+    movifnidn     t1d, secm
+    vpbroadcastd  m10, [base+pri_tap+priq*4]
+    vpsubb    m10{k1}, m7, m10   ; apply_sign(pri_tap)
+    psubusb        m4, m9        ; imax(0, pri_strength - (abs(diff) >> shift))
+    pminub         m2, m4        ; imin(abs(diff), the limit above)
+    vpdpbusd       m0, m2, m10   ; sum
+%endmacro
+    CDEF_FILTER_4x4_PRI
+    test          t1d, t1d       ; sec
+    jz .end_no_clip
+    call .sec
+.end_clip:
+    pminub         m4, m6, m1     ; min(px, primary taps)
+    pmaxub         m1, m6         ; max(px, primary taps)
+    pminub         m5, m2, m3     ; min of the two secondary tap sets
+    pmaxub         m2, m3         ; max of the two secondary tap sets
+    pminub         m4, m5
+    pmaxub         m2, m1
+    psrldq         m1, m4, 2
+    psrldq         m3, m2, 2
+    pminub         m1, m4
+    vpcmpw         k1, m0, m7, 1
+    vpshldd        m6, m0, 8
+    pmaxub         m2, m3
+    pslldq         m3, m1, 1
+    psubw          m7, m0
+    paddusw        m0, m6     ; clip >0xff
+    vpsubusw   m0{k1}, m6, m7 ; clip <0x00
+    pslldq         m4, m2, 1
+    pminub         m1, m3
+    pmaxub         m2, m4
+    pmaxub         m0, m1     ; not below the minimum gathered above
+    pminub         m0, m2     ; not above the maximum gathered above
+    jmp .end
+.sec_only:
+    movifnidn     t1d, secm
+    call .sec
+.end_no_clip:
+    vpshldd        m6, m0, 8  ; (px << 8) + ((sum > -8) << 4)
+    paddw          m0, m6     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+.end:
+    mova          xm1, [base+end_perm]
+    vpermb         m0, m1, m0 ; output in bits 8-15 of each dword
+    movd   [dstq+strideq*0], xm0      ; one dword stored per row
+    pextrd [dstq+strideq*1], xm0, 1
+    pextrd [dstq+strideq*2], xm0, 2
+    pextrd [dstq+r2       ], xm0, 3
+    RET
+.mask_edges_sec_only:
+    movifnidn     t1d, secm
+    call .mask_edges_sec
+    jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+    vpbroadcastq   m8, [base+edge_mask+r6*8] ; 64-bit mask for this edge combination
+    test         prid, prid
+    jz .mask_edges_sec_only
+    vpaddd         m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+    vpshufbitqmb   k1, m8, m2 ; index in-range
+    mova           m1, m6     ; start from px
+    vpermb     m1{k1}, m2, m5 ; replace only the in-range lanes with lut taps
+    CDEF_FILTER_4x4_PRI
+    test          t1d, t1d
+    jz .end_no_clip
+    call .mask_edges_sec
+    jmp .end_clip
+.mask_edges_sec:
+    vpaddd         m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+    vpaddd         m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+    vpshufbitqmb   k1, m8, m4
+    mova           m2, m6     ; start from px
+    vpermb     m2{k1}, m4, m5 ; in-range lanes only
+    vpshufbitqmb   k1, m8, m9
+    mova           m3, m6     ; start from px
+    vpermb     m3{k1}, m9, m5 ; in-range lanes only
+    jmp .sec_main
+ALIGN function_align
+.sec:
+    vpaddd         m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+    vpaddd         m3,     [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+    vpermb         m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
+    vpermb         m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
+.sec_main:
+    vpbroadcastd   m8, [base+sec_tap]
+    vpcmpub        k1, m6, m2, 6 ; px > first secondary tap set
+    psubb          m4, m2, m6
+    vpbroadcastb  m12, t1d       ; sec value in every byte
+    lzcnt         t1d, t1d       ; lzcnt(sec) indexes the shift table below
+    vpsubb     m4{k1}, m6, m2    ; abs(diff), first set
+    vpcmpub        k2, m6, m3, 6 ; px > second secondary tap set
+    vpbroadcastq  m11, [r3+t1*8]
+    gf2p8affineqb m10, m4, m11, 0
+    psubb          m5, m3, m6
+    mova           m9, m8
+    vpsubb     m8{k1}, m7, m8    ; negate sec_tap where k1 is set
+    psubusb       m10, m12, m10
+    vpsubb     m5{k2}, m6, m3    ; abs(diff), second set
+    pminub         m4, m10
+    vpdpbusd       m0, m4, m8    ; accumulate into the sum
+    gf2p8affineqb m11, m5, m11, 0
+    vpsubb     m9{k2}, m7, m9    ; negate sec_tap where k2 is set
+    psubusb       m12, m11
+    pminub         m5, m12
+    vpdpbusd       m0, m5, m9    ; accumulate into the sum
+    ret
+
+DECLARE_REG_TMP 2, 7
+
+;         lut top                lut bottom
+; t0 t1 t2 t3 t4 t5 t6 t7  L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7  L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05  L8 L9 40 41 42 43 44 45
+; L2 L3 10 11 12 13 14 15  La Lb 50 51 52 53 54 55
+; L4 L5 20 21 22 23 24 25  Lc Ld 60 61 62 63 64 65
+; L6 L7 30 31 32 33 34 35  Le Lf 70 71 72 73 74 75
+; L8 L9 40 41 42 43 44 45  8e 8f 80 81 82 83 84 85
+; La Lb 50 51 52 53 54 55  9e 9f 90 91 92 93 94 95
+
+cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
+                                   pri, sec, dir, damping, edge ; filters one 4x8 block in place
+%define base r8-edge_mask
+    vpbroadcastd ym21, strided          ; stride in all 8 dwords
+    mov           r6d, edgem            ; r6 = edge flags
+    lea            r8, [edge_mask]      ; r8 anchors the base-relative addressing
+    movq          xm1, [topq+strideq*0-2]
+    pmulld       ym21, [base+pd_01234567] ; ym21 = stride * {0..7}, one offset per row
+    kxnorb         k1, k1, k1           ; enable all 8 gather lanes
+    movq          xm2, [topq+strideq*1-2]
+    vpgatherdq m0{k1}, [dstq+ym21]  ; +0+1 +2+3 +4+5 +6+7
+    mova          m14, [base+lut_perm_4x8a]
+    movu          m15, [base+lut_perm_4x8b]
+    test          r6b, 0x08         ; avoid buffer overread
+    jz .main
+    lea            r7, [dstq+strideq*8-2] ; r7 = 2 bytes left of row 8
+    vinserti32x4  ym1, [r7+strideq*0], 1  ; row 8
+    vinserti32x4  ym2, [r7+strideq*1], 1  ; row 9
+.main:
+    punpcklqdq    ym1, ym2
+    vinserti32x4   m1, [leftq], 2   ; -2-1 +8+9 left ____
+    movifnidn    prid, prim
+    mov           t0d, dirm         ; t0 = dir
+    mova          m16, [base+px_idx]
+    mov           r3d, dampingm     ; r3 = damping
+    vpermi2b      m14, m0, m1    ; lut top
+    vpermi2b      m15, m0, m1    ; lut bottom
+    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+    pxor          m20, m20       ; m20 = 0
+    lea            r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+    vpermb         m2, m16, m14  ; pxt
+    vpermb         m3, m16, m15  ; pxb
+    mova           m1, m0        ; the bottom sum starts from the same constant
+    cmp           r6b, 0x0f      ; are all four edge bits set?
+    jne .mask_edges              ; mask edges only if required
+    test         prid, prid
+    jz .sec_only                 ; pri == 0: secondary taps only
+    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+    vpermb         m4, m6, m14   ; pNt k0p0 k0p1 k1p0 k1p1
+    vpermb         m5, m6, m15   ; pNb
+%macro CDEF_FILTER_4x8_PRI 0
+    vpcmpub        k1, m2, m4, 6 ; pxt > pNt
+    vpcmpub        k2, m3, m5, 6 ; pxb > pNb
+    psubb          m6, m4, m2
+    psubb          m7, m5, m3
+    lzcnt         r6d, prid      ; lzcnt(pri) indexes the shift table below
+    vpsubb     m6{k1}, m2, m4    ; abs(diff_top)
+    vpsubb     m7{k2}, m3, m5    ; abs(diff_bottom)
+    vpbroadcastb  m13, prid      ; pri_strength in every byte
+    vpbroadcastq   m9, [r3+r6*8]
+    and          prid, 1         ; low bit of pri selects the pri_tap entry
+    vpbroadcastd  m11, [base+pri_tap+priq*4]
+    vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
+    vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
+    mova          m10, m11
+    movifnidn     t1d, secm
+    vpsubb    m10{k1}, m20, m11  ; apply_sign(pri_tap_top)
+    vpsubb    m11{k2}, m20, m11  ; apply_sign(pri_tap_bottom)
+    psubusb       m12, m13, m8   ; imax(0, pri_strength - (abs(dt) >> shift))
+    psubusb       m13, m13, m9   ; imax(0, pri_strength - (abs(db) >> shift))
+    pminub         m6, m12
+    pminub         m7, m13
+    vpdpbusd       m0, m6, m10   ; sum top
+    vpdpbusd       m1, m7, m11   ; sum bottom
+%endmacro
+    CDEF_FILTER_4x8_PRI
+    test          t1d, t1d       ; sec
+    jz .end_no_clip
+    call .sec
+.end_clip:
+    pminub        m10, m4, m2    ; min(primary taps, px), top
+    pminub        m12, m6, m8    ; min of the secondary tap sets, top
+    pminub        m11, m5, m3    ; min(primary taps, px), bottom
+    pminub        m13, m7, m9    ; min of the secondary tap sets, bottom
+    pmaxub         m4, m2        ; the same four reductions with max
+    pmaxub         m6, m8
+    pmaxub         m5, m3
+    pmaxub         m7, m9
+    pminub        m10, m12
+    pminub        m11, m13
+    pmaxub         m4, m6
+    pmaxub         m5, m7
+    mov           r2d, 0xAAAAAAAA
+    kmovd          k1, r2d          ; k1 = every odd word lane
+    kxnorb         k2, k2, k2       ;   hw   lw
+    vpshrdd       m12, m0, m1, 16   ;  m1lw m0hw
+    vpshrdd        m6, m10, m11, 16 ; m11lw m10hw
+    vpshrdd        m8, m4, m5, 16   ;  m5lw m4hw
+    vpblendmw  m7{k1}, m10, m11     ; m11hw m10lw
+    vpblendmw  m9{k1}, m4, m5       ;  m5hw m4lw
+    vpblendmw  m4{k1}, m0, m12      ;  m1lw m0lw
+    vpblendmw  m5{k1}, m12, m1      ;  m1hw m0hw
+    vpshrdd        m2, m3, 16
+    pminub         m6, m7
+    pmaxub         m8, m9
+    mova         ym14, [base+end_perm]
+    vpcmpw         k1, m4, m20, 1
+    vpshldw        m2, m5, 8
+    pslldq         m7, m6, 1
+    pslldq         m9, m8, 1
+    psubw          m5, m20, m4
+    paddusw        m0, m4, m2 ; clip >0xff
+    pminub         m6, m7
+    pmaxub         m8, m9
+    psubusw    m0{k1}, m2, m5 ; clip <0x00
+    pmaxub         m0, m6     ; not below the minimum gathered above
+    pminub         m0, m8     ; not above the maximum gathered above
+    vpermb         m0, m14, m0
+    vpscatterdd [dstq+ym21]{k2}, ym0 ; one dword stored per row
+    RET
+.sec_only:
+    movifnidn     t1d, secm
+    call .sec
+.end_no_clip:
+    mova          ym4, [base+end_perm]
+    kxnorb         k1, k1, k1 ; enable all 8 scatter lanes
+    vpshldd        m2, m0, 8  ; (px << 8) + ((sum > -8) << 4)
+    vpshldd        m3, m1, 8
+    paddw          m0, m2     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+    paddw          m1, m3
+    pslld          m0, 16
+    vpshrdd        m0, m1, 16
+    vpermb         m0, m4, m0 ; output in bits 8-15 of each word
+    vpscatterdd [dstq+ym21]{k1}, ym0 ; one dword stored per row
+    RET
+.mask_edges_sec_only:
+    movifnidn     t1d, secm
+    call .mask_edges_sec
+    jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+    mov           t1d, r6d
+    or            r6d, 8 ; top 4x4 has bottom
+    or            t1d, 4 ; bottom 4x4 has top
+    vpbroadcastq  m17, [base+edge_mask+r6*8] ; mask for the top half
+    vpbroadcastq  m18, [base+edge_mask+t1*8] ; mask for the bottom half
+    test         prid, prid
+    jz .mask_edges_sec_only
+    vpaddd         m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+    vpshufbitqmb   k1, m17, m6 ; index in-range
+    vpshufbitqmb   k2, m18, m6
+    mova           m4, m2      ; start from pxt
+    mova           m5, m3      ; start from pxb
+    vpermb     m4{k1}, m6, m14 ; replace only the in-range lanes with lut taps
+    vpermb     m5{k2}, m6, m15
+    CDEF_FILTER_4x8_PRI
+    test          t1d, t1d
+    jz .end_no_clip
+    call .mask_edges_sec
+    jmp .end_clip
+.mask_edges_sec:
+    vpaddd        m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+    vpaddd        m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+    vpshufbitqmb   k1, m17, m10
+    vpshufbitqmb   k2, m18, m10
+    vpshufbitqmb   k3, m17, m11
+    vpshufbitqmb   k4, m18, m11
+    mova           m6, m2      ; start from px; in-range lanes are replaced below
+    mova           m7, m3
+    mova           m8, m2
+    mova           m9, m3
+    vpermb     m6{k1}, m10, m14
+    vpermb     m7{k2}, m10, m15
+    vpermb     m8{k3}, m11, m14
+    vpermb     m9{k4}, m11, m15
+    jmp .sec_main
+ALIGN function_align
+.sec:
+    vpaddd         m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+    vpaddd         m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+    vpermb         m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
+    vpermb         m7, m8, m15 ; pNb
+    vpermb         m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
+    vpermb         m9, m9, m15 ; pNb
+.sec_main:
+    vpbroadcastb  m18, t1d         ; sec_strength in every byte
+    lzcnt         t1d, t1d         ; lzcnt(sec) indexes the shift table below
+    vpcmpub        k1, m2, m6, 6
+    vpcmpub        k2, m3, m7, 6
+    vpcmpub        k3, m2, m8, 6
+    vpcmpub        k4, m3, m9, 6
+    vpbroadcastq  m17, [r3+t1*8]
+    psubb         m10, m6, m2
+    psubb         m11, m7, m3
+    psubb         m12, m8, m2
+    psubb         m13, m9, m3
+    vpsubb    m10{k1}, m2, m6      ; abs(dt0)
+    vpsubb    m11{k2}, m3, m7      ; abs(db0)
+    vpsubb    m12{k3}, m2, m8      ; abs(dt1)
+    vpsubb    m13{k4}, m3, m9      ; abs(db1)
+    vpbroadcastd  m19, [base+sec_tap]
+    gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
+    gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
+    gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
+    gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
+    psubusb       m14, m18, m14    ; imax(0, sec_strength - (abs(dt0) >> shift))
+    psubusb       m15, m18, m15    ; imax(0, sec_strength - (abs(db0) >> shift))
+    psubusb       m16, m18, m16    ; imax(0, sec_strength - (abs(dt1) >> shift))
+    psubusb       m17, m18, m17    ; imax(0, sec_strength - (abs(db1) >> shift))
+    pminub        m10, m14
+    pminub        m11, m15
+    pminub        m12, m16
+    pminub        m13, m17
+    mova          m14, m19
+    mova          m15, m19
+    mova          m16, m19
+    vpsubb    m14{k1}, m20, m19    ; apply_sign(sec_tap_top_0)
+    vpsubb    m15{k2}, m20, m19    ; apply_sign(sec_tap_bottom_0)
+    vpsubb    m16{k3}, m20, m19    ; apply_sign(sec_tap_top_1)
+    vpsubb    m19{k4}, m20, m19    ; apply_sign(sec_tap_bottom_1)
+    vpdpbusd       m0, m10, m14    ; accumulate into sum top
+    vpdpbusd       m1, m11, m15    ; accumulate into sum bottom
+    vpdpbusd       m0, m12, m16
+    vpdpbusd       m1, m13, m19
+    ret
+
+;         lut tl                   lut tr
+; t0 t1 t2 t3 t4 t5 t6 t7  t6 t7 t8 t9 ta tb tc td
+; T0 T1 T2 T3 T4 T5 T6 T7  T6 T7 T8 T9 TA TB TC TD
+; L0 L1 00 01 02 03 04 05  04 05 06 07 08 09 0a 0b
+; L2 L3 10 11 12 13 14 15  14 15 16 17 18 19 1a 1b
+; L4 L5 20 21 22 23 24 25  24 25 26 27 28 29 2a 2b
+; L6 L7 30 31 32 33 34 35  34 35 36 37 38 39 3a 3b
+; L8 L9 40 41 42 43 44 45  44 45 46 47 48 49 4a 4b
+; La Lb 50 51 52 53 54 55  54 55 56 57 58 59 5a 5b
+;         lut bl                   lut br
+; L4 L5 20 21 22 23 24 25  24 25 26 27 28 29 2a 2b
+; L6 L7 30 31 32 33 34 35  34 35 36 37 38 39 3a 3b
+; L8 L9 40 41 42 43 44 45  44 45 46 47 48 49 4a 4b
+; La Lb 50 51 52 53 54 55  54 55 56 57 58 59 5a 5b
+; Lc Ld 60 61 62 63 64 65  64 65 66 67 68 69 6a 6b
+; Le Lf 70 71 72 73 74 75  74 75 76 77 78 79 7a 7b
+; 8e 8f 80 81 82 83 84 85  84 85 86 87 88 89 8a 8b
+; 9e 9f 90 91 92 93 94 95  94 95 96 97 98 99 9a 9b
+
+cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
+                                          pri, sec, dir, damping, edge
+%define base r8-edge_mask
+    mov           r6d, edgem
+    lea           r10, [dstq+strideq*4-2]
+    movu         xmm0, [topq+strideq*0-2]
+    movu         xmm1, [dstq+strideq*2-2]
+    movu         xmm2, [r10 +strideq*2  ]
+    lea            r8, [edge_mask]
+    lea            r9, [strideq*3]
+    pmovzxwq      m10, [leftq-4]
+    vinserti32x4  ym0, ymm0, [topq+strideq*1-2], 1
+    vinserti32x4  ym1, ymm1, [dstq+r9       -2], 1
+    vinserti32x4  ym2, ymm2, [r10 +r9         ], 1
+    lea            r7, [r10 +strideq*4  ]
+    pmovzxwq      m11, [leftq+4]
+    vinserti32x4   m0, [dstq+strideq*0-2], 2
+    vinserti32x4   m1, [r10 +strideq*0  ], 2
+    mova          m12, [base+lut_perm_8x8a]
+    movu          m13, [base+lut_perm_8x8b]
+    vinserti32x4   m0, [dstq+strideq*1-2], 3
+    vinserti32x4   m1, [r10 +strideq*1  ], 3
+    test          r6b, 0x08       ; avoid buffer overread
+    jz .main
+    vinserti32x4   m2, [r7  +strideq*0], 2
+    vinserti32x4   m2, [r7  +strideq*1], 3
+.main:
+    mov           t1d, 0x11111100
+    mova          m14, m12
+    mova          m15, m13
+    kmovd          k1, t1d
+    kshiftrd       k2, k1, 8
+    movifnidn    prid, prim
+    mov           t0d, dirm
+    mova          m30, [base+px_idx]
+    mov           r3d, dampingm
+    vpermi2b      m12, m0, m1     ; lut tl
+    vpermi2b      m14, m1, m2     ; lut bl
+    vpermi2b      m13, m0, m1     ; lut tr
+    vpermi2b      m15, m1, m2     ; lut br
+    vpblendmw m12{k1}, m12, m10
+    vpblendmw m14{k2}, m14, m11
+    vpbroadcastd   m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+    pxor          m31, m31
+    lea            r3, [r8+r3*8]  ; gf_shr + (damping - 30) * 8
+    vpermb         m4, m30, m12   ; pxtl
+    vpermb         m5, m30, m13   ; pxtr
+    vpermb         m6, m30, m14   ; pxbl
+    vpermb         m7, m30, m15   ; pxbr
+    mova           m1, m0
+    mova           m2, m0
+    mova           m3, m0
+    cmp           r6b, 0x0f
+    jne .mask_edges               ; mask edges only if required
+    test         prid, prid
+    jz .sec_only
+    vpaddd        m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+    vpermb         m8, m11, m12   ; pNtl k0p0 k0p1 k1p0 k1p1
+    vpermb         m9, m11, m13   ; pNtr
+    vpermb        m10, m11, m14   ; pNbl
+    vpermb        m11, m11, m15   ; pNbr
+%macro CDEF_FILTER_8x8_PRI 0 ; primary taps: px in m4-7, pN in m8-11, sums in m0-3
+    vpcmpub        k1, m4, m8, 6  ; pxtl > pNtl
+    vpcmpub        k2, m5, m9, 6  ; pxtr > pNtr
+    vpcmpub        k3, m6, m10, 6 ; pxbl > pNbl
+    vpcmpub        k4, m7, m11, 6 ; pxbr > pNbr
+    psubb         m16, m8, m4     ; pNtl - pxtl
+    psubb         m17, m9, m5     ; pNtr - pxtr
+    psubb         m18, m10, m6    ; pNbl - pxbl
+    psubb         m19, m11, m7    ; pNbr - pxbr
+    lzcnt         r6d, prid       ; index of the shift matrix loaded below
+    vpsubb    m16{k1}, m4, m8     ; abs(diff_tl)
+    vpsubb    m17{k2}, m5, m9     ; abs(diff_tr)
+    vpsubb    m18{k3}, m6, m10    ; abs(diff_bl)
+    vpsubb    m19{k4}, m7, m11    ; abs(diff_br)
+    vpbroadcastq  m28, [r3+r6*8]  ; gf2p8affineqb matrix implementing ">> shift"
+    vpbroadcastb  m29, prid       ; pri_strength
+    and          prid, 1          ; pri_strength & 1 selects the pri_tap entry
+    vpbroadcastd  m27, [base+pri_tap+priq*4]
+    vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
+    vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
+    vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
+    vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift
+    mova          m24, m27
+    mova          m25, m27
+    mova          m26, m27
+    movifnidn     t1d, secm       ; callers test t1d (sec) after this macro
+    vpsubb    m24{k1}, m31, m27   ; apply_sign(pri_tap_tl)
+    vpsubb    m25{k2}, m31, m27   ; apply_sign(pri_tap_tr)
+    vpsubb    m26{k3}, m31, m27   ; apply_sign(pri_tap_bl)
+    vpsubb    m27{k4}, m31, m27   ; apply_sign(pri_tap_br)
+    psubusb       m20, m29, m20   ; imax(0, pri_strength - (abs(dtl) >> shift))
+    psubusb       m21, m29, m21   ; imax(0, pri_strength - (abs(dtr) >> shift))
+    psubusb       m22, m29, m22   ; imax(0, pri_strength - (abs(dbl) >> shift))
+    psubusb       m23, m29, m23   ; imax(0, pri_strength - (abs(dbr) >> shift))
+    pminub        m16, m20        ; imin(abs(diff_tl), imax(...))
+    pminub        m17, m21        ; imin(abs(diff_tr), imax(...))
+    pminub        m18, m22        ; imin(abs(diff_bl), imax(...))
+    pminub        m19, m23        ; imin(abs(diff_br), imax(...))
+    vpdpbusd       m0, m16, m24   ; sum tl
+    vpdpbusd       m1, m17, m25   ; sum tr
+    vpdpbusd       m2, m18, m26   ; sum bl
+    vpdpbusd       m3, m19, m27   ; sum br
+%endmacro
+    CDEF_FILTER_8x8_PRI
+    test          t1d, t1d        ; sec
+    jz .end_no_clip
+    call .sec
+.end_clip:
+    pminub        m20, m8, m4
+    pminub        m24, m12, m16
+    pminub        m21, m9, m5
+    pminub        m25, m13, m17
+    pminub        m22, m10, m6
+    pminub        m26, m14, m18
+    pminub        m23, m11, m7
+    pminub        m27, m15, m19
+    pmaxub         m8, m4
+    pmaxub        m12, m16
+    pmaxub         m9, m5
+    pmaxub        m13, m17
+    pmaxub        m10, m6
+    pmaxub        m14, m18
+    pmaxub        m11, m7
+    pmaxub        m15, m19
+    pminub        m20, m24
+    pminub        m21, m25
+    pminub        m22, m26
+    pminub        m23, m27
+    pmaxub         m8, m12
+    pmaxub         m9, m13
+    pmaxub        m10, m14
+    pmaxub        m11, m15
+    mov           r2d, 0xAAAAAAAA
+    kmovd          k1, r2d
+    vpshrdd       m24,  m0,  m1, 16
+    vpshrdd       m25,  m2,  m3, 16
+    vpshrdd       m12, m20, m21, 16
+    vpshrdd       m14, m22, m23, 16
+    vpshrdd       m16,  m8,  m9, 16
+    vpshrdd       m18, m10, m11, 16
+    vpblendmw m13{k1}, m20, m21
+    vpblendmw m15{k1}, m22, m23
+    vpblendmw m17{k1},  m8, m9
+    vpblendmw m19{k1}, m10, m11
+    vpblendmw m20{k1},  m0, m24
+    vpblendmw m21{k1}, m24, m1
+    vpblendmw m22{k1},  m2, m25
+    vpblendmw m23{k1}, m25, m3
+    vpshrdd        m4, m5, 16
+    vpshrdd        m6, m7, 16
+    pminub        m12, m13
+    pminub        m14, m15
+    pmaxub        m16, m17
+    pmaxub        m18, m19
+    mova           m8, [base+end_perm_w8clip]
+    vpcmpw         k2, m20, m31, 1
+    vpcmpw         k3, m22, m31, 1
+    vpshldw        m4, m21, 8
+    vpshldw        m6, m23, 8
+    kunpckdq       k1, k1, k1
+    kxnorb         k4, k4, k4
+    vpshrdw       m11, m12, m14, 8
+    vpshrdw       m15, m16, m18, 8
+    vpblendmb m13{k1}, m12, m14
+    vpblendmb m17{k1}, m16, m18
+    psubw         m21, m31, m20
+    psubw         m23, m31, m22
+    paddusw        m0, m20, m4  ; clip >0xff
+    paddusw        m1, m22, m6
+    pminub        m11, m13
+    pmaxub        m15, m17
+    psubusw    m0{k2}, m4, m21  ; clip <0x00
+    psubusw    m1{k3}, m6, m23
+    psrlw          m0, 8
+    vmovdqu8   m0{k1}, m1
+    pmaxub         m0, m11
+    pminub         m0, m15
+    vpermb         m0, m8, m0
+    add           r10, 2
+    vextracti32x4 xm1, m0, 1
+    vextracti32x4 xm2, m0, 2
+    vextracti32x4 xm3, m0, 3
+    movq   [dstq+strideq*0], xm0
+    movq   [dstq+strideq*2], xm1
+    movq   [r10 +strideq*0], xm2
+    movq   [r10 +strideq*2], xm3
+    movhps [dstq+strideq*1], xm0
+    movhps [dstq+r9       ], xm1
+    movhps [r10 +strideq*1], xm2
+    movhps [r10 +r9       ], xm3
+    RET
+.sec_only:
+    movifnidn     t1d, secm
+    call .sec
+.end_no_clip:
+    mova          xm8, [base+end_perm]
+    kxnorb         k1, k1, k1
+    vpshldd        m4, m0, 8  ; (px << 8) + ((sum > -8) << 4)
+    vpshldd        m5, m1, 8
+    vpshldd        m6, m2, 8
+    vpshldd        m7, m3, 8
+    paddw          m0, m4     ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+    paddw          m1, m5
+    paddw          m2, m6
+    paddw          m3, m7
+    vpermb         m0, m8, m0
+    vpermb         m1, m8, m1
+    vpermb         m2, m8, m2
+    vpermb         m3, m8, m3
+    add           r10, 2
+    punpckldq      m4, m0, m1
+    punpckhdq      m0, m1
+    punpckldq      m5, m2, m3
+    punpckhdq      m2, m3
+    movq   [dstq+strideq*0], xm4
+    movq   [dstq+strideq*2], xm0
+    movq   [r10 +strideq*0], xm5
+    movq   [r10 +strideq*2], xm2
+    movhps [dstq+strideq*1], xm4
+    movhps [dstq+r9       ], xm0
+    movhps [r10 +strideq*1], xm5
+    movhps [r10 +r9       ], xm2
+    RET
+.mask_edges_sec_only:
+    movifnidn     t1d, secm
+    call .mask_edges_sec
+    jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+    mov           t0d, r6d
+    mov           t1d, r6d
+    or            t0d, 0xA ; top-left 4x4 has bottom and right
+    or            t1d, 0x9 ; top-right 4x4 has bottom and left
+    vpbroadcastq  m26, [base+edge_mask+t0*8]
+    vpbroadcastq  m27, [base+edge_mask+t1*8]
+    mov           t1d, r6d
+    or            r6d, 0x6 ; bottom-left 4x4 has top and right
+    or            t1d, 0x5 ; bottom-right 4x4 has top and left
+    vpbroadcastq  m28, [base+edge_mask+r6*8]
+    vpbroadcastq  m29, [base+edge_mask+t1*8]
+    mov           t0d, dirm
+    test         prid, prid
+    jz .mask_edges_sec_only
+    vpaddd        m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
+    vpshufbitqmb   k1, m26, m20 ; index in-range
+    vpshufbitqmb   k2, m27, m20
+    vpshufbitqmb   k3, m28, m20
+    vpshufbitqmb   k4, m29, m20
+    mova           m8, m4
+    mova           m9, m5
+    mova          m10, m6
+    mova          m11, m7
+    vpermb     m8{k1}, m20, m12
+    vpermb     m9{k2}, m20, m13
+    vpermb    m10{k3}, m20, m14
+    vpermb    m11{k4}, m20, m15
+    mova   [rsp+0x00], m26
+    mova   [rsp+0x40], m27
+    mova   [rsp+0x80], m28
+    mova   [rsp+0xC0], m29
+    CDEF_FILTER_8x8_PRI
+    test          t1d, t1d
+    jz .end_no_clip
+    mova          m26, [rsp+0x00]
+    mova          m27, [rsp+0x40]
+    mova          m28, [rsp+0x80]
+    mova          m29, [rsp+0xC0]
+    call .mask_edges_sec
+    jmp .end_clip
+.mask_edges_sec:
+    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
+    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
+    vpshufbitqmb   k1, m26, m20
+    vpshufbitqmb   k2, m27, m20
+    vpshufbitqmb   k3, m28, m20
+    vpshufbitqmb   k4, m29, m20
+    mova          m16, m4
+    mova          m17, m5
+    mova          m18, m6
+    mova          m19, m7
+    vpermb    m16{k1}, m20, m12
+    vpermb    m17{k2}, m20, m13
+    vpermb    m18{k3}, m20, m14
+    vpermb    m19{k4}, m20, m15
+    vpshufbitqmb   k1, m26, m21
+    vpshufbitqmb   k2, m27, m21
+    vpshufbitqmb   k3, m28, m21
+    vpshufbitqmb   k4, m29, m21
+    vpermb        m12, m21, m12
+    vpermb        m13, m21, m13
+    vpermb        m14, m21, m14
+    vpermb        m15, m21, m15
+    vpblendmb m12{k1}, m4, m12
+    vpblendmb m13{k2}, m5, m13
+    vpblendmb m14{k3}, m6, m14
+    vpblendmb m15{k4}, m7, m15
+    jmp .sec_main
+ALIGN function_align
+.sec:
+    vpaddd        m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+    vpaddd        m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+    vpermb        m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
+    vpermb        m17, m20, m13 ; pNtr
+    vpermb        m18, m20, m14 ; pNbl
+    vpermb        m19, m20, m15 ; pNbr
+    vpermb        m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
+    vpermb        m13, m21, m13 ; pNtr
+    vpermb        m14, m21, m14 ; pNbl
+    vpermb        m15, m21, m15 ; pNbr
+.sec_main:
+%macro CDEF_FILTER_8x8_SEC 4-5 0 ; pNtl, pNtr, pNbl, pNbr, load constants
+    vpcmpub        k1, m4, %1, 6  ; pxtl > pNtl
+    vpcmpub        k2, m5, %2, 6  ; pxtr > pNtr
+    vpcmpub        k3, m6, %3, 6  ; pxbl > pNbl
+    vpcmpub        k4, m7, %4, 6  ; pxbr > pNbr
+    psubb         m20, %1, m4     ; pNtl - pxtl
+    psubb         m21, %2, m5     ; pNtr - pxtr
+    psubb         m22, %3, m6     ; pNbl - pxbl
+    psubb         m23, %4, m7     ; pNbr - pxbr
+%if %5
+    vpbroadcastb  m28, t1d        ; sec_strength
+    lzcnt         t1d, t1d        ; index of the shift matrix loaded below
+    vpbroadcastq  m29, [r3+t1*8]  ; gf2p8affineqb matrix implementing ">> shift"
+%endif
+    vpsubb    m20{k1}, m4, %1     ; abs(diff_tl)
+    vpsubb    m21{k2}, m5, %2     ; abs(diff_tr)
+    vpsubb    m22{k3}, m6, %3     ; abs(diff_bl)
+    vpsubb    m23{k4}, m7, %4     ; abs(diff_br)
+    gf2p8affineqb m24, m20, m29, 0 ; abs(dtl) >> shift
+    gf2p8affineqb m25, m21, m29, 0 ; abs(dtr) >> shift
+    gf2p8affineqb m26, m22, m29, 0 ; abs(dbl) >> shift
+    gf2p8affineqb m27, m23, m29, 0 ; abs(dbr) >> shift
+%if %5
+    vpbroadcastd  m30, [base+sec_tap] ; overwrites m30, which held px_idx
+%endif
+    psubusb       m24, m28, m24   ; imax(0, sec_strength - (abs(dtl) >> shift))
+    psubusb       m25, m28, m25   ; imax(0, sec_strength - (abs(dtr) >> shift))
+    psubusb       m26, m28, m26   ; imax(0, sec_strength - (abs(dbl) >> shift))
+    psubusb       m27, m28, m27   ; imax(0, sec_strength - (abs(dbr) >> shift))
+    pminub        m20, m24        ; imin(abs(diff_tl), imax(...))
+    pminub        m21, m25        ; imin(abs(diff_tr), imax(...))
+    pminub        m22, m26        ; imin(abs(diff_bl), imax(...))
+    pminub        m23, m27        ; imin(abs(diff_br), imax(...))
+    mova          m24, m30
+    mova          m25, m30
+    mova          m26, m30
+    mova          m27, m30
+    vpsubb    m24{k1}, m31, m30   ; apply_sign(sec_tap_tl)
+    vpsubb    m25{k2}, m31, m30   ; apply_sign(sec_tap_tr)
+    vpsubb    m26{k3}, m31, m30   ; apply_sign(sec_tap_bl)
+    vpsubb    m27{k4}, m31, m30   ; apply_sign(sec_tap_br)
+    vpdpbusd       m0, m20, m24   ; sum tl
+    vpdpbusd       m1, m21, m25   ; sum tr
+    vpdpbusd       m2, m22, m26   ; sum bl
+    vpdpbusd       m3, m23, m27   ; sum br
+%endmacro
+    CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
+    CDEF_FILTER_8x8_SEC m12, m13, m14, m15
+    ret
+
+%endif ; HAVE_AVX512ICL && ARCH_X86_64
diff --git a/ffmpeg/JNI/dav1d/src/x86/cdef_init_tmpl.c b/ffmpeg/JNI/dav1d/src/x86/cdef_init_tmpl.c
index e9077fc7e..edc3b5d4b 100644
--- a/ffmpeg/JNI/dav1d/src/x86/cdef_init_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/x86/cdef_init_tmpl.c
@@ -84,7 +84,9 @@ COLD void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
 
-#if BITDEPTH == 8
+#if HAVE_AVX512ICL && BITDEPTH == 8
+    c->fb[0] = dav1d_cdef_filter_8x8_avx512icl;
+    c->fb[1] = dav1d_cdef_filter_4x8_avx512icl;
     c->fb[2] = dav1d_cdef_filter_4x4_avx512icl;
 #endif
 
diff --git a/ffmpeg/JNI/dav1d/src/x86/film_grain.asm b/ffmpeg/JNI/dav1d/src/x86/film_grain.asm
index 5b596aba0..94ee123a9 100644
--- a/ffmpeg/JNI/dav1d/src/x86/film_grain.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/film_grain.asm
@@ -28,6 +28,8 @@
 %if ARCH_X86_64
 
 SECTION_RODATA 32
+pb_8x_27_17_8x_17_27: times 8 db 27, 17
+                      times 8 db 17, 27
 pw_1024: times 16 dw 1024
 pb_mask: db 0, 0x80, 0x80, 0, 0x80, 0, 0, 0x80, 0x80, 0, 0, 0x80, 0, 0x80, 0x80, 0
 rnd_next_upperbit_mask: dw 0x100B, 0x2016, 0x402C, 0x8058
@@ -60,6 +62,8 @@ pw_1: dw 1
 ALIGN 4
 JMP_TABLE generate_grain_y_avx2, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_420_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_avx2, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_avx2, 0, 1, 2, 3
 
 struc FGData
     .seed:                      resd 1
@@ -413,8 +417,9 @@ cglobal generate_grain_y, 2, 9, 16, buf, fg_data
     jg .y_loop_ar3
     RET
 
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
 INIT_XMM avx2
-cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
+cglobal generate_grain_uv_%1, 4, 10, 16, buf, bufy, fg_data, uv
     lea              r4, [pb_mask]
 %define base r4-pb_mask
     movq            xm1, [base+rnd_next_upperbit_mask]
@@ -428,11 +433,17 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
     pxor            xm0, xm9
     vpbroadcastd    xm9, [base+pd_m65536]
     lea              r6, [gaussian_sequence]
-    mov             r7d, 38
+%if %2
+    mov             r7d, 73-35*%3
     add            bufq, 44
 .loop_y:
     mov              r5, -44
 .loop_x:
+%else
+    mov              r5, -73*82
+    sub            bufq, r5
+.loop:
+%endif
     pand            xm2, xm0, xm1
     psrlw           xm3, xm2, 10
     por             xm2, xm3            ; bits 0xf, 0x1e, 0x3c and 0x78 are set
@@ -455,15 +466,19 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
     packsswb        xm2, xm2
     movd      [bufq+r5], xm2
     add              r5, 4
+%if %2
     jl .loop_x
     add            bufq, 82
     dec             r7d
     jg .loop_y
+%else
+    jl .loop
+%endif
 
     ; auto-regression code
     movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
-    movsxd           r5, [base+generate_grain_uv_420_avx2_table+r5*4]
-    lea              r5, [r5+base+generate_grain_uv_420_avx2_table]
+    movsxd           r5, [base+generate_grain_uv_%1_avx2_table+r5*4]
+    lea              r5, [r5+base+generate_grain_uv_%1_avx2_table]
     jmp              r5
 
 .ar0:
@@ -475,63 +490,126 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
     movd            xm3, [base+hmul_bits+shiftq*2]
     DEFINE_ARGS buf, bufy, h
     pmovsxbw        xm4, xm4
+%if %2
     vpbroadcastd     m7, [pb_1]
-    vpbroadcastw     m6, [hmul_bits+4]
+    vpbroadcastw     m6, [hmul_bits+2+%3*2]
+%endif
     vpbroadcastw     m4, xm4
     vpbroadcastw     m3, xm3
-    sub            bufq, 82*38+82-(82*3+41)
+    pxor            m12, m12
+%if %2
+    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+    sub            bufq, 82*70-3
+%endif
     add           bufyq, 3+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar0:
+%if %2
     ; first 32 pixels
     movu            xm8, [bufyq]
+%if %3
     movu            xm9, [bufyq+82]
+%endif
     movu           xm10, [bufyq+16]
+%if %3
     movu           xm11, [bufyq+82+16]
+%endif
     vinserti128      m8, [bufyq+32], 1
+%if %3
     vinserti128      m9, [bufyq+82+32], 1
+%endif
     vinserti128     m10, [bufyq+48], 1
+%if %3
     vinserti128     m11, [bufyq+82+48], 1
+%endif
     pmaddubsw        m8, m7, m8
+%if %3
     pmaddubsw        m9, m7, m9
+%endif
     pmaddubsw       m10, m7, m10
+%if %3
     pmaddubsw       m11, m7, m11
     paddw            m8, m9
     paddw           m10, m11
+%endif
     pmulhrsw         m8, m6
     pmulhrsw        m10, m6
+%else
+    xor             r3d, r3d
+    ; first 32x2 pixels
+.x_loop_ar0:
+    movu             m8, [bufyq+r3]
+    pcmpgtb          m9, m12, m8
+    punpckhbw       m10, m8, m9
+    punpcklbw        m8, m9
+%endif
     pmullw           m8, m4
     pmullw          m10, m4
     pmulhrsw         m8, m3
     pmulhrsw        m10, m3
-    packsswb         m8, m10
+%if %2
     movu             m0, [bufq]
-    punpckhbw        m1, m0, m8
-    punpcklbw        m0, m8
-    pmaddubsw        m1, m7, m1
-    pmaddubsw        m0, m7, m0
-    packsswb         m0, m1
+%else
+    movu             m0, [bufq+r3]
+%endif
+    pcmpgtb          m1, m12, m0
+    punpckhbw        m9, m0, m1
+    punpcklbw        m0, m1
+    paddw            m0, m8
+    paddw            m9, m10
+    packsswb         m0, m9
+%if %2
     movu         [bufq], m0
+%else
+    movu      [bufq+r3], m0
+    add             r3d, 32
+    cmp             r3d, 64
+    jl .x_loop_ar0
+%endif
 
-    ; last 6 pixels
+    ; last 6/12 pixels
     movu            xm8, [bufyq+32*2]
+%if %2
+%if %3
     movu            xm9, [bufyq+32*2+82]
+%endif
     pmaddubsw       xm8, xm7, xm8
+%if %3
     pmaddubsw       xm9, xm7, xm9
     paddw           xm8, xm9
+%endif
     pmulhrsw        xm8, xm6
     pmullw          xm8, xm4
     pmulhrsw        xm8, xm3
-    packsswb        xm8, xm8
     movq            xm0, [bufq+32]
-    punpcklbw       xm8, xm0
-    pmaddubsw       xm8, xm7, xm8
+    pcmpgtb         xm9, xm12, xm0
+    punpcklbw       xm9, xm0, xm9
+    paddw           xm8, xm9
     packsswb        xm8, xm8
     vpblendw        xm0, xm8, xm0, 1000b
     movq      [bufq+32], xm0
+%else
+    pcmpgtb         xm9, xm12, xm8
+    punpckhbw      xm10, xm8, xm9
+    punpcklbw       xm8, xm9
+    pmullw         xm10, xm4
+    pmullw          xm8, xm4
+    pmulhrsw       xm10, xm3
+    pmulhrsw        xm8, xm3
+    movu            xm0, [bufq+64]
+    pcmpgtb         xm9, xm12, xm0
+    punpcklbw       xm1, xm0, xm9
+    punpckhbw       xm9, xm0, xm9
+    paddw           xm1, xm8
+    paddw           xm9, xm10
+    packsswb        xm1, xm9
+    vpblendw        xm0, xm1, xm0, 11000000b
+    movu      [bufq+64], xm0
+%endif
 
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar0
     RET
@@ -549,27 +627,43 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
     pshufd          xm5, xm4, q1111
     pshufd          xm4, xm4, q0000
     pmovsxwd        xm3, [base+round_vals+shiftq*2-12]    ; rnd
+%if %2
     vpbroadcastd    xm7, [pb_1]
-    vpbroadcastw    xm6, [hmul_bits+4]
+    vpbroadcastw    xm6, [hmul_bits+2+%3*2]
+%endif
     vpbroadcastd    xm3, xm3
-    sub            bufq, 82*38+44-(82*3+41)
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*70-(82-3)
+%endif
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
     mov            mind, -128
     mov            maxd, 127
 .y_loop_ar1:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
     movsx         val3d, byte [bufq+xq-1]
 .x_loop_ar1:
     pmovsxbw        xm0, [bufq+xq-82-1]     ; top/left
+%if %2
     movq            xm8, [bufyq+xq*2]
+%if %3
     movq            xm9, [bufyq+xq*2+82]
+%endif
+%endif
     psrldq          xm2, xm0, 2             ; top
     psrldq          xm1, xm0, 4             ; top/right
+%if %2
     pmaddubsw       xm8, xm7, xm8
+%if %3
     pmaddubsw       xm9, xm7, xm9
     paddw           xm8, xm9
+%endif
     pmulhrsw        xm8, xm6
+%else
+    pmovsxbw        xm8, [bufyq+xq]
+%endif
     punpcklwd       xm0, xm2
     punpcklwd       xm1, xm8
     pmaddwd         xm0, xm4
@@ -598,7 +692,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
 
 .x_loop_ar1_end:
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar1
     RET
@@ -611,8 +705,10 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
     pmovsxbw        xm8, [fg_dataq+FGData.ar_coeffs_uv+uvq+0]   ; cf0-7
     pmovsxbw        xm9, [fg_dataq+FGData.ar_coeffs_uv+uvq+8]   ; cf8-12
     pinsrw          xm9, [base+pw_1], 5
-    vpbroadcastw    xm7, [base+hmul_bits+4]
+%if %2
+    vpbroadcastw    xm7, [base+hmul_bits+2+%3*2]
     vpbroadcastd    xm6, [base+pb_1]
+%endif
     DEFINE_ARGS buf, bufy, fg_data, h, unused, x
     pshufd         xm12, xm9, q0000
     pshufd         xm13, xm9, q1111
@@ -621,11 +717,15 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
     pshufd         xm10, xm8, q2222
     pshufd          xm9, xm8, q1111
     pshufd          xm8, xm8, q0000
-    sub            bufq, 82*38+44-(82*3+41)
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*70-(82-3)
+%endif
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar2:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
 
 .x_loop_ar2:
     pmovsxbw        xm0, [bufq+xq-82*2-2]   ; y=-2,x=[-2,+5]
@@ -654,12 +754,20 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
     paddd           xm2, xm3
     paddd           xm2, xm4
 
+%if %2
     movq            xm0, [bufyq+xq*2]
+%if %3
     movq            xm3, [bufyq+xq*2+82]
+%endif
     pmaddubsw       xm0, xm6, xm0
+%if %3
     pmaddubsw       xm3, xm6, xm3
     paddw           xm0, xm3
+%endif
     pmulhrsw        xm0, xm7
+%else
+    pmovsxbw        xm0, [bufyq+xq]
+%endif
     punpcklwd       xm0, xm15
     pmaddwd         xm0, xm14
     paddd           xm2, xm0
@@ -685,7 +793,7 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
 
 .x_loop_ar2_end:
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar2
     RET
@@ -730,14 +838,20 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
     mova    [rsp+ 9*16], xm3
     mova    [rsp+10*16], xm4
     mova    [rsp+11*16], xm5
+%if %2
     vpbroadcastd   xm13, [base+pb_1]
-    vpbroadcastw   xm15, [base+hmul_bits+4]
+    vpbroadcastw   xm15, [base+hmul_bits+2+%3*2]
+%endif
     DEFINE_ARGS buf, bufy, fg_data, h, unused, x
-    sub            bufq, 82*38+44-(82*3+41)
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*70-(82-3)
+%endif
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar3:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
 
 .x_loop_ar3:
     movu            xm0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
@@ -800,12 +914,20 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
     palignr         xm9, xm5, xm2, 10
     palignr         xm5, xm5, xm2, 12
 
+%if %2
     movq            xm1, [bufyq+xq*2]
+%if %3
     movq            xm2, [bufyq+xq*2+82]
+%endif
     pmaddubsw       xm1, xm13, xm1
+%if %3
     pmaddubsw       xm2, xm13, xm2
     paddw           xm1, xm2
+%endif
     pmulhrsw        xm1, xm15
+%else
+    pmovsxbw        xm1, [bufyq+xq]
+%endif
 
     punpcklwd       xm6, xm7
     punpcklwd       xm8, xm9
@@ -841,10 +963,15 @@ cglobal generate_grain_uv_420, 4, 10, 16, buf, bufy, fg_data, uv
 
 .x_loop_ar3_end:
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar3
     RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1
+generate_grain_uv_fn 422, 1, 0
+generate_grain_uv_fn 444, 0, 0
 
 INIT_YMM avx2
 cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
@@ -1188,9 +1315,8 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
     jz .end_y_v_overlap
     ; 2 lines get vertical overlap, then fall back to non-overlap code for
     ; remaining (up to) 30 lines
-    xor              hd, 0x10000
-    test             hd, 0x10000
-    jnz .loop_y_v_overlap
+    btc              hd, 16
+    jnc .loop_y_v_overlap
     jmp .loop_y
 
 .end_y_v_overlap:
@@ -1321,9 +1447,8 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
     jz .end_y_hv_overlap
     ; 2 lines get vertical overlap, then fall back to non-overlap code for
     ; remaining (up to) 30 lines
-    xor              hd, 0x10000
-    test             hd, 0x10000
-    jnz .loop_y_hv_overlap
+    btc              hd, 16
+    jnc .loop_y_hv_overlap
     jmp .loop_y_h_overlap
 
 .end_y_hv_overlap:
@@ -1334,8 +1459,9 @@ cglobal fgy_32x32xn, 6, 13, 16, dst, src, stride, fg_data, w, scaling, grain_lut
 .end_hv:
     RET
 
-cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
-                                      grain_lut, h, sby, luma, lstride, uv_pl, is_id
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
+cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+                                     grain_lut, h, sby, luma, lstride, uv_pl, is_id
     pcmpeqw         m10, m10
     psrld           m10, 24
     mov             r7d, [fg_dataq+FGData.scaling_shift]
@@ -1351,7 +1477,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
     jne .csfl
 
-%macro FGUV_32x32xN_LOOP 1 ; not-csfl
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
     DEFINE_ARGS dst, src, stride, fg_data, w, scaling, grain_lut, unused, sby, see, overlap
 
 %if %1
@@ -1362,7 +1488,11 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     vpbroadcastw    m15, [fg_dataq+FGData.uv_offset+r7*4]
 %else
     vpbroadcastd    m14, [pw_1024]
+%if %2
     vpbroadcastd    m15, [pb_23_22]
+%else
+    vpbroadcastd   xm15, [pb_27_17_17_27]
+%endif
 %endif
 
     mov        overlapd, [fg_dataq+FGData.overlap_flag]
@@ -1384,7 +1514,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     mov           lumaq, r9mp
     lea             r12, [srcq+wq]
     lea             r13, [dstq+wq]
-    lea             r14, [lumaq+wq*2]
+    lea             r14, [lumaq+wq*(1+%2)]
     mov           r11mp, r12
     mov           r12mp, r13
     mov        lstrideq, r10mp
@@ -1405,8 +1535,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     rorx          offyd, seed, 8
     shr           offxd, 12
     and           offyd, 0xf
-    imul          offyd, 82
-    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
 
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 h, offxy, see, overlap, unused1, unused2, lstride
@@ -1415,21 +1545,29 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     mov      grain_lutq, grain_lutmp
 %%loop_y:
     ; src
+%if %2
     mova            xm4, [lumaq+lstrideq*0+ 0]
     mova            xm6, [lumaq+lstrideq*0+16]
     mova            xm0, [srcq]
     vpbroadcastd     m7, [pb_1]
-    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
-    vinserti128      m6, [lumaq+lstrideq*2+16], 1
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
     vinserti128      m0, [srcq+strideq], 1
     pxor             m2, m2
     pmaddubsw        m4, m7
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    pxor             m2, m2
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -1441,6 +1579,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     punpckhwd        m5, m4, m2
@@ -1469,8 +1610,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     punpcklbw        m0, m2                 ; m0-1: src as word
 
     ; grain = grain_lut[offy+y][offx+x]
+%if %2
     movu            xm3, [grain_lutq+offxyq+ 0]
     vinserti128      m3, [grain_lutq+offxyq+82], 1
+%else
+    movu             m3, [grain_lutq+offxyq]
+%endif
     pcmpgtb          m7, m2, m3
     punpcklbw        m2, m3, m7
     punpckhbw        m3, m7
@@ -1489,21 +1634,31 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     pminsw           m0, m12
     pminsw           m1, m12
     packuswb         m0, m1
+%if %2
     mova         [dstq], xm0
     vextracti128 [dstq+strideq], m0, 1
+%else
+    mova         [dstq], m0
+%endif
 
+%if %2
     lea            srcq, [srcq+strideq*2]
     lea            dstq, [dstq+strideq*2]
-    lea           lumaq, [lumaq+lstrideq*4]
-    add      grain_lutq, 82*2
-    sub              hb, 2
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82<<%2
+    sub              hb, 1+%2
     jg %%loop_y
 
-    add              wq, 16
+    add              wq, 32>>%2
     jge %%end
     mov            srcq, r11mp
     mov            dstq, r12mp
-    lea           lumaq, [r14+wq*2]
+    lea           lumaq, [r14+wq*(1+%2)]
     add            srcq, wq
     add            dstq, wq
     test       overlapd, overlapd
@@ -1525,13 +1680,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 offx, offy, see, left_offxy, unused1, unused2, lstride
 
-    lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
+    lea     left_offxyd, [offyd+(32>>%2)]         ; previous column's offy*stride+offx
     mov           offxd, seed
     rorx          offyd, seed, 8
     shr           offxd, 12
     and           offyd, 0xf
-    imul          offyd, 82
-    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
 
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 h, offxy, see, left_offxy, unused1, unused2, lstride
@@ -1540,21 +1695,29 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     mov      grain_lutq, grain_lutmp
 %%loop_y_h_overlap:
     ; src
+%if %2
     mova            xm4, [lumaq+lstrideq*0+ 0]
     mova            xm6, [lumaq+lstrideq*0+16]
     mova            xm0, [srcq]
     vpbroadcastd     m7, [pb_1]
-    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
-    vinserti128      m6, [lumaq+lstrideq*2+16], 1
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
     vinserti128      m0, [srcq+strideq], 1
     pxor             m2, m2
     pmaddubsw        m4, m7
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -1566,6 +1729,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     punpckhwd        m5, m4, m2
@@ -1594,6 +1760,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     punpcklbw        m0, m2                 ; m0-1: src as word
 
     ; grain = grain_lut[offy+y][offx+x]
+%if %2
 %if %1
     vpbroadcastd     m6, [pb_23_22] ; FIXME
 %endif
@@ -1613,6 +1780,25 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     pcmpeqw          m6, m6 ; FIXME
     psrldq           m6, 15 ; FIXME
     vpblendvb        m3, m3, m4, m6
+%else
+%if %1
+    vpbroadcastd    xm6, [pb_27_17_17_27]
+%endif
+    movu             m3, [grain_lutq+offxyq]
+    movd            xm4, [grain_lutq+left_offxyq]
+    punpcklbw       xm4, xm3
+%if %1
+    pmaddubsw       xm4, xm6, xm4
+    pmulhrsw        xm4, [pw_1024]
+%else
+    pmaddubsw       xm4, xm15, xm4
+    pmulhrsw        xm4, xm14
+%endif
+    packsswb        xm4, xm4
+    pcmpeqw         xm6, xm6
+    psrldq          xm6, 14
+    vpblendvb        m3, m3, m4, m6
+%endif
     pcmpgtb          m7, m2, m3
     punpcklbw        m2, m3, m7
     punpckhbw        m3, m7
@@ -1631,21 +1817,31 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     pminsw           m0, m12
     pminsw           m1, m12
     packuswb         m0, m1
+%if %2
     mova         [dstq], xm0
     vextracti128 [dstq+strideq], m0, 1
+%else
+    mova         [dstq], m0
+%endif
 
+%if %2
     lea            srcq, [srcq+strideq*2]
     lea            dstq, [dstq+strideq*2]
-    lea           lumaq, [lumaq+lstrideq*4]
-    add      grain_lutq, 82*2
-    sub              hb, 2
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82*(1+%2)
+    sub              hb, 1+%2
     jg %%loop_y_h_overlap
 
-    add              wq, 16
+    add              wq, 32>>%2
     jge %%end
     mov            srcq, r11mp
     mov            dstq, r12mp
-    lea           lumaq, [r14+wq*2]
+    lea           lumaq, [r14+wq*(1+%2)]
     add            srcq, wq
     add            dstq, wq
 
@@ -1678,7 +1874,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     mov           lumaq, r9mp
     lea             r12, [srcq+wq]
     lea             r13, [dstq+wq]
-    lea             r14, [lumaq+wq*2]
+    lea             r14, [lumaq+wq*(1+%2)]
     mov           r11mp, r12
     mov           r12mp, r13
     mov        lstrideq, r10mp
@@ -1705,9 +1901,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     rorx          offxd, seed, 12
     and           offyd, 0xf000f
     and           offxd, 0xf000f
-    imul          offyd, 82
+    imul          offyd, 164>>%3
     ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
-    lea           offyq, [offyq+offxq+0x10001*498+16*82]
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
 
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 h, offxy, see, overlap, top_offxy, unused, lstride
@@ -1717,23 +1913,34 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 
     mov              hd, hm
     mov      grain_lutq, grain_lutmp
+%if %2 == 0
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27]
+%endif
 %%loop_y_v_overlap:
     ; src
+%if %2
     mova            xm4, [lumaq+lstrideq*0+ 0]
     mova            xm6, [lumaq+lstrideq*0+16]
     mova            xm0, [srcq]
     vpbroadcastd     m7, [pb_1]
-    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
-    vinserti128      m6, [lumaq+lstrideq*2+16], 1
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
     vinserti128      m0, [srcq+strideq], 1
     pxor             m2, m2
     pmaddubsw        m4, m7
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -1745,6 +1952,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     punpckhwd        m5, m4, m2
@@ -1768,11 +1978,42 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packusdw         m8, m4
     packusdw         m5, m6
 
+%if %2
     ; unpack chroma_source
     punpckhbw        m1, m0, m2
     punpcklbw        m0, m2                 ; m0-1: src as word
+%endif
 
     ; grain = grain_lut[offy+y][offx+x]
+%if %3 == 0
+%if %2
+    mova             m6, [pb_8x_27_17_8x_17_27]
+    movu            xm3, [grain_lutq+offxyq]
+    movu            xm4, [grain_lutq+top_offxyq]
+    vinserti128      m3, [grain_lutq+offxyq+82], 1
+    vinserti128      m4, [grain_lutq+top_offxyq+82], 1
+%else
+    movu             m3, [grain_lutq+offxyq]
+    movu             m4, [grain_lutq+top_offxyq]
+%endif
+    punpckhbw        m9, m4, m3
+    punpcklbw        m4, m3
+%if %2
+    pmaddubsw        m9, m6, m9
+    pmaddubsw        m4, m6, m4
+%else
+    pmaddubsw        m9, m1, m9
+    pmaddubsw        m4, m1, m4
+%endif
+%if %1
+    pmulhrsw         m9, [pw_1024]
+    pmulhrsw         m4, [pw_1024]
+%else
+    pmulhrsw         m9, m14
+    pmulhrsw         m4, m14
+%endif
+    packsswb         m3, m4, m9
+%else
 %if %1
     vpbroadcastd     m6, [pb_23_22]
 %endif
@@ -1792,6 +2033,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     vpermq           m4, m4, q3120
     ; only interpolate first line, insert second line unmodified
     vinserti128      m3, m4, [grain_lutq+offxyq+82], 1
+%endif
     pcmpgtb          m7, m2, m3
     punpcklbw        m2, m3, m7
     punpckhbw        m3, m7
@@ -1803,6 +2045,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     pmulhrsw         m3, m11
 
     ; dst = clip_pixel(src, noise)
+%if %2
     paddw            m0, m2
     paddw            m1, m3
     pmaxsw           m0, m13
@@ -1812,21 +2055,46 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packuswb         m0, m1
     mova         [dstq], xm0
     vextracti128 [dstq+strideq], m0, 1
+%else
+    pxor             m6, m6
+    punpckhbw        m9, m0, m6
+    punpcklbw        m0, m6                 ; m0-1: src as word
 
-    sub              hb, 2
+    paddw            m0, m2
+    paddw            m9, m3
+    pmaxsw           m0, m13
+    pmaxsw           m9, m13
+    pminsw           m0, m12
+    pminsw           m9, m12
+    packuswb         m0, m9
+    mova         [dstq], m0
+%endif
+
+    sub              hb, 1+%2
     jl %%end_y_v_overlap
+%if %2
     lea            srcq, [srcq+strideq*2]
     lea            dstq, [dstq+strideq*2]
-    lea           lumaq, [lumaq+lstrideq*4]
-    add      grain_lutq, 82*2
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82<<%2
+%if %2 == 0
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27+16]
+    btc              hd, 16
+    jnc %%loop_y_v_overlap
+%endif
     jmp %%loop_y
 
 %%end_y_v_overlap:
-    add              wq, 16
+    add              wq, 32>>%2
     jge %%end_hv
     mov            srcq, r11mp
     mov            dstq, r12mp
-    lea           lumaq, [r14+wq*2]
+    lea           lumaq, [r14+wq*(1+%2)]
     add            srcq, wq
     add            dstq, wq
 
@@ -1851,15 +2119,15 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 offx, offy, see, left_offxy, top_offxy, topleft_offxy, lstride
 
-    lea  topleft_offxyq, [top_offxyq+16]
-    lea     left_offxyq, [offyq+16]
+    lea  topleft_offxyq, [top_offxyq+(32>>%2)]
+    lea     left_offxyq, [offyq+(32>>%2)]
     rorx          offyd, seed, 8
     rorx          offxd, seed, 12
     and           offyd, 0xf000f
     and           offxd, 0xf000f
-    imul          offyd, 82
+    imul          offyd, 164>>%3
     ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
-    lea           offyq, [offyq+offxq+0x10001*498+16*82]
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
 
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 h, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
@@ -1869,23 +2137,34 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 
     mov              hd, hm
     mov      grain_lutq, grain_lutmp
+%if %2 == 0
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27]
+%endif
 %%loop_y_hv_overlap:
     ; src
+%if %2
     mova            xm4, [lumaq+lstrideq*0+ 0]
     mova            xm6, [lumaq+lstrideq*0+16]
     mova            xm0, [srcq]
     vpbroadcastd     m7, [pb_1]
-    vinserti128      m4, [lumaq+lstrideq*2 +0], 1
-    vinserti128      m6, [lumaq+lstrideq*2+16], 1
+    vinserti128      m4, [lumaq+lstrideq*(1+%3) +0], 1
+    vinserti128      m6, [lumaq+lstrideq*(1+%3)+16], 1
     vinserti128      m0, [srcq+strideq], 1
     pxor             m2, m2
     pmaddubsw        m4, m7
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -1897,6 +2176,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     punpckhwd        m5, m4, m2
@@ -1920,44 +2202,94 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packusdw         m8, m4
     packusdw         m5, m6
 
+%if %2
     ; unpack chroma source
     punpckhbw        m1, m0, m2
     punpcklbw        m0, m2                 ; m0-1: src as word
+%endif
 
     ; grain = grain_lut[offy+y][offx+x]
 %if %1
+%if %2
     vpbroadcastd     m9, [pb_23_22]
+%else
+    vpbroadcastd    xm9, [pb_27_17_17_27]
 %endif
+%endif
+
+%if %2
     movu            xm3, [grain_lutq+offxyq]
+%if %3
     movq            xm6, [grain_lutq+top_offxyq]
+%else
+    movu            xm6, [grain_lutq+top_offxyq]
+%endif
     vinserti128      m3, [grain_lutq+offxyq+82], 1
+%if %3
     vinserti128      m6, [grain_lutq+top_offxyq+8], 1
+%else
+    vinserti128      m6, [grain_lutq+top_offxyq+82], 1
+%endif
+%else
+    movu             m3, [grain_lutq+offxyq]
+    movu             m6, [grain_lutq+top_offxyq]
+%endif
     movd            xm4, [grain_lutq+left_offxyq]
     movd            xm7, [grain_lutq+topleft_offxyq]
+%if %2
     vinserti128      m4, [grain_lutq+left_offxyq+82], 1
+%if %3 == 0
+    vinserti128      m7, [grain_lutq+topleft_offxyq+82], 1
+%endif
+%endif
+
     ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
+%if %2
     punpcklbw        m4, m3
+%if %3
     punpcklbw       xm7, xm6
+%else
+    punpcklbw        m7, m6
+%endif
+    punpcklwd        m4, m7
 %if %1
     pmaddubsw        m4, m9, m4
-    pmaddubsw       xm7, xm9, xm7
     pmulhrsw         m4, [pw_1024]
-    pmulhrsw        xm7, [pw_1024]
 %else
     pmaddubsw        m4, m15, m4
-    pmaddubsw       xm7, xm15, xm7
     pmulhrsw         m4, m14
-    pmulhrsw        xm7, xm14
 %endif
     packsswb         m4, m4
-    packsswb        xm7, xm7
     pcmpeqw          m9, m9                 ; this is kind of ugly
     psrldq           m9, 15
     vpblendvb        m3, m3, m4, m9
-    shufpd           m9, m9, m9, 1110b
-    vpblendvb        m6, m6, m7, m9
-    vpermq           m9, m3, q3120
+    psrldq           m4, 1
+%if %3
+    shufpd           m9, m9, m9, 1110b      ; clear upper lane
+%endif
+    vpblendvb        m6, m6, m4, m9
+%else
+    punpcklbw       xm4, xm3
+    punpcklbw       xm7, xm6
+    punpckldq       xm4, xm7
+%if %1
+    pmaddubsw       xm4, xm9, xm4
+    pmulhrsw        xm4, [pw_1024]
+%else
+    pmaddubsw       xm4, xm15, xm4
+    pmulhrsw        xm4, xm14
+%endif
+    packsswb        xm4, xm4
+    pcmpeqw         xm9, xm9                 ; this is kind of ugly
+    psrldq          xm9, 14
+    vpblendvb        m3, m3, m4, m9
+    psrldq          xm4, 2
+    vpblendvb        m6, m6, m4, m9
+%endif
+
     ; followed by v interpolation (top | cur -> cur)
+%if %3
+    vpermq           m9, m3, q3120
     punpcklbw        m6, m9
 %if %1
     vpbroadcastd     m9, [pb_23_22]
@@ -1970,6 +2302,26 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packsswb         m6, m6
     vpermq           m6, m6, q3120
     vpblendd         m3, m3, m6, 00001111b
+%else
+    punpckhbw        m9, m6, m3
+    punpcklbw        m6, m3
+%if %2
+    mova             m3, [pb_8x_27_17_8x_17_27]
+    pmaddubsw        m9, m3, m9
+    pmaddubsw        m6, m3, m6
+%else
+    pmaddubsw        m9, m1, m9
+    pmaddubsw        m6, m1, m6
+%endif
+%if %1
+    pmulhrsw         m9, [pw_1024]
+    pmulhrsw         m6, [pw_1024]
+%else
+    pmulhrsw         m9, m14
+    pmulhrsw         m6, m14
+%endif
+    packsswb         m3, m6, m9
+%endif
     pcmpgtb          m7, m2, m3
     punpcklbw        m2, m3, m7
     punpckhbw        m3, m7
@@ -1981,6 +2333,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     pmulhrsw         m3, m11
 
     ; dst = clip_pixel(src, noise)
+%if %2
     paddw            m0, m2
     paddw            m1, m3
     pmaxsw           m0, m13
@@ -1990,20 +2343,47 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packuswb         m0, m1
     mova         [dstq], xm0
     vextracti128 [dstq+strideq], m0, 1
+%else
+    pxor             m6, m6
+    punpckhbw        m9, m0, m6
+    punpcklbw        m0, m6                 ; m0-1: src as word
+    paddw            m0, m2
+    paddw            m9, m3
+    pmaxsw           m0, m13
+    pmaxsw           m9, m13
+    pminsw           m0, m12
+    pminsw           m9, m12
+    packuswb         m0, m9
+    mova         [dstq], m0
+%endif
 
+%if %2
     lea            srcq, [srcq+strideq*2]
     lea            dstq, [dstq+strideq*2]
-    lea           lumaq, [lumaq+lstrideq*4]
-    add      grain_lutq, 82*2
-    sub              hb, 2
+    lea           lumaq, [lumaq+lstrideq*(2<<%3)]
+%else
+    add            srcq, strideq
+    add            dstq, strideq
+    add           lumaq, lstrideq
+%endif
+    add      grain_lutq, 82<<%2
+    sub              hb, 1+%2
+%if %2
     jg %%loop_y_h_overlap
+%else
+    je %%end_y_hv_overlap
+    vbroadcasti128   m1, [pb_8x_27_17_8x_17_27+16]
+    btc              hd, 16
+    jnc %%loop_y_hv_overlap
+    jmp %%loop_y_h_overlap
+%endif
 
 %%end_y_hv_overlap:
-    add              wq, 16
+    add              wq, 32>>%2
     jge %%end_hv
     mov            srcq, r11mp
     mov            dstq, r12mp
-    lea           lumaq, [r14+wq*2]
+    lea           lumaq, [r14+wq*(1+%2)]
     add            srcq, wq
     add            dstq, wq
     jmp %%loop_x_hv_overlap
@@ -2012,8 +2392,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     RET
 %endmacro
 
-    FGUV_32x32xN_LOOP 1
+    %%FGUV_32x32xN_LOOP 1, %2, %3
 .csfl:
-    FGUV_32x32xN_LOOP 0
+    %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1 ; args: layout name, %2 = horizontal chroma subsampling, %3 = vertical chroma subsampling
+FGUV_FN 422, 1, 0 ; 4:2:2: subsampled horizontally only (luma row step is not doubled)
+FGUV_FN 444, 0, 0 ; 4:4:4: no subsampling; the loop handles 32 pixels of one row per iteration
 
 %endif ; ARCH_X86_64
diff --git a/ffmpeg/JNI/dav1d/src/x86/film_grain_init_tmpl.c b/ffmpeg/JNI/dav1d/src/x86/film_grain_init_tmpl.c
index 30bb52d06..25e8ef99e 100644
--- a/ffmpeg/JNI/dav1d/src/x86/film_grain_init_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/x86/film_grain_init_tmpl.c
@@ -30,13 +30,21 @@
 
 decl_generate_grain_y_fn(dav1d_generate_grain_y_ssse3);
 decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_ssse3);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_ssse3);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_ssse3);
 decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_ssse3);
 decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_ssse3);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_ssse3);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_ssse3);
 
 decl_generate_grain_y_fn(dav1d_generate_grain_y_avx2);
 decl_generate_grain_uv_fn(dav1d_generate_grain_uv_420_avx2);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_422_avx2);
+decl_generate_grain_uv_fn(dav1d_generate_grain_uv_444_avx2);
 decl_fgy_32x32xn_fn(dav1d_fgy_32x32xn_avx2);
 decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i420_avx2);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i422_avx2);
+decl_fguv_32x32xn_fn(dav1d_fguv_32x32xn_i444_avx2);
 
 COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
@@ -46,8 +54,12 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
 #if BITDEPTH == 8
     c->generate_grain_y = dav1d_generate_grain_y_ssse3;
     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_ssse3;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_ssse3;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_ssse3;
     c->fgy_32x32xn = dav1d_fgy_32x32xn_ssse3;
     c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_ssse3;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_ssse3;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_ssse3;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
@@ -55,7 +67,11 @@ COLD void bitfn(dav1d_film_grain_dsp_init_x86)(Dav1dFilmGrainDSPContext *const c
 #if BITDEPTH == 8 && ARCH_X86_64
     c->generate_grain_y = dav1d_generate_grain_y_avx2;
     c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_generate_grain_uv_420_avx2;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_generate_grain_uv_422_avx2;
+    c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_generate_grain_uv_444_avx2;
     c->fgy_32x32xn = dav1d_fgy_32x32xn_avx2;
     c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_fguv_32x32xn_i420_avx2;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_fguv_32x32xn_i422_avx2;
+    c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_fguv_32x32xn_i444_avx2;
 #endif
 }
diff --git a/ffmpeg/JNI/dav1d/src/x86/film_grain_ssse3.asm b/ffmpeg/JNI/dav1d/src/x86/film_grain_ssse3.asm
index 6402cec51..8212846f2 100644
--- a/ffmpeg/JNI/dav1d/src/x86/film_grain_ssse3.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/film_grain_ssse3.asm
@@ -60,6 +60,8 @@ pw_1: dw 1
 
 JMP_TABLE generate_grain_y_ssse3, 0, 1, 2, 3
 JMP_TABLE generate_grain_uv_420_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_422_ssse3, 0, 1, 2, 3
+JMP_TABLE generate_grain_uv_444_ssse3, 0, 1, 2, 3
 
 struc FGData
     .seed:                      resd 1
@@ -502,8 +504,9 @@ cglobal generate_grain_y, 2, 7 + 2 * ARCH_X86_64, 16, buf, fg_data
     jg .y_loop_ar3
     RET
 
+%macro generate_grain_uv_fn 3 ; ss_name, ss_x, ss_y
 INIT_XMM ssse3
-cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
+cglobal generate_grain_uv_%1, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, uv
     movifnidn        r2, r2mp
     movifnidn        r3, r3mp
     LEA              r4, $$
@@ -520,15 +523,21 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
     pshuflw          m6, m6, q0000
     pshuflw          m0, m0, q0000
     lea              r6, [base+gaussian_sequence]
+%if %2
 %if ARCH_X86_64
-    mov             r7d, 38
+    mov             r7d, 73-35*%3
 %else
-    mov            r3mp, 38
+    mov            r3mp, 73-35*%3
 %endif
     add            bufq, 44
 .loop_y:
     mov              r5, -44
 .loop_x:
+%else
+    mov              r5, -82*73
+    sub            bufq, r5
+.loop:
+%endif
     pand             m2, m0, m1
     psrlw            m3, m2, 10
     por              m2, m3             ; bits 0xf, 0x1e, 0x3c and 0x78 are set
@@ -577,6 +586,7 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
     packsswb         m3, m3
     movd      [bufq+r5], m3
     add              r5, 4
+%if %2
     jl .loop_x
     add            bufq, 82
 %if ARCH_X86_64
@@ -585,6 +595,9 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
     dec            r3mp
 %endif
     jg .loop_y
+%else
+    jl .loop
+%endif
 
 %if ARCH_X86_32
     mov              r2, r2mp
@@ -592,8 +605,8 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
 
     ; auto-regression code
     movsxd           r5, [fg_dataq+FGData.ar_coeff_lag]
-    movsxd           r5, [base+generate_grain_uv_420_ssse3_table+r5*4]
-    lea              r5, [r5+base+generate_grain_uv_420_ssse3_table]
+    movsxd           r5, [base+generate_grain_uv_%1_ssse3_table+r5*4]
+    lea              r5, [r5+base+generate_grain_uv_%1_ssse3_table]
     jmp              r5
 
 .ar0:
@@ -607,79 +620,130 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
     mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
     movd             m5, [fg_dataq+FGData.ar_coeffs_uv+uvq]
     movd             m4, [base+hmul_bits+shiftq*2]
-    movd             m1, [base+byte_blend]
-    DEFINE_ARGS buf, bufy, h
+    DEFINE_ARGS buf, bufy, h, x
     pxor             m0, m0
     pcmpgtb          m0, m5
     punpcklbw        m5, m0
     movd             m7, [base+pb_1]
-    movd             m6, [base+hmul_bits+4]
+%if %2
+    movd             m6, [base+hmul_bits+2+%3*2]
+%endif
     pshuflw          m5, m5, q0000
     pshuflw          m4, m4, q0000
     pshufd           m7, m7, q0000
+%if %2
     pshuflw          m6, m6, q0000
+%endif
     punpcklqdq       m5, m5
     punpcklqdq       m4, m4
+%if %2
     punpcklqdq       m6, m6
-    punpcklbw        m1, m1
+%endif
+    pcmpeqw          m1, m1
+    pslldq           m1, 12>>%2
     SCRATCH           1, 8, 0
     SCRATCH           4, 9, 1
-    sub            bufq, 82*38+82-(82*3+41)
+%if %2
+    sub            bufq, 82*(73-35*%3)+82-(82*3+41)
+%else
+    sub            bufq, 82*70-3
+%endif
     add           bufyq, 3+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar0:
+    xor              xd, xd
+.x_loop_ar0:
     ; first 32 pixels
-    movu             m1, [bufyq]
-    movu             m2, [bufyq+82]
-    movu             m3, [bufyq+16]
-    movu             m4, [bufyq+82+16]
+%if %2
+    movu             m1, [bufyq+xq*2]
+%if %3
+    movu             m2, [bufyq+xq*2+82]
+%endif
+    movu             m3, [bufyq+xq*2+16]
+%if %3
+    movu             m4, [bufyq+xq*2+82+16]
+%endif
     pmaddubsw        m0, m7, m1
+%if %3
     pmaddubsw        m1, m7, m2
+%endif
     pmaddubsw        m2, m7, m3
+%if %3
     pmaddubsw        m3, m7, m4
     paddw            m0, m1
     paddw            m2, m3
+%endif
     pmulhrsw         m0, m6
     pmulhrsw         m2, m6
+%else
+    movu             m0, [bufyq+xq]
+    pxor             m6, m6
+    pcmpgtb          m6, m0
+    punpckhbw        m2, m0, m6
+    punpcklbw        m0, m6
+%endif
     pmullw           m0, m5
     pmullw           m2, m5
     pmulhrsw         m0, m9
     pmulhrsw         m2, m9
+    movu             m1, [bufq+xq]
+    pxor             m4, m4
+    pcmpgtb          m4, m1
+    punpckhbw        m3, m1, m4
+%if %2
+    punpcklbw        m1, m4
+    paddw            m2, m3
+    paddw            m0, m1
+%else
+    punpcklbw        m6, m1, m4
+    paddw            m2, m3
+    paddw            m0, m6
+%endif
     packsswb         m0, m2
-    movu             m1, [bufq]
-    punpckhbw        m2, m0, m1
-    punpcklbw        m0, m1
-    pmaddubsw        m1, m7, m2
-    pmaddubsw        m2, m7, m0
-    packsswb         m2, m1
-    movu         [bufq], m2
-    add           bufyq, 32
-    add            bufq, 16
-    xor              hd, 0x10000
-    test             hd, 0x10000
-    jnz .y_loop_ar0
-
-    ; last 6 pixels
-    movu             m1, [bufyq]
-    movu             m2, [bufyq+82]
+%if %2
+    movu      [bufq+xq], m0
+    add              xd, 16
+    cmp              xd, 32
+    jl .x_loop_ar0
+
+    ; last 6/12 pixels
+    movu             m1, [bufyq+xq*(1+%2)]
+%if %3
+    movu             m2, [bufyq+xq*2+82]
+%endif
     pmaddubsw        m0, m7, m1
+%if %3
     pmaddubsw        m1, m7, m2
     paddw            m0, m1
+%endif
     pmulhrsw         m0, m6
     pmullw           m0, m5
     pmulhrsw         m0, m9
+    movq             m1, [bufq+xq]
+    pxor             m4, m4
+    pcmpgtb          m4, m1
+    punpcklbw        m2, m1, m4
+    paddw            m0, m2
     packsswb         m0, m0
-    movq             m1, [bufq]
-    punpcklbw        m0, m1
-    pmaddubsw        m2, m7, m0
-    packsswb         m2, m2
-    pandn            m0, m8, m2
+    pandn            m2, m8, m0
+    pand             m1, m8
+    por              m2, m1
+    movq      [bufq+xq], m2
+%else
+    add              xd, 16
+    cmp              xd, 80
+    je .y_loop_final_ar0
+    movu   [bufq+xq-16], m0
+    jmp .x_loop_ar0
+.y_loop_final_ar0:
+    pandn            m2, m8, m0
     pand             m1, m8
-    por              m0, m1
-    movq         [bufq], m0
+    por              m2, m1
+    movu   [bufq+xq-16], m2
+%endif
 
-    add            bufq, 82-32
-    add           bufyq, 82*2-64
+    add            bufq, 82
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar0
     RET
@@ -706,8 +770,10 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
 %endif
     mov          shiftd, [fg_dataq+FGData.ar_coeff_shift]
     movd             m3, [base+round_vals+shiftq*2-12]    ; rnd
+%if %2
     movd             m7, [base+pb_1]
-    movd             m6, [base+hmul_bits+4]
+    movd             m6, [base+hmul_bits+2+%3*2]
+%endif
     psrldq           m4, 1
 %if ARCH_X86_32
     DEFINE_ARGS buf, shift, val0, val3, min, max, x
@@ -718,40 +784,64 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
 %endif
     pxor             m5, m5
     punpcklwd        m3, m5
+%if %2
     punpcklwd        m6, m6
+%endif
     pcmpgtb          m5, m4
     punpcklbw        m4, m5
     pshufd           m5, m4, q1111
     pshufd           m4, m4, q0000
     pshufd           m3, m3, q0000
+%if %2
     pshufd           m7, m7, q0000
     pshufd           m6, m6, q0000
-    sub            bufq, 82*38+44-(82*3+41)
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*69+3
+%endif
 %if ARCH_X86_32
     add            r1mp, 79+82*3
-    mov            r0mp, 35
+    mov            r0mp, 70-35*%3
 %else
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 %endif
     mov            mind, -128
     mov            maxd, 127
 .y_loop_ar1:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
     movsx         val3d, byte [bufq+xq-1]
 .x_loop_ar1:
+%if %2
 %if ARCH_X86_32
     mov              r2, r1mp
     movq             m0, [r2+xq*2]
+%if %3
     movq             m1, [r2+xq*2+82]
+%endif
 %else
     movq             m0, [bufyq+xq*2]
+%if %3
     movq             m1, [bufyq+xq*2+82]
+%endif
 %endif
     pmaddubsw        m2, m7, m0
+%if %3
     pmaddubsw        m0, m7, m1
     paddw            m2, m0
+%endif
     pmulhrsw         m2, m6
+%else
+%if ARCH_X86_32
+    mov              r2, r1mp
+    movd             m2, [r2+xq]
+%else
+    movd             m2, [bufyq+xq]
+%endif
+    pxor             m0, m0
+    pcmpgtb          m0, m2
+    punpcklbw        m2, m0
+%endif
 
     movq             m0, [bufq+xq-82-1]     ; top/left
     pxor             m1, m1
@@ -792,10 +882,10 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
 .x_loop_ar1_end:
     add            bufq, 82
 %if ARCH_X86_32
-    add            r1mp, 82*2
+    add            r1mp, 82<<%3
     dec            r0mp
 %else
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
 %endif
     jg .y_loop_ar1
@@ -837,16 +927,20 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
     SCRATCH           5, 13, 5
     SCRATCH           6, 14, 6
     SCRATCH           7, 15, 7
-    movd             m7, [base+hmul_bits+4]
+%if %2
+    movd             m7, [base+hmul_bits+2+%3*2]
     movd             m6, [base+pb_1]
     punpcklwd        m7, m7
     pshufd           m6, m6, q0000
     pshufd           m7, m7, q0000
-    sub            bufq, 82*38+44-(82*3+41)
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*69+3
+%endif
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar2:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
 
 .x_loop_ar2:
     pxor             m2, m2
@@ -879,12 +973,23 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
     paddd            m2, m3
     paddd            m2, m4
 
-    movq             m0, [bufyq+xq*2]
+%if %2
+    movq             m1, [bufyq+xq*2]
+%if %3
     movq             m3, [bufyq+xq*2+82]
-    pmaddubsw        m1, m6, m0
-    pmaddubsw        m0, m6, m3
+%endif
+    pmaddubsw        m0, m6, m1
+%if %3
+    pmaddubsw        m1, m6, m3
     paddw            m0, m1
+%endif
     pmulhrsw         m0, m7
+%else
+    movd             m0, [bufyq+xq]
+    pxor             m1, m1
+    pcmpgtb          m1, m0
+    punpcklbw        m0, m1
+%endif
     punpcklwd        m0, m15
     pmaddwd          m0, m14
     paddd            m2, m0
@@ -914,7 +1019,7 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
 
 .x_loop_ar2_end:
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar2
     RET
@@ -977,24 +1082,36 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
     SCRATCH           5, 12, 11
 
     movd             m2, [base+round_vals-12+shiftq*2]
+%if %2
     movd             m1, [base+pb_1]
-    movd             m3, [base+hmul_bits+4]
+    movd             m3, [base+hmul_bits+2+%3*2]
+%endif
     pxor             m0, m0
     punpcklwd        m2, m0
+%if %2
     punpcklwd        m3, m3
+%endif
     pshufd           m2, m2, q0000
+%if %2
     pshufd           m1, m1, q0000
     pshufd           m3, m3, q0000
     SCRATCH           1, 13, 12
+%endif
     SCRATCH           2, 14, 13
+%if %2
     SCRATCH           3, 15, 14
+%endif
 
     DEFINE_ARGS buf, bufy, fg_data, h, unused, x
-    sub            bufq, 82*38+44-(82*3+41)
+%if %2
+    sub            bufq, 82*(73-35*%3)+44-(82*3+41)
+%else
+    sub            bufq, 82*69+3
+%endif
     add           bufyq, 79+82*3
-    mov              hd, 35
+    mov              hd, 70-35*%3
 .y_loop_ar3:
-    mov              xq, -38
+    mov              xq, -(76>>%2)
 
 .x_loop_ar3:
     movu             m0, [bufq+xq-82*3-3]   ; y=-3,x=[-3,+12]
@@ -1058,12 +1175,23 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
     paddd            m3, m5
     paddd            m0, m3
 
+%if %2
     movq             m1, [bufyq+xq*2]
+%if %3
     movq             m3, [bufyq+xq*2+82]
-    pmaddubsw        m5, m13, m1
-    pmaddubsw        m7, m13, m3
+%endif
+    pmaddubsw        m7, m13, m1
+%if %3
+    pmaddubsw        m5, m13, m3
     paddw            m7, m5
+%endif
     pmulhrsw         m7, m15
+%else
+    movd             m7, [bufyq+xq]
+    pxor             m1, m1
+    pcmpgtb          m1, m7
+    punpcklbw        m7, m1
+%endif
 
     psrldq           m1, m2, 4
     psrldq           m3, m2, 6
@@ -1110,10 +1238,15 @@ cglobal generate_grain_uv_420, 1, 7 + 3 * ARCH_X86_64, 16, buf, bufy, fg_data, u
 
 .x_loop_ar3_end:
     add            bufq, 82
-    add           bufyq, 82*2
+    add           bufyq, 82<<%3
     dec              hd
     jg .y_loop_ar3
     RET
+%endmacro
+
+generate_grain_uv_fn 420, 1, 1 ; %2=1: luma grain read at xq*2; %3=1: two luma rows per chroma row
+generate_grain_uv_fn 422, 1, 0 ; %2=1: luma grain read at xq*2; %3=0: one luma row per chroma row
+generate_grain_uv_fn 444, 0, 0 ; %2=0, %3=0: luma grain read 1:1 at xq
 
 %macro vpgatherdw 5-6 ; dst, src, base, tmp_gpr[x2], tmp_xmm_reg
 %assign %%idx 0
@@ -1359,13 +1492,11 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
 %if ARCH_X86_32
     mov            srcq, r1mp
     add            srcq, r4mp
-    xor            r8mp, 4
-    test           r8mp, 4
 %else
     lea            srcq, [src_bakq+wq]
-    test           srcq, 16             ; this relies on buffer alignment...
 %endif
-    jz .next_blk
+    btc       dword r8m, 2             ; toggle bit 2 of r8m; CF = the bit's previous value
+    jc .next_blk                       ; bit was set before the toggle
 
     add          offxyd, 16
     test      dword r8m, 2              ; r8m & 2 = have_top_overlap
@@ -1507,11 +1638,10 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
 %if ARCH_X86_32
     mov            srcq, r1m
     add            srcq, r4m
-    xor            r8mp, 4
 %else
     lea            srcq, [src_bakq+wq]
 %endif
-    ; assert(srcq & 16) != 0
+    xor       dword r8m, 4             ; flip bit 2 of r8m (the bit the btc checks in this function toggle)
     add          offxyd, 16
 
     ; since this half-block had left-overlap, the next does not
@@ -1712,9 +1842,8 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
     jz .end_y_v_overlap
     ; 2 lines get vertical overlap, then fall back to non-overlap code for
     ; remaining (up to) 30 lines
-    xor              hd, 0x10000
-    test             hd, 0x10000
-    jnz .loop_y_v_overlap
+    btc              hd, 16             ; toggle bit 16 of hd; CF = the bit's previous value
+    jnc .loop_y_v_overlap               ; bit was clear: run the second overlap line
     jmp .loop_y
 
 .end_y_v_overlap:
@@ -1727,13 +1856,11 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
 %if ARCH_X86_32
     mov            srcq, r1mp
     add            srcq, r4mp
-    xor            r8mp, 4
-    test           r8mp, 4
 %else
     lea            srcq, [src_bakq+wq]
-    test           srcq, 16
 %endif
-    jz .loop_x_hv_overlap
+    btc       dword r8m, 2             ; toggle bit 2 of r8m; CF = the bit's previous value
+    jc .loop_x_hv_overlap              ; bit was set before the toggle
     add          offxyd, 16
 %if ARCH_X86_32
     add dword [rsp+6*mmsize+1*gprsize], 16
@@ -1915,9 +2042,8 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
     jz .end_y_hv_overlap
     ; 2 lines get vertical overlap, then fall back to non-overlap code for
     ; remaining (up to) 30 lines
-    xor              hd, 0x10000
-    test             hd, 0x10000
-    jnz .loop_y_hv_overlap
+    btc              hd, 16             ; toggle bit 16 of hd; CF = the bit's previous value
+    jnc .loop_y_hv_overlap              ; bit was clear: run the second overlap line
     jmp .loop_y_h_overlap
 
 .end_y_hv_overlap:
@@ -1930,11 +2056,10 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
 %if ARCH_X86_32
     mov            srcq, r1m
     add            srcq, r4m
-    xor            r8mp, 4
 %else
     lea            srcq, [src_bakq+wq]
 %endif
-    ; assert(srcq & 16) != 0
+    xor       dword r8m, 4
     add          offxyd, 16
 %if ARCH_X86_32
     add dword [rsp+6*mmsize+1*gprsize], 16
@@ -1946,13 +2071,14 @@ cglobal fgy_32x32xn, 6, 15, 16, dst, src, stride, fg_data, w, scaling, grain_lut
 .end_hv:
     RET
 
+%macro FGUV_FN 3 ; name, ss_hor, ss_ver
 INIT_XMM ssse3
 %if ARCH_X86_32
 ; fguv_32x32xn_i420_ssse3(dst, src, stride, fg_data, w, scaling, grain_lut, h,
 ;                         sby, luma, lstride, uv_pl, is_id)
 %if STACK_ALIGNMENT < mmsize
 DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8
-cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
+cglobal fguv_32x32xn_i%1, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
         tmp, src, scaling, h, fg_data, picptr, unused
     mov              r0, r0m
     mov              r1, r2m
@@ -1975,7 +2101,7 @@ cglobal fguv_32x32xn_i420, 0, 7, 8, 0 - (8 * mmsize + (13 + 3) * gprsize), \
     mov [rsp+8*mmsize+13*gprsize], r2
     mov [rsp+8*mmsize+14*gprsize], r4
 %else
-cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
+cglobal fguv_32x32xn_i%1, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
         tmp, src, scaling, h, fg_data, picptr, unused
 %endif
     mov            srcq, srcm
@@ -2000,13 +2126,13 @@ cglobal fguv_32x32xn_i420, 0, 7, 8, 8 * mmsize + (4) * gprsize, \
 %define base r5-pb_mask
     mov             r5m, r5
 %else
-cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
-                                      grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
+cglobal fguv_32x32xn_i%1, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
+                                     grain_lut, tmp, sby, luma, lstride, uv_pl, is_id
     lea              r8, [pb_mask]
 %define base r8-pb_mask
 %endif
     mov             r6d, [fg_dataq+FGData.scaling_shift]
-    movd             m2, [base+byte_blend+3]
+    pcmpeqw          m2, m2             ; m2 = all ones
     movd             m3, [base+mul_bits+r6*2-14]
     mov             r6d, [fg_dataq+FGData.clip_to_restricted_range]
     lea            tmpd, [r6d*2]
@@ -2018,6 +2144,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     movd             m5, [base+min+r6*2]
     cmovne          r6d, tmpd
     movd             m4, [base+max+r6*2]
+    psrldq           m2, 14+%2          ; byte-shift right: only the low 2-%2 bytes of m2 stay non-zero
     punpcklwd        m3, m3
     punpcklwd        m5, m5
     punpcklwd        m4, m4
@@ -2032,7 +2159,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     cmp byte [fg_dataq+FGData.chroma_scaling_from_luma], 0
     jne .csfl
 
-%macro FGUV_32x32xN_LOOP 1 ; not-csfl
+%macro %%FGUV_32x32xN_LOOP 3 ; not-csfl, ss_hor, ss_ver
 %if ARCH_X86_32
     DEFINE_ARGS dst, src, scaling, sby, fg_data, picptr, overlap
 %else
@@ -2058,10 +2185,18 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     test       overlapd, overlapd
     jz %%no_vertical_overlap
 %if ARCH_X86_32
+%if %2
     movd             m1, [base+pb_23_22]
+%else
+    movd             m1, [base+pb_27_17_17_27]
+%endif
     mova             m0, [base+pw_1024]
 %else
+%if %2
     movd             m1, [pb_23_22]
+%else
+    movd             m1, [pb_27_17_17_27]
+%endif
     mova             m0, [pw_1024]
 %endif
     pshufd           m1, m1, q0000
@@ -2091,7 +2226,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 %define luma_bakq lumaq
 
     mov              wq, r4m
+%if %3
     shl           r10mp, 1
+%endif
 %else
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 unused2, unused3, see, overlap, unused4, src_bak, lstride, luma_bak
@@ -2101,7 +2238,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 
     mov           lumaq, r9mp
     lea        src_bakq, [srcq+wq]
-    lea       luma_bakq, [lumaq+wq*2]
+    lea       luma_bakq, [lumaq+wq*(1+%2)]
     neg              wq
     sub            r0mp, srcq
 %if ARCH_X86_32
@@ -2112,7 +2249,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
 %else
     mov           r11mp, src_bakq
-    mov           r10mp, strideq
+    mov           r12mp, strideq        ; spill stride to arg slot 12; slot 10 (lstride) is still read later
 %endif
 
 %%loop_x:
@@ -2141,8 +2278,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     ror           offyd, 8
     shr           offxd, 12
     and           offyd, 0xf
-    imul          offyd, 82
-    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+(3+(6>>%2))]  ; offy*stride+offx
 
 %if ARCH_X86_32
     DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
@@ -2151,6 +2288,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
                 h, offxy, see, overlap, unused1, unused2, lstride, luma_bak
 %endif
 
+%%loop_x_odd:
     mov              hd, r7m
     mov      grain_lutq, grain_lutmp
 %%loop_y:
@@ -2158,6 +2296,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 %if ARCH_X86_32
     mov           lumaq, r9mp
 %endif
+%if %2
     mova             m4, [lumaq+ 0]
     mova             m6, [lumaq+16]
     mova             m0, [srcq]
@@ -2175,9 +2314,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -2189,6 +2339,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     ; scaling[luma_src]
@@ -2239,8 +2392,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     add            srcq, r2mp
     ; we already incremented lumaq above
 %else
-    add            srcq, r10mp
+    add            srcq, r12mp
+%if %3
     lea           lumaq, [lumaq+lstrideq*2]
+%else
+    add           lumaq, lstrideq
+%endif
 %endif
     add      grain_lutq, 82
     dec              hw
@@ -2259,11 +2416,26 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 %else
     mov            srcq, r11mp
 %endif
-    lea           lumaq, [luma_bakq+wq*2]
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
     add            srcq, wq
 %if ARCH_X86_32
     mov             r4m, wq
     mov             r9m, lumaq
+%endif
+%if %2 == 0
+    ; ss_hor == 0: step top_offxy and offxy by 16
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16 ; top_offxy stack slot
+%else
+    add            r11d, 16             ; r11 is top_offxy in the overlap register layouts
+%endif
+    add          offxyd, 16
+    btc       dword r8m, 2              ; toggle bit 2 of r8m; CF = the bit's previous value
+    jc %%loop_x_even                    ; bit was set: continue with the regular dispatch below
+    test      dword r8m, 2              ; NOTE(review): presumably have_top_overlap, as in fgy - confirm
+    jz %%loop_x_odd                     ; bit 1 clear: odd half without vertical overlap
+    jmp %%loop_x_odd_v_overlap          ; bit 1 set: odd half with vertical overlap
+%%loop_x_even:
 %endif
     test      dword r8m, 1
     jz %%loop_x
@@ -2275,8 +2447,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     ; horizontal overlap (without vertical overlap)
 %%loop_x_h_overlap:
 %if ARCH_X86_32
+%if %2
     lea              r6, [offxyd+16]
     mov [rsp+8*mmsize+0*gprsize], r6
+%else
+    mov [rsp+8*mmsize+0*gprsize], offxyd
+%endif
 
     DEFINE_ARGS luma, src, scaling, see, w, picptr, grain_lut
 
@@ -2285,7 +2461,11 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 offx, offy, see, left_offxy, unused1, unused2, lstride
 
+%if %2
     lea     left_offxyd, [offyd+16]         ; previous column's offy*stride+offx
+%else
+    mov     left_offxyd, offyd
+%endif
 %endif
     mov             r6d, seed
     or             seed, 0xEFF4
@@ -2310,8 +2490,8 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     ror           offyd, 8
     shr           offxd, 12
     and           offyd, 0xf
-    imul          offyd, 82
-    lea           offyq, [offyq+offxq+498]  ; offy*stride+offx
+    imul          offyd, 164>>%3
+    lea           offyq, [offyq+offxq*(2-%2)+(3+(6>>%3))*82+3+(6>>%2)]  ; offy*stride+offx
 
 %if ARCH_X86_32
     DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
@@ -2327,6 +2507,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 %if ARCH_X86_32
     mov           lumaq, r9mp
 %endif
+%if %2
     mova             m4, [lumaq+ 0]
     mova             m6, [lumaq+16]
     mova             m0, [srcq]
@@ -2344,9 +2525,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -2358,6 +2550,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     ; scaling[luma_src]
@@ -2422,8 +2617,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     add            srcq, r2mp
     ; lumaq has already been incremented above
 %else
-    add            srcq, r10mp
+    add            srcq, r12mp
+%if %3
     lea           lumaq, [lumaq+lstrideq*2]
+%else
+    add           lumaq, lstrideq
+%endif
 %endif
     add      grain_lutq, 82
     dec              hw
@@ -2442,17 +2641,32 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 %else
     mov            srcq, r11mp
 %endif
-    lea           lumaq, [luma_bakq+wq*2]
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
     add            srcq, wq
 %if ARCH_X86_32
     mov             r4m, wq
     mov             r9m, lumaq
 %endif
+%if %2 == 0
+    xor       dword r8m, 4
+    ; adjust top_offxyd
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add            r11d, 16
+%endif
+    add          offxyd, 16
+%endif
 
     ; r8m = sbym
     test      dword r8m, 2
+%if %2
     jne %%loop_x_hv_overlap
     jmp %%loop_x_h_overlap
+%else
+    jne %%loop_x_odd_v_overlap
+    jmp %%loop_x_odd
+%endif
 
 %%end:
     RET
@@ -2487,7 +2701,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 
     mov             r3m, seed
     mov              wq, r4m
+%if %3
     shl           r10mp, 1
+%endif
 %else
     xor            seed, sbyd               ; (cur_seed << 16) | top_seed
 
@@ -2499,7 +2715,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 
     mov           lumaq, r9mp
     lea        src_bakq, [srcq+wq]
-    lea       luma_bakq, [lumaq+wq*2]
+    lea       luma_bakq, [lumaq+wq*(1+%2)]
     neg              wq
     sub            r0mp, srcq
 %if ARCH_X86_32
@@ -2510,7 +2726,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     DEFINE_ARGS tmp, src, scaling, see, unused1, picptr, unused2
 %else
     mov           r11mp, src_bakq
-    mov           r10mp, strideq
+    mov           r12mp, strideq        ; spill stride to arg slot 12; slot 10 (lstride) is still read later
 %endif
 
 %%loop_x_v_overlap:
@@ -2549,9 +2765,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     ror           offxd, 12
     and           offyd, 0xf000f
     and           offxd, 0xf000f
-    imul          offyd, 82
+    imul          offyd, 164>>%3
     ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
-    lea           offyq, [offyq+offxq+0x10001*498+16*82]
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
 
 %if ARCH_X86_32
     DEFINE_ARGS tmp, src, scaling, offxy, h, picptr, top_offxy
@@ -2568,12 +2784,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     DEFINE_ARGS luma, src, scaling, offxy, h, picptr, grain_lut
 %endif
 
+%%loop_x_odd_v_overlap:
     mov              hd, r7m
     mov      grain_lutq, grain_lutmp
+%if ARCH_X86_32
+    mov              r5, r5m
+    mova             m1, [base+pb_27_17]
+%else
+    mova             m1, [pb_27_17]
+%endif
 %%loop_y_v_overlap:
 %if ARCH_X86_32
     mov           lumaq, r9mp
 %endif
+%if %2
     mova             m4, [lumaq+ 0]
     mova             m6, [lumaq+16]
     mova             m0, [srcq]
@@ -2591,9 +2815,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -2605,6 +2840,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     ; scaling[luma_src]
@@ -2615,10 +2853,10 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     vpgatherdw       m7, m4, scalingq, r12, r2
     vpgatherdw       m5, m6, scalingq, r12, r2
 %endif
-    pcmpeqw          m1, m1
-    psrlw            m1, 8
-    pand             m7, m1
-    pand             m5, m1
+    pcmpeqw          m4, m4
+    psrlw            m4, 8
+    pand             m7, m4
+    pand             m5, m4
 
     ; grain = grain_lut[offy+y][offx+x]
     movu             m3, [grain_lutq+offxyq]
@@ -2628,17 +2866,22 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 %else
     movu             m4, [grain_lutq+top_offxyq]
 %endif
-    punpckhbw        m1, m4, m3
+    punpckhbw        m6, m4, m3
     punpcklbw        m4, m3
-    pmaddubsw        m2, m9, m1
+%if %3
+    pmaddubsw        m2, m9, m6
     pmaddubsw        m3, m9, m4
+%else
+    pmaddubsw        m2, m1, m6
+    pmaddubsw        m3, m1, m4
+%endif
     pmulhrsw         m2, m8
     pmulhrsw         m3, m8
     packsswb         m3, m2
-    pxor             m1, m1
-    pcmpgtb          m1, m3
-    punpcklbw        m2, m3, m1
-    punpckhbw        m3, m1
+    pxor             m6, m6
+    pcmpgtb          m6, m3
+    punpcklbw        m2, m3, m6
+    punpckhbw        m3, m6
 
     ; noise = round2(scaling[luma_src] * grain, scaling_shift)
     pmullw           m2, m7
@@ -2648,7 +2891,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 
     ; unpack chroma_source
     pxor             m4, m4
-    punpckhbw        m1, m0, m4
+    punpckhbw        m6, m0, m4
     punpcklbw        m0, m4                 ; m0-1: src as word
 
 %if ARCH_X86_32
@@ -2657,12 +2900,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 
     ; dst = clip_pixel(src, noise)
     paddw            m0, m2
-    paddw            m1, m3
+    paddw            m6, m3
     pmaxsw           m0, m13
-    pmaxsw           m1, m13
+    pmaxsw           m6, m13
     pminsw           m0, m12
-    pminsw           m1, m12
-    packuswb         m0, m1
+    pminsw           m6, m12
+    packuswb         m0, m6
     movifnidn      dstq, dstmp
     mova    [dstq+srcq], m0
 
@@ -2672,10 +2915,24 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     add            srcq, r2mp
     ; lumaq has already been incremented above
 %else
-    add            srcq, r10mp
+    add            srcq, r12mp
+%if %3
     lea           lumaq, [lumaq+lstrideq*2]
+%else
+    add           lumaq, lstrideq
+%endif
 %endif
     add      grain_lutq, 82
+%if %3 == 0
+    btc              hd, 16             ; toggle bit 16 of hd; CF = the bit's previous value
+%if ARCH_X86_32
+    mov              r5, r5m            ; reload r5 from r5m (base is defined relative to r5)
+    mova             m1, [base+pb_17_27] ; weights for the second overlap row (first row used pb_27_17)
+%else
+    mova             m1, [pb_17_27]     ; weights for the second overlap row (first row used pb_27_17)
+%endif
+    jnc %%loop_y_v_overlap              ; CF still from btc (mov/mova leave flags alone): bit was clear
+%endif
     jmp %%loop_y
 
 %%end_y_v_overlap:
@@ -2692,25 +2949,40 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 %else
     mov            srcq, r11mp
 %endif
-    lea           lumaq, [luma_bakq+wq*2]
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
     add            srcq, wq
 %if ARCH_X86_32
     mov             r4m, wq
     mov             r9m, lumaq
 %endif
 
+%if %2
     ; since fg_dataq.overlap is guaranteed to be set, we never jump
     ; back to .loop_x_v_overlap, and instead always fall-through to
     ; h+v overlap
+%else
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add      top_offxyd, 16
+%endif
+    add          offxyd, 16
+    btc       dword r8m, 2
+    jnc %%loop_x_odd_v_overlap
+%endif
 
 %%loop_x_hv_overlap:
 %if ARCH_X86_32
     DEFINE_ARGS tmp, src, scaling, offxy, w, picptr, unused
 
     mov              r6, [rsp+8*mmsize+1*gprsize]
+%if %2
     lea              r0, [r3d+16]
     add              r6, 16
     mov [rsp+8*mmsize+0*gprsize], r0        ; left_offxy
+%else
+    mov [rsp+8*mmsize+0*gprsize], r3        ; left_offxy
+%endif
     mov [rsp+8*mmsize+2*gprsize], r6        ; topleft_offxy
 
     DEFINE_ARGS tmp, src, scaling, see, w, picptr, unused
@@ -2721,8 +2993,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     DEFINE_ARGS dst, src, stride, luma, w, scaling, grain_lut, \
                 tmp, offxy, see, left_offxy, top_offxy, topleft_offxy, lstride
 
+%if %2
     lea  topleft_offxyq, [top_offxyq+16]
     lea     left_offxyq, [offxyq+16]
+%else
+    mov  topleft_offxyq, top_offxyq
+    mov     left_offxyq, offxyq
+%endif
 
     ; we assume from the block above that bits 8-15 of tmpd are zero'ed
 %endif
@@ -2756,9 +3033,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     ror           offxd, 12
     and           offyd, 0xf000f
     and           offxd, 0xf000f
-    imul          offyd, 82
+    imul          offyd, 164>>%3
     ; offxy=offy*stride+offx, (cur_offxy << 16) | top_offxy
-    lea           offyq, [offyq+offxq+0x10001*498+16*82]
+    lea           offyq, [offyq+offxq*(2-%2)+0x10001*((3+(6>>%3))*82+3+(6>>%2))+(32>>%3)*82]
 
 %if ARCH_X86_32
     DEFINE_ARGS top_offxy, src, scaling, offxy, h, picptr, grain_lut
@@ -2775,6 +3052,12 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 
     mov              hd, r7m
     mov      grain_lutq, grain_lutmp
+%if ARCH_X86_32
+    mov              r5, r5m
+    mova             m3, [base+pb_27_17]
+%else
+    mova             m3, [pb_27_17]
+%endif
 %%loop_y_hv_overlap:
     ; src
 %if ARCH_X86_32
@@ -2782,6 +3065,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 
     mov           lumaq, r9mp
 %endif
+%if %2
     mova             m4, [lumaq+ 0]
     mova             m6, [lumaq+16]
     mova             m0, [srcq]
@@ -2799,9 +3083,20 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     pmaddubsw        m6, m7
     pavgw            m4, m2
     pavgw            m6, m2
+%else
+    mova             m4, [lumaq]
+    mova             m0, [srcq]
+%if ARCH_X86_32
+    add           lumaq, r10mp
+    mov            r9mp, lumaq
+%endif
+    pxor             m2, m2
+%endif
 
 %if %1
+%if %2
     packuswb         m4, m6                 ; luma
+%endif
     punpckhbw        m6, m4, m0
     punpcklbw        m4, m0                 ; { luma, chroma }
     pmaddubsw        m6, m14
@@ -2813,6 +3108,9 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     packuswb         m4, m6                 ; pack+unpack = clip
     punpckhbw        m6, m4, m2
     punpcklbw        m4, m2
+%elif %2 == 0
+    punpckhbw        m6, m4, m2
+    punpcklbw        m4, m2
 %endif
 
     ; scaling[src]
@@ -2821,8 +3119,13 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     vpgatherdw       m5, m6, scalingq, r0, r5
 %else
     movd             m1, [grain_lutq+topleft_offxyq]
+%if %3
     vpgatherdw       m7, m4, scalingq, r2, r12
     vpgatherdw       m5, m6, scalingq, r2, r12
+%else
+    vpgatherdw       m7, m4, scalingq, r2, r13
+    vpgatherdw       m5, m6, scalingq, r2, r13
+%endif
 %endif
     pcmpeqw          m2, m2
     psrlw            m2, 8
@@ -2836,7 +3139,7 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     movd             m1, [grain_lutq+r0]
     mov              r0, [rsp+8*mmsize+0*gprsize]       ; left_offxy
 %endif
-    movu             m3, [grain_lutq+offxyq]
+    movu             m2, [grain_lutq+offxyq]
 %if ARCH_X86_32
     movu             m6, [grain_lutq+r5]
     movd             m4, [grain_lutq+r0]
@@ -2846,23 +3149,32 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 %endif
     ; do h interpolation first (so top | top/left -> top, left | cur -> cur)
     punpcklbw        m1, m6
-    punpcklbw        m4, m3
+    punpcklbw        m4, m2
+%if %2
     punpcklwd        m4, m1
+%else
+    punpckldq        m4, m1
+%endif
     pmaddubsw        m1, m9, m4
     pmulhrsw         m1, m8
     packsswb         m1, m1
-    pandn            m4, m10, m3
-    pandn            m3, m10, m6
-    psrldq           m6, m1, 1
+    pandn            m4, m10, m2
+    pandn            m2, m10, m6
+    psrldq           m6, m1, 2-%2
     pand             m1, m10
     pand             m6, m10
     por              m4, m1
-    por              m3, m6
+    por              m2, m6
     ; followed by v interpolation (top | cur -> cur)
-    punpckhbw        m1, m3, m4
-    punpcklbw        m3, m4
+    punpckhbw        m1, m2, m4
+    punpcklbw        m2, m4
+%if %3
     pmaddubsw        m4, m9, m1
-    pmaddubsw        m1, m9, m3
+    pmaddubsw        m1, m9, m2
+%else
+    pmaddubsw        m4, m3, m1
+    pmaddubsw        m1, m3, m2
+%endif
     pmulhrsw         m4, m8
     pmulhrsw         m1, m8
     packsswb         m1, m4
@@ -2883,17 +3195,17 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 
     ; unpack chroma source
     pxor             m4, m4
-    punpckhbw        m3, m0, m4
+    punpckhbw        m5, m0, m4
     punpcklbw        m0, m4                 ; m0-1: src as word
 
     ; dst = clip_pixel(src, noise)
     paddw            m0, m2
-    paddw            m3, m1
+    paddw            m5, m1
     pmaxsw           m0, m13
-    pmaxsw           m3, m13
+    pmaxsw           m5, m13
     pminsw           m0, m12
-    pminsw           m3, m12
-    packuswb         m0, m3
+    pminsw           m5, m12
+    packuswb         m0, m5
     movifnidn      dstq, dstmp
     mova    [dstq+srcq], m0
 
@@ -2901,12 +3213,36 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
     add            srcq, r2mp
     ; lumaq has been adjusted above already
 %else
-    add            srcq, r10mp
-    lea           lumaq, [lumaq+lstrideq*2]
+    add            srcq, r12mp
+%if %3
+    lea           lumaq, [lumaq+lstrideq*(1+%2)]
+%else
+    add           lumaq, r10mp
+%endif
 %endif
     add      grain_lutq, 82
     dec              hw
+%if %3
     jg %%loop_y_h_overlap
+%else
+    jle %%end_y_hv_overlap              ; flags from the dec hw above
+%if ARCH_X86_32
+    mov              r5, r5m            ; reload r5 from r5m (base is defined relative to r5)
+    mova             m3, [base+pb_17_27] ; weights for the second overlap row (first row used pb_27_17)
+%else
+    mova             m3, [pb_17_27]     ; weights for the second overlap row (first row used pb_27_17)
+%endif
+    btc              hd, 16             ; toggle bit 16 of hd; CF = the bit's previous value
+    jnc %%loop_y_hv_overlap             ; bit was clear: run the second overlap row
+%if ARCH_X86_64
+    mov        lstrideq, r10mp          ; reload lstride: r13 (lstrideq) served as vpgatherdw scratch
+%endif
+    jmp %%loop_y_h_overlap
+%%end_y_hv_overlap:
+%if ARCH_X86_64
+    mov        lstrideq, r10mp          ; reload lstride: r13 (lstrideq) served as vpgatherdw scratch
+%endif
+%endif
 
 %if ARCH_X86_32
     DEFINE_ARGS luma, src, scaling, offxy, w, picptr, grain_lut
@@ -2921,18 +3257,44 @@ cglobal fguv_32x32xn_i420, 6, 15, 16, dst, src, stride, fg_data, w, scaling, \
 %else
     mov            srcq, r11mp
 %endif
-    lea           lumaq, [luma_bakq+wq*2]
+    lea           lumaq, [luma_bakq+wq*(1+%2)]
     add            srcq, wq
 %if ARCH_X86_32
     mov             r4m, wq
     mov             r9m, lumaq
 %endif
+%if %2
     jmp %%loop_x_hv_overlap
+%else
+%if ARCH_X86_32
+    add dword [rsp+8*mmsize+1*gprsize], 16
+%else
+    add      top_offxyd, 16
+%endif
+    add          offxyd, 16
+    xor       dword r8m, 4
+    jmp %%loop_x_odd_v_overlap
+%endif
 
 %%end_hv:
     RET
 %endmacro
 
-    FGUV_32x32xN_LOOP 1
+    %%FGUV_32x32xN_LOOP 1, %2, %3
 .csfl:
-    FGUV_32x32xN_LOOP 0
+    %%FGUV_32x32xN_LOOP 0, %2, %3
+%endmacro
+
+FGUV_FN 420, 1, 1 ; fguv_32x32xn_i420: ss_hor=1, ss_ver=1
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ; NOTE(review): presumably resets the rNm argument defines before the next cglobal - confirm against x86inc
+%endif
+
+FGUV_FN 422, 1, 0 ; fguv_32x32xn_i422: ss_hor=1, ss_ver=0
+
+%if STACK_ALIGNMENT < mmsize
+DECLARE_ARG 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ; NOTE(review): presumably resets the rNm argument defines before the next cglobal - confirm against x86inc
+%endif
+
+FGUV_FN 444, 0, 0 ; fguv_32x32xn_i444: ss_hor=0, ss_ver=0
diff --git a/ffmpeg/JNI/dav1d/src/x86/ipred.asm b/ffmpeg/JNI/dav1d/src/x86/ipred.asm
index 155f49004..ad05b3b1f 100644
--- a/ffmpeg/JNI/dav1d/src/x86/ipred.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/ipred.asm
@@ -100,6 +100,8 @@ ipred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
               db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4;  0,  0,  0,  0
 pw_64:        times 2 dw 64
 
+cfl_ac_444_w16_pad1_shuffle: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1
+                             times 9 db 7, -1
 cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
                         db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
                         ; w=8, w_pad=1 as well as second half of previous one
@@ -166,6 +168,7 @@ JMP_TABLE ipred_cfl,        avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
 JMP_TABLE ipred_cfl_left,   avx2, h4, h8, h16, h32
 JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
 JMP_TABLE ipred_cfl_ac_422, avx2, w16_pad1, w16_pad2, w16_pad3
+JMP_TABLE ipred_cfl_ac_444, avx2, w32_pad1, w32_pad2, w32_pad3, w4, w8, w16, w32
 JMP_TABLE pal_pred,         avx2, w4, w8, w16, w32, w64
 
 cextern dr_intra_derivative
@@ -1409,7 +1412,6 @@ ALIGN function_align
     mova                xm2, [r3+angleq*8] ; upper ymm half zero in both cases
     pcmpgtb              m1, m2
     pmovmskb            r5d, m1
-    popcnt              r5d, r5d ; sets ZF which can be used by caller
     ret
 .w4_no_upsample:
     %assign stack_offset org_stack_offset
@@ -1420,7 +1422,9 @@ ALIGN function_align
     lea            maxbased, [hq+3]
     call .filter_strength
     mov            maxbased, 7
+    test                r5d, r5d
     jz .w4_main ; filter_strength == 0
+    popcnt              r5d, r5d
     vpbroadcastd         m7, [base+pb_8]
     vbroadcasti128       m2, [tlq-1]
     pminub               m1, m7, [base+z_filter_s]
@@ -1593,7 +1597,9 @@ ALIGN function_align
     test             angled, 0x400
     jnz .w8_no_intra_edge_filter
     call .filter_strength
+    test                r5d, r5d
     jz .w8_main ; filter_strength == 0
+    popcnt              r5d, r5d
     movu                xm2, [tlq]
     pminub              xm1, xm0, [base+z_filter_s+14]
     vinserti128          m2, [tlq-1], 1
@@ -1695,7 +1701,9 @@ ALIGN function_align
     test             angled, 0x400
     jnz .w16_no_intra_edge_filter
     call .filter_strength
+    test                r5d, r5d
     jz .w16_main ; filter_strength == 0
+    popcnt              r5d, r5d
     vpbroadcastd         m1, [base+pb_12]
     vbroadcasti128       m6, [base+z_filter_s+8]
     vinserti128          m2, m6, [base+z_filter_s], 0
@@ -2202,7 +2210,6 @@ ALIGN function_align
     pand                 m0, m8, m7
     pcmpgtb              m0, m9
     pmovmskb            r3d, m0
-    popcnt              r3d, r3d
     ret
 ALIGN function_align
 .upsample_above: ; w4/w8
@@ -2252,7 +2259,9 @@ ALIGN function_align
     lea                 r3d, [hq+3]
     sub              angled, 1112 ; angle - 90
     call .filter_strength
+    test                r3d, r3d
     jz .w4_no_filter_above
+    popcnt              r3d, r3d
     vpbroadcastd        xm2, [base+pb_4]
     pminub              xm2, [base+z_filter_s]
     vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
@@ -2287,9 +2296,10 @@ ALIGN function_align
     pand                xm0, xm8 ; reuse from previous filter_strength call
     pcmpgtb             xm0, xm9
     pmovmskb            r3d, xm0
-    popcnt              r3d, r3d
 .w4_filter_left:
+    test                r3d, r3d
     jz .w4_main
+    popcnt              r3d, r3d
     mov                 r5d, 10
     cmp                  hd, 16
     movu                xm2, [rsp+49]
@@ -2440,7 +2450,9 @@ ALIGN function_align
     lea                 r3d, [hq+7]
     sub              angled, 90 ; angle - 90
     call .filter_strength
+    test                r3d, r3d
     jz .w8_no_filter_above
+    popcnt              r3d, r3d
     vpbroadcastd        xm3, [base+pb_8]
     pminub              xm3, [base+z_filter_s+8]
     vpbroadcastd        xm0, [base+z_filter_k-4+r3*4+12*0]
@@ -2473,9 +2485,10 @@ ALIGN function_align
     pand                 m0, m8
     pcmpgtb              m0, m9
     pmovmskb            r3d, m0
-    popcnt              r3d, r3d
 .w8_filter_left:
+    test                r3d, r3d
     jz .w8_main
+    popcnt              r3d, r3d
     vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
     vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
     vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
@@ -2647,7 +2660,9 @@ ALIGN function_align
     lea                 r3d, [hq+15]
     sub              angled, 90
     call .filter_strength
+    test                r3d, r3d
     jz .w16_no_filter_above
+    popcnt              r3d, r3d
     vbroadcasti128       m6, [tlq+1]
     mova                xm2, [base+z_filter_s]
     vinserti128          m2, [base+z_filter_s+14], 1 ; 00 01 12 23 34 45 56 67   67 78 89 9a ab bc cd de
@@ -2680,8 +2695,9 @@ ALIGN function_align
     pand                 m0, m8
     pcmpgtb              m0, m9
     pmovmskb            r3d, m0
-    popcnt              r3d, r3d
+    test                r3d, r3d
     jz .w16_main
+    popcnt              r3d, r3d
     vpbroadcastd         m7, [base+z_filter_k-4+r3*4+12*0]
     vpbroadcastd         m8, [base+z_filter_k-4+r3*4+12*1]
     vpbroadcastd         m9, [base+z_filter_k-4+r3*4+12*2]
@@ -3083,7 +3099,6 @@ ALIGN function_align
     mova                xm2, [r4+angleq*8]
     pcmpgtb              m1, m2
     pmovmskb            r5d, m1
-    popcnt              r5d, r5d
     ret
 .h4_no_upsample:
     %assign stack_offset org_stack_offset
@@ -3094,7 +3109,9 @@ ALIGN function_align
     lea            maxbased, [wq+3]
     call .filter_strength
     mov            maxbased, 7
+    test                r5d, r5d
     jz .h4_main ; filter_strength == 0
+    popcnt              r5d, r5d
     vpbroadcastd         m7, [base+pb_7]
     vbroadcasti128       m2, [tlq-14]
     pmaxub               m1, m7, [base+z_filter_s-4]
@@ -3285,7 +3302,9 @@ ALIGN function_align
     test             angled, 0x400
     jnz .h8_no_intra_edge_filter
     call .filter_strength
+    test                r5d, r5d
     jz .h8_main ; filter_strength == 0
+    popcnt              r5d, r5d
     vpbroadcastd        xm6, [base+pb_15]
     pcmpeqb             xm1, xm1
     psubusb             xm6, xm0
@@ -3441,7 +3460,9 @@ ALIGN function_align
     test             angled, 0x400
     jnz .h16_no_intra_edge_filter
     call .filter_strength
+    test                r5d, r5d
     jz .h16_main ; filter_strength == 0
+    popcnt              r5d, r5d
     vpbroadcastd        m11, [base+pb_27]
     vpbroadcastd         m1, [base+pb_1]
     vbroadcasti128       m6, [base+z_filter_s+12]
@@ -5054,6 +5075,236 @@ cglobal ipred_cfl_ac_422, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak
     jg .sub_loop
     RET
 
+cglobal ipred_cfl_ac_444, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_bak ; ac[] = y << 3 (with w/h padding), minus (sum + sz/2) >> tzcnt(sz)
+    movifnidn         hpadd, hpadm
+    movifnidn            wd, wm
+    mov                  hd, hm
+    mov                 szd, wd
+    imul                szd, hd ; sz = w * h, the number of words .sub_loop walks
+    shl               hpadd, 2 ; hpad *= 4; the hpad loops below count it down in rows
+    sub                  hd, hpadd ; h = rows that are actually read from y
+    pxor                 m4, m4 ; m4 = running sum
+    vpbroadcastd         m5, [pw_1] ; pmaddwd multiplier; presumably words of 1 (pw_1 is defined outside this hunk)
+    tzcnt               r8d, wd ; trailing-zero count of w picks the table entry
+    lea                  r5, [ipred_cfl_ac_444_avx2_table]
+    movsxd               r8, [r5+r8*4+12] ; NOTE(review): +12 depends on how JMP_TABLE biases the table symbol and on the three w32_pad* entries listed before w4 - confirm against the macro
+    add                  r5, r8 ; r5 = table base + signed offset = jump target
+
+    DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h, sz, ac_bak ; 6th name changes from w to stride3
+    mov             ac_bakq, acq ; keep the buffer start for .sub_loop
+    jmp                  r5
+
+.w4:
+    lea            stride3q, [strideq*3]
+    pxor                xm2, xm2 ; zero register for the byte -> word unpack
+.w4_loop:
+    movd                xm1, [yq] ; row 0
+    movd                xm0, [yq+strideq*2] ; row 2
+    pinsrd              xm1, [yq+strideq], 1 ; row 1
+    pinsrd              xm0, [yq+stride3q], 1 ; row 3
+    punpcklbw           xm1, xm2 ; rows 0,1 as 8 words
+    punpcklbw           xm0, xm2 ; rows 2,3 as 8 words
+    psllw               xm1, 3
+    psllw               xm0, 3
+    mova              [acq], xm1
+    mova           [acq+16], xm0
+    paddw               xm1, xm0
+    paddw               xm4, xm1 ; word-sized partial sums, widened later in .calc_avg_mul
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 32
+    sub                  hd, 4 ; four rows per iteration
+    jg .w4_loop
+    test              hpadd, hpadd
+    jz .calc_avg_mul
+    pshufd              xm0, xm0, q3232 ; copy the high qword (row 3, the last row read) into both halves
+    paddw               xm1, xm0, xm0 ; sum contribution of the two stores below
+.w4_hpad_loop:
+    mova              [acq], xm0
+    mova           [acq+16], xm0
+    paddw               xm4, xm1
+    add                 acq, 32
+    sub               hpadd, 4 ; four padded rows per iteration
+    jg .w4_hpad_loop
+    jmp .calc_avg_mul
+
+.w8:
+    lea            stride3q, [strideq*3]
+    pxor                 m2, m2 ; zero register for the byte -> word unpack
+.w8_loop:
+    movq                xm1, [yq] ; row 0 -> low lane
+    movq                xm0, [yq+strideq*2] ; row 2 -> low lane
+    vinserti128          m1, [yq+strideq], 1 ; row 1 -> high lane (16-byte load; punpcklbw keeps only the low 8)
+    vinserti128          m0, [yq+stride3q], 1 ; row 3 -> high lane
+    punpcklbw            m1, m2 ; per lane: low 8 bytes -> 8 words
+    punpcklbw            m0, m2
+    psllw                m1, 3
+    psllw                m0, 3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m1, m0
+    paddw                m4, m1 ; word-sized partial sums, widened later in .calc_avg_mul
+    lea                  yq, [yq+strideq*4]
+    add                 acq, 64
+    sub                  hd, 4 ; four rows per iteration
+    jg .w8_loop
+    test              hpadd, hpadd
+    jz .calc_avg_mul
+    vpermq               m0, m0, q3232 ; copy the high lane (row 3, the last row read) into both lanes
+    paddw                m1, m0, m0 ; sum contribution of the two stores below
+.w8_hpad_loop:
+    mova              [acq], m0
+    mova           [acq+32], m0
+    paddw                m4, m1
+    add                 acq, 64
+    sub               hpadd, 4 ; four padded rows per iteration
+    jg .w8_hpad_loop
+    jmp .calc_avg_mul
+
+.w16:
+    test              wpadd, wpadd
+    jnz .w16_wpad
+.w16_loop:
+    pmovzxbw             m1, [yq] ; 16 bytes -> 16 words, row 0
+    pmovzxbw             m0, [yq+strideq] ; row 1
+    psllw                m1, 3
+    psllw                m0, 3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m1, m0
+    pmaddwd              m1, m5 ; multiply-add adjacent word pairs with m5 into dwords
+    paddd                m4, m1 ; dword accumulation, so this path skips .calc_avg_mul
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2 ; two rows per iteration
+    jg .w16_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+    jmp .w16_hpad
+.w16_wpad:
+    mova                 m3, [cfl_ac_444_w16_pad1_shuffle]
+.w16_wpad_loop:
+    vpbroadcastq         m1, [yq] ; only 8 source bytes are read per row; copied to every qword
+    vpbroadcastq         m0, [yq+strideq]
+    pshufb               m1, m3 ; low lane: bytes 0..7 as words; high lane: byte 7 as a word x8
+    pshufb               m0, m3
+    psllw                m1, 3
+    psllw                m0, 3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m1, m0
+    pmaddwd              m1, m5
+    paddd                m4, m1
+    lea                  yq, [yq+strideq*2]
+    add                 acq, 64
+    sub                  hd, 2 ; two rows per iteration
+    jg .w16_wpad_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+.w16_hpad:
+    paddw                m1, m0, m0 ; m0 = last row written; m1 = twice that row
+    pmaddwd              m1, m5 ; as dword pair sums, added once per pass of the loop below
+.w16_hpad_loop:
+    mova              [acq], m0
+    mova           [acq+32], m0
+    paddd                m4, m1
+    add                 acq, 64
+    sub               hpadd, 2 ; two padded rows per iteration
+    jg .w16_hpad_loop
+    jmp .calc_avg
+
+.w32:
+    test              wpadd, wpadd
+    jnz .w32_wpad
+.w32_loop:
+    pmovzxbw             m1, [yq] ; bytes 0..15 of the row as words
+    pmovzxbw             m0, [yq+16] ; bytes 16..31 of the row as words
+    psllw                m1, 3
+    psllw                m0, 3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m2, m1, m0
+    pmaddwd              m2, m5
+    paddd                m4, m2
+    add                  yq, strideq
+    add                 acq, 64
+    dec                  hd ; one row per iteration
+    jg .w32_loop
+    test              hpadd, hpadd
+    jz .calc_avg
+    jmp .w32_hpad_loop ; m1/m0 still hold the last row and m2 its dword sums
+.w32_wpad:
+    DEFINE_ARGS ac, y, stride, wpad, hpad, iptr, h, sz, ac_bak ; 6th name changes from stride3 to iptr
+    lea               iptrq, [ipred_cfl_ac_444_avx2_table]
+    add               wpadd, wpadd ; wpad *= 2, used as a byte offset into the table
+    mova                 m3, [iptrq+cfl_ac_444_w16_pad1_shuffle-ipred_cfl_ac_444_avx2_table] ; shuffle constant addressed relative to the table base
+    movsxd            wpadq, [iptrq+wpadq+4] ; NOTE(review): which wpad value maps to which w32_pad* entry depends on JMP_TABLE's base bias - confirm
+    add               iptrq, wpadq ; iptr = address of the chosen .w32_pad* label, reused as the per-row loop target
+    jmp iptrq
+.w32_pad3:
+    vpbroadcastq         m1, [yq] ; 8 source bytes read
+    pshufb               m1, m3 ; low lane: bytes 0..7 as words; high lane: byte 7 x8
+    vpermq               m0, m1, q3232 ; second half of the row = byte 7 x16
+    jmp .w32_wpad_end
+.w32_pad2:
+    pmovzxbw             m1, [yq] ; 16 source bytes read
+    pshufhw              m0, m1, q3333 ; per lane: top four words all become that lane's word 7
+    vpermq               m0, m0, q3333 ; broadcast the top qword (byte 15 as words) to the whole register
+    jmp .w32_wpad_end
+.w32_pad1:
+    pmovzxbw             m1, [yq] ; bytes 0..15 as words
+    vpbroadcastq         m0, [yq+16] ; 8 more source bytes (16..23)
+    pshufb               m0, m3 ; bytes 16..23 as words, then byte 23 x8
+    ; fall-through
+.w32_wpad_end:
+    psllw                m1, 3
+    psllw                m0, 3
+    mova              [acq], m1
+    mova           [acq+32], m0
+    paddw                m2, m1, m0
+    pmaddwd              m2, m5
+    paddd                m4, m2
+    add                  yq, strideq
+    add                 acq, 64
+    dec                  hd ; one row per iteration
+    jz .w32_wpad_done
+    jmp iptrq ; next row: back to the selected .w32_pad* variant
+.w32_wpad_done:
+    test              hpadd, hpadd
+    jz .calc_avg
+.w32_hpad_loop:
+    mova              [acq], m1 ; repeat the last row written
+    mova           [acq+32], m0
+    paddd                m4, m2 ; m2 = that row's dword pair sums
+    add                 acq, 64
+    dec               hpadd ; one padded row per iteration
+    jg .w32_hpad_loop
+    jmp .calc_avg
+
+.calc_avg_mul:
+    pmaddwd              m4, m5 ; w4/w8 paths summed in words: combine adjacent pairs into dwords first
+.calc_avg:
+    vextracti128        xm1, m4, 1 ; high lane of the dword sums
+    tzcnt               r1d, szd ; shift count; equals log2(sz) when sz is a power of two (r1 was y, no longer read)
+    paddd               xm0, xm4, xm1 ; low lane + high lane
+    movd                xm2, r1d
+    movd                xm3, szd
+    punpckhqdq          xm1, xm0, xm0 ; high qword
+    paddd               xm0, xm1 ; dwords 0,1 now hold the two remaining partial sums
+    psrad               xm3, 1 ; sz >> 1, the rounding term
+    psrlq               xm1, xm0, 32 ; move dword 1 down to dword 0
+    paddd               xm0, xm3
+    paddd               xm0, xm1 ; dword 0 = total sum + (sz >> 1)
+    psrad               xm0, xm2 ; >> tzcnt(sz)
+    vpbroadcastw         m0, xm0 ; low word of the result in every word of m0
+.sub_loop:
+    mova                 m1, [ac_bakq]
+    psubw                m1, m0 ; subtract the broadcast value from 16 stored words
+    mova          [ac_bakq], m1
+    add             ac_bakq, 32
+    sub                 szd, 16 ; 16 words handled per iteration
+    jg .sub_loop
+    RET
+
 cglobal pal_pred, 4, 6, 5, dst, stride, pal, idx, w, h
     vbroadcasti128       m4, [palq]
     lea                  r2, [pal_pred_avx2_table]
diff --git a/ffmpeg/JNI/dav1d/src/x86/ipred_init_tmpl.c b/ffmpeg/JNI/dav1d/src/x86/ipred_init_tmpl.c
index 28f6f324d..4219ab8b1 100644
--- a/ffmpeg/JNI/dav1d/src/x86/ipred_init_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/x86/ipred_init_tmpl.c
@@ -50,6 +50,7 @@ decl_cfl_pred_fn(dav1d_ipred_cfl_left_avx2);
 
 decl_cfl_ac_fn(dav1d_ipred_cfl_ac_420_avx2);
 decl_cfl_ac_fn(dav1d_ipred_cfl_ac_422_avx2);
+decl_cfl_ac_fn(dav1d_ipred_cfl_ac_444_avx2);
 
 decl_pal_pred_fn(dav1d_pal_pred_avx2);
 
@@ -131,6 +132,7 @@ COLD void bitfn(dav1d_intra_pred_dsp_init_x86)(Dav1dIntraPredDSPContext *const c
 
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = dav1d_ipred_cfl_ac_420_avx2;
     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = dav1d_ipred_cfl_ac_422_avx2;
+    c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = dav1d_ipred_cfl_ac_444_avx2;
 
     c->pal_pred = dav1d_pal_pred_avx2;
 #endif
diff --git a/ffmpeg/JNI/dav1d/src/x86/itx.asm b/ffmpeg/JNI/dav1d/src/x86/itx.asm
index e964070b1..f27b90032 100644
--- a/ffmpeg/JNI/dav1d/src/x86/itx.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/itx.asm
@@ -27,15 +27,10 @@
 
 %if ARCH_X86_64
 
-SECTION_RODATA 32
+SECTION_RODATA 16
 
 ; Note: The order of (at least some of) those constants matter!
 
-iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
-iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
-iadst4_dconly1a: dw 10568, 19856, 26752, 30424
-iadst4_dconly1b: dw 30424, 26752, 19856, 10568
-
 deint_shuf: db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
 
 %macro COEF_PAIR 2
@@ -132,7 +127,7 @@ SECTION .text
 ; mandatory 4-byte offsets everywhere, we can set up a base pointer with a
 ; single rip-relative lea and then address things relative from that with
 ; 1-byte offsets as long as data is within +-128 bytes of the base pointer.
-%define o_base iadst4_dconly2a + 128
+%define o_base deint_shuf + 128
 %define o(x) (rax - (o_base) + (x))
 
 %macro REPX 2-*
@@ -180,16 +175,16 @@ SECTION .text
     vpbroadcastd        m%3, [o(pw_%8_%9)]
     vpbroadcastd        m%4, [o(pw_m%9_%8)]
     vpbroadcastd       xm%2, [o(pw_%6_%7)]
-    vpblendd            m%2, m%2, m%3, 0xf0
+    vpblendd            m%2, m%3, 0xf0
     vpbroadcastd       xm%3, [o(pw_m%7_%6)]
 %else
     vpbroadcastd        m%3, [o(pw_m%9_%8)]
     vpbroadcastd        m%4, [o(pw_%8_%9)]
     vpbroadcastd       xm%2, [o(pw_m%7_%6)]
-    vpblendd            m%2, m%2, m%3, 0xf0
+    vpblendd            m%2, m%3, 0xf0
     vpbroadcastd       xm%3, [o(pw_%6_%7)]
 %endif
-    vpblendd            m%3, m%3, m%4, 0xf0
+    vpblendd            m%3, m%4, 0xf0
     ITX_MUL2X_PACK       %1, %4, _, %5, %2, %3, (4|%10)
 %endmacro
 
@@ -360,21 +355,17 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, c
     punpckhdq            m1, m0, m3
     punpckldq            m0, m3
     IWHT4_1D_PACKED
-    vpblendd             m0, m0, m2, 0x03
+    vpblendd             m0, m2, 0x03
     ITX4_END              3, 0, 2, 1, 0
 
-%macro INV_TXFM_FN 4 ; type1, type2, fast_thresh, size
-cglobal inv_txfm_add_%1_%2_%4, 4, 5, 0, dst, stride, c, eob, tx2
-    %undef cmp
-    %define %%p1 m(i%1_%4_internal)
+%macro INV_TXFM_FN 3 ; type1, type2, size
+cglobal inv_txfm_add_%1_%2_%3, 4, 5, 0, dst, stride, c, eob, tx2
+    %define %%p1 m(i%1_%3_internal)
     lea                 rax, [o_base]
     ; Jump to the 1st txfm function if we're not taking the fast path, which
     ; in turn performs an indirect jump to the 2nd txfm function.
-    lea tx2q, [m(i%2_%4_internal).pass2]
-%if %3 > 0
-    cmp                eobd, %3
-    jg %%p1
-%elif %3 == 0
+    lea                tx2q, [m(i%2_%3_internal).pass2]
+%ifidn %1_%2, dct_dct
     test               eobd, eobd
     jnz %%p1
 %else
@@ -385,54 +376,16 @@ ALIGN function_align
 %endif
 %endmacro
 
-%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x4
-%ifidn %1_%2, dct_identity
-    vpbroadcastd         m0, [o(pw_2896x8)]
-    pmulhrsw             m0, [cq]
-    vpbroadcastd         m1, [o(pw_1697x8)]
-    pmulhrsw             m1, m0
-    paddsw               m0, m1
-    punpcklwd            m0, m0
-    punpckhdq            m1, m0, m0
-    punpckldq            m0, m0
-    jmp m(iadst_4x4_internal).end
-%elifidn %1_%2, identity_dct
-    mova                 m0, [cq+16*0]
-    packusdw             m0, [cq+16*1]
-    vpbroadcastd         m1, [o(pw_1697x8)]
-    vpbroadcastd         m2, [o(pw_2896x8)]
-    packusdw             m0, m0
-    pmulhrsw             m1, m0
-    paddsw               m0, m1
-    pmulhrsw             m0, m2
-    mova                 m1, m0
-    jmp m(iadst_4x4_internal).end
-%elif %3 >= 0
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x4
+%ifidn %1_%2, dct_dct
     vpbroadcastw         m0, [cq]
-%ifidn %1, dct
     vpbroadcastd         m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1
-%elifidn %1, adst
-    movddup              m1, [o(iadst4_dconly1a)]
-    pmulhrsw             m0, m1
-%elifidn %1, flipadst
-    movddup              m1, [o(iadst4_dconly1b)]
-    pmulhrsw             m0, m1
-%endif
     mov                [cq], eobd ; 0
-%ifidn %2, dct
-%ifnidn %1, dct
-    vpbroadcastd         m1, [o(pw_2896x8)]
-%endif
     pmulhrsw             m0, m1
     mova                 m1, m0
     jmp m(iadst_4x4_internal).end2
-%else ; adst / flipadst
-    pmulhrsw             m1, m0, [o(iadst4_dconly2b)]
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    jmp m(i%2_4x4_internal).end2
-%endif
 %endif
 %endmacro
 
@@ -477,10 +430,10 @@ ALIGN function_align
     packssdw             m1, m2 ; out2 out3
 %endmacro
 
-INV_TXFM_4X4_FN dct, dct,      0
-INV_TXFM_4X4_FN dct, adst,     0
-INV_TXFM_4X4_FN dct, flipadst, 0
-INV_TXFM_4X4_FN dct, identity, 3
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
 
 cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     mova                 m0, [cq+16*0]
@@ -488,7 +441,7 @@ cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     IDCT4_1D_PACKED
     mova                 m2, [o(deint_shuf)]
     shufps               m3, m0, m1, q1331
-    shufps               m0, m0, m1, q0220
+    shufps               m0, m1, q0220
     pshufb               m0, m2
     pshufb               m1, m3, m2
     jmp                tx2q
@@ -499,9 +452,9 @@ cglobal idct_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     mova          [cq+16*1], m2
     ITX4_END              0, 1, 3, 2
 
-INV_TXFM_4X4_FN adst, dct,      0
-INV_TXFM_4X4_FN adst, adst,     0
-INV_TXFM_4X4_FN adst, flipadst, 0
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
 INV_TXFM_4X4_FN adst, identity
 
 cglobal iadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
@@ -526,9 +479,9 @@ ALIGN function_align
     IADST4_1D_PACKED
     ret
 
-INV_TXFM_4X4_FN flipadst, dct,      0
-INV_TXFM_4X4_FN flipadst, adst,     0
-INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
 INV_TXFM_4X4_FN flipadst, identity
 
 cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
@@ -549,7 +502,7 @@ cglobal iflipadst_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
 .end2:
     ITX4_END              3, 2, 1, 0
 
-INV_TXFM_4X4_FN identity, dct,      3
+INV_TXFM_4X4_FN identity, dct
 INV_TXFM_4X4_FN identity, adst
 INV_TXFM_4X4_FN identity, flipadst
 INV_TXFM_4X4_FN identity, identity
@@ -600,38 +553,9 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     pextrd [r2  +r3       ], xm5, 3
 %endmacro
 
-%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    vpbroadcastd        xm0, [o(pw_2896x8)]
-    pmulhrsw            xm1, xm0, [cq]
-    vpbroadcastd        xm2, [o(pw_4096)]
-    pmulhrsw            xm1, xm0
-    pmulhrsw            xm1, xm2
-    vpermq               m1, m1, q1100
-    punpcklwd            m1, m1
-    punpckldq            m0, m1, m1
-    punpckhdq            m1, m1
-    jmp m(iadst_4x8_internal).end3
-%elifidn %1_%2, identity_dct
-    movd                xm0, [cq+16*0]
-    punpcklwd           xm0, [cq+16*1]
-    movd                xm1, [cq+16*2]
-    punpcklwd           xm1, [cq+16*3]
-    vpbroadcastd        xm2, [o(pw_2896x8)]
-    vpbroadcastd        xm3, [o(pw_1697x8)]
-    vpbroadcastd        xm4, [o(pw_2048)]
-    punpckldq           xm0, xm1
-    pmulhrsw            xm0, xm2
-    pmulhrsw            xm3, xm0
-    paddsw              xm0, xm3
-    pmulhrsw            xm0, xm2
-    pmulhrsw            xm0, xm4
-    vpbroadcastq         m0, xm0
-    mova                 m1, m0
-    jmp m(iadst_4x8_internal).end3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x8
+%ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
     movd                xm2, [o(pw_2048)]
@@ -641,24 +565,7 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     pmulhrsw            xm0, xm2
     vpbroadcastw         m0, xm0
     mova                 m1, m0
-    jmp m(iadst_4x8_internal).end4
-%else ; adst_dct / flipadst_dct
-    vpbroadcastw        xm0, [cq]
-    vpbroadcastd        xm1, [o(pw_2896x8)]
-    pmulhrsw            xm0, xm1
-    pmulhrsw            xm0, [o(iadst4_dconly1a)]
-    vpbroadcastd        xm2, [o(pw_2048)]
-    mov                [cq], eobd
-    pmulhrsw            xm0, xm1
-    pmulhrsw            xm0, xm2
-%ifidn %1, adst
-    vpbroadcastq         m0, xm0
-%else ; flipadst
-    vpermq               m0, m0, q1111
-%endif
-    mova                 m1, m0
-    jmp m(iadst_4x8_internal).end4
-%endif
+    jmp m(iadst_4x8_internal).end3
 %endif
 %endmacro
 
@@ -760,9 +667,9 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
     paddsw               m4, m5            ; out6 -out1
     vpbroadcastd         m5, [o(pw_2896x8)]
     vpblendd             m3, m0, m4, 0x33  ; out6 -out7
-    vpblendd             m0, m0, m4, 0xcc  ; out0 -out1
+    vpblendd             m0, m4, 0xcc      ; out0 -out1
     shufps               m4, m2, m1, q1032 ; t3 t7
-    vpblendd             m1, m2, m1, 0xcc  ; t2 t6
+    vpblendd             m1, m2, 0x33      ; t2 t6
     psubsw               m2, m1, m4        ; t2-t3 t6-t7
     paddsw               m1, m4            ; t2+t3 t6+t7
     pmulhrsw             m2, m5            ; out4 -out5
@@ -772,10 +679,10 @@ cglobal iidentity_4x4_internal, 0, 5, 6, dst, stride, c, eob, tx2
 %endmacro
 
 INIT_YMM avx2
-INV_TXFM_4X8_FN dct, dct,      0
-INV_TXFM_4X8_FN dct, identity, 7
+INV_TXFM_4X8_FN dct, dct
 INV_TXFM_4X8_FN dct, adst
 INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
 
 cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q3120
@@ -786,7 +693,7 @@ cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     IDCT4_1D_PACKED
     vbroadcasti128       m2, [o(deint_shuf)]
     shufps               m3, m0, m1, q1331
-    shufps               m0, m0, m1, q0220
+    shufps               m0, m1, q0220
     pshufb               m0, m2
     pshufb               m1, m3, m2
     jmp                tx2q
@@ -795,8 +702,8 @@ cglobal idct_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vextracti128        xm3, m1, 1
     call .main
     vpbroadcastd         m4, [o(pw_2048)]
-    vinserti128          m0, m0, xm2, 1
-    vinserti128          m1, m1, xm3, 1
+    vinserti128          m0, xm2, 1
+    vinserti128          m1, xm3, 1
     pshufd               m1, m1, q1032
     jmp m(iadst_4x8_internal).end2
 ALIGN function_align
@@ -804,7 +711,7 @@ ALIGN function_align
     WRAP_XMM IDCT8_1D_PACKED
     ret
 
-INV_TXFM_4X8_FN adst, dct,      0
+INV_TXFM_4X8_FN adst, dct
 INV_TXFM_4X8_FN adst, adst
 INV_TXFM_4X8_FN adst, flipadst
 INV_TXFM_4X8_FN adst, identity
@@ -828,21 +735,20 @@ cglobal iadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     pshufd              xm5, xm1, q1032
     call .main_pass2
     vpbroadcastd         m4, [o(pw_2048)]
-    vinserti128          m0, m0, xm2, 1
-    vinserti128          m1, m1, xm3, 1
+    vinserti128          m0, xm2, 1
+    vinserti128          m1, xm3, 1
     pxor                 m5, m5
     psubw                m5, m4
 .end:
-    vpblendd             m4, m4, m5, 0xcc
+    vpblendd             m4, m5, 0xcc
 .end2:
     pmulhrsw             m0, m4
     pmulhrsw             m1, m4
     WIN64_RESTORE_XMM
-.end3:
     pxor                 m2, m2
     mova          [cq+32*0], m2
     mova          [cq+32*1], m2
-.end4:
+.end3:
     lea                  r2, [dstq+strideq*4]
     lea                  r3, [strideq*3]
     WRITE_4X8             0, 1
@@ -856,7 +762,7 @@ ALIGN function_align
     WRAP_XMM IADST8_1D_PACKED 2
     ret
 
-INV_TXFM_4X8_FN flipadst, dct,      0
+INV_TXFM_4X8_FN flipadst, dct
 INV_TXFM_4X8_FN flipadst, adst
 INV_TXFM_4X8_FN flipadst, flipadst
 INV_TXFM_4X8_FN flipadst, identity
@@ -880,15 +786,15 @@ cglobal iflipadst_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     pshufd              xm5, xm1, q1032
     call m(iadst_4x8_internal).main_pass2
     vpbroadcastd         m5, [o(pw_2048)]
-    vinserti128          m3, m3, xm1, 1
-    vinserti128          m2, m2, xm0, 1
+    vinserti128          m3, xm1, 1
+    vinserti128          m2, xm0, 1
     pxor                 m4, m4
     psubw                m4, m5
     pshufd               m0, m3, q1032
     pshufd               m1, m2, q1032
     jmp m(iadst_4x8_internal).end
 
-INV_TXFM_4X8_FN identity, dct,      3
+INV_TXFM_4X8_FN identity, dct
 INV_TXFM_4X8_FN identity, adst
 INV_TXFM_4X8_FN identity, flipadst
 INV_TXFM_4X8_FN identity, identity
@@ -913,49 +819,9 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpbroadcastd         m4, [o(pw_4096)]
     jmp m(iadst_4x8_internal).end2
 
-%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x16
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    vpbroadcastd         m0, [o(pw_2896x8)]
-    pmulhrsw             m0, [cq]
-    vpbroadcastd         m1, [o(pw_16384)]
-    vpbroadcastd         m2, [o(pw_1697x16)]
-    vpbroadcastd         m3, [o(pw_2048)]
-    pmulhrsw             m0, m1
-    pmulhrsw             m2, m0
-    paddsw               m0, m0
-    paddsw               m0, m2
-    pmulhrsw             m3, m0
-    punpcklwd            m1, m3, m3
-    punpckhwd            m3, m3
-    punpckldq            m0, m1, m1
-    punpckhdq            m1, m1
-    punpckldq            m2, m3, m3
-    punpckhdq            m3, m3
-    jmp m(iadst_4x16_internal).end3
-%elifidn %1_%2, identity_dct
-    movd                xm0, [cq+32*0]
-    punpcklwd           xm0, [cq+32*1]
-    movd                xm1, [cq+32*2]
-    punpcklwd           xm1, [cq+32*3]
-    vpbroadcastd        xm2, [o(pw_1697x8)]
-    vpbroadcastd        xm3, [o(pw_2896x8)]
-    vpbroadcastd        xm4, [o(pw_2048)]
-    punpckldq           xm0, xm1
-    pcmpeqw             xm1, xm1
-    pmulhrsw            xm2, xm0
-    pcmpeqw             xm1, xm0
-    pxor                xm0, xm1
-    pavgw               xm0, xm2
-    pmulhrsw            xm0, xm3
-    pmulhrsw            xm0, xm4
-    vpbroadcastq         m0, xm0
-    mova                 m1, m0
-    mova                 m2, m0
-    mova                 m3, m0
-    jmp m(iadst_4x16_internal).end3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x16
+%ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
     movd                xm2, [o(pw_16384)]
@@ -968,27 +834,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     mova                 m1, m0
     mova                 m2, m0
     mova                 m3, m0
-    jmp m(iadst_4x16_internal).end4
-%else ; adst_dct / flipadst_dct
-    vpbroadcastw        xm0, [cq]
-    pmulhrsw            xm0, [o(iadst4_dconly1a)]
-    vpbroadcastd        xm1, [o(pw_16384)]
-    vpbroadcastd        xm2, [o(pw_2896x8)]
-    mov                [cq], eobd
-    pmulhrsw            xm0, xm1
-    psrlw               xm1, 3 ; pw_2048
-    pmulhrsw            xm0, xm2
-    pmulhrsw            xm0, xm1
-%ifidn %1, adst
-    vpbroadcastq         m0, xm0
-%else ; flipadst
-    vpermq               m0, m0, q1111
-%endif
-    mova                 m1, m0
-    mova                 m2, m0
-    mova                 m3, m0
-    jmp m(iadst_4x16_internal).end4
-%endif
+    jmp m(iadst_4x16_internal).end3
 %endif
 %endmacro
 
@@ -1038,7 +884,7 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpbroadcastd         m5, [o(pw_2896_2896)]
     ITX_MUL2X_PACK        1, 0, _, 10, 0, 5, 4    ; t6   t5
     vpbroadcastd         m0, [o(pw_m2896_2896)]
-    ITX_MUL2X_PACK        2, 0, _, 10, 0, 5, 4,   ; t13a t10a
+    ITX_MUL2X_PACK        2, 0, _, 10, 0, 5, 4    ; t13a t10a
     punpckhqdq           m0, m8, m3        ; t15a t14
     punpcklqdq           m8, m3            ; t8a  t9
     shufps               m5, m4, m2, q1032 ; t12  t13a
@@ -1061,10 +907,10 @@ cglobal iidentity_4x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     paddsw               m3, m8     ; out7  out6
 %endmacro
 
-INV_TXFM_4X16_FN dct, dct,      0
-INV_TXFM_4X16_FN dct, identity, 15
+INV_TXFM_4X16_FN dct, dct
 INV_TXFM_4X16_FN dct, adst
 INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
 
 cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     mova                 m0, [cq+32*0]
@@ -1089,11 +935,11 @@ cglobal idct_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     vextracti128        xm6, m2, 1
     vextracti128        xm7, m3, 1
     call .main
-    vinserti128          m0, m0, xm4, 1
-    vinserti128          m1, m1, xm5, 1
+    vinserti128          m0, xm4, 1
+    vinserti128          m1, xm5, 1
     vpbroadcastd         m5, [o(pw_2048)]
-    vinserti128          m2, m2, xm6, 1
-    vinserti128          m3, m3, xm7, 1
+    vinserti128          m2, xm6, 1
+    vinserti128          m3, xm7, 1
     pshufd               m1, m1, q1032
     pshufd               m3, m3, q1032
     jmp m(iadst_4x16_internal).end2
@@ -1102,7 +948,7 @@ ALIGN function_align
     WRAP_XMM IDCT16_1D_PACKED
     ret
 
-INV_TXFM_4X16_FN adst, dct,      0
+INV_TXFM_4X16_FN adst, dct
 INV_TXFM_4X16_FN adst, adst
 INV_TXFM_4X16_FN adst, flipadst
 INV_TXFM_4X16_FN adst, identity
@@ -1134,26 +980,25 @@ cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     vpbroadcastd         m5, [o(pw_2048)]
     pshufd               m1, m1, q1032
     vpblendd             m4, m1, m0, 0x33
-    vpblendd             m0, m0, m2, 0x33
-    vpblendd             m2, m2, m3, 0x33
-    vpblendd             m3, m3, m1, 0x33
+    vpblendd             m0, m2, 0x33
+    vpblendd             m2, m3, 0x33
+    vpblendd             m3, m1, 0x33
     vpermq               m0, m0, q2031
     vpermq               m1, m2, q1302
     vpermq               m2, m3, q3120
     vpermq               m3, m4, q0213
     psubw                m6, m7, m5
 .end:
-    vpblendd             m5, m5, m6, 0xcc
+    vpblendd             m5, m6, 0xcc
 .end2:
     REPX   {pmulhrsw x, m5}, m0, m1, m2, m3
     WIN64_RESTORE_XMM
-.end3:
     pxor                 m4, m4
     mova          [cq+32*0], m4
     mova          [cq+32*1], m4
     mova          [cq+32*2], m4
     mova          [cq+32*3], m4
-.end4:
+.end3:
     lea                  r2, [dstq+strideq*8]
     lea                  r3, [strideq*3]
     WRITE_4X8             0, 1
@@ -1164,9 +1009,9 @@ cglobal iadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
 ALIGN function_align
 .main:
     vpblendd             m4, m1, m0, 0xcc
-    vpblendd             m1, m1, m0, 0x33
+    vpblendd             m1, m0, 0x33
     vpblendd             m5, m2, m3, 0xcc
-    vpblendd             m2, m2, m3, 0x33
+    vpblendd             m2, m3, 0x33
     vperm2i128           m3, m5, m2, 0x31
     vinserti128          m0, m1, xm4, 1 ; in0  in3  in2  in1
     vperm2i128           m4, m1, m4, 0x31
@@ -1198,23 +1043,23 @@ ALIGN function_align
     psubsw               m1, m2, m3 ; t13a t12a t15a t14a
     paddsw               m2, m3     ; t9a  t8a  t11a t10a
     psubw                m3, m7, m6 ; pw_3784_m1567
-    vpblendd             m6, m6, m3, 0xf0
+    vpblendd             m6, m3, 0xf0
     ITX_MUL2X_PACK        4, 3, _, 8, 6, 5, 4 ; t4a t5a t7a t6a
     ITX_MUL2X_PACK        1, 3, _, 8, 6, 5, 4 ; t12 t13 t15 t14
     vbroadcasti128       m5, [o(deint_shuf)]
     pshufb               m0, m5
     pshufb               m2, m5
     vperm2i128           m3, m0, m2, 0x31  ; t3   t2   t11a t10a
-    vinserti128          m0, m0, xm2, 1    ; t1   t0   t9a  t8a
+    vinserti128          m0, xm2, 1        ; t1   t0   t9a  t8a
     vperm2i128           m2, m4, m1, 0x31  ; t7a  t6a  t15  t14
-    vinserti128          m4, m4, xm1, 1    ; t4a  t5a  t12  t13
+    vinserti128          m4, xm1, 1        ; t4a  t5a  t12  t13
     pshufd               m2, m2, q1032     ; t6a  t7a  t14  t15
     psubsw               m1, m0, m3        ; t3a t2a t11 t10
     paddsw               m0, m3     ; -out15  out0   out14 -out1
     paddsw               m3, m4, m2 ; -out3   out12  out2  -out13
     psubsw               m4, m2            ; t6 t7 t14a t15a
     shufps               m2, m1, m4, q1032 ; t2a t6  t10 t14a
-    vpblendd             m4, m4, m1, 0x33  ; t3a t7  t11 t15a
+    vpblendd             m4, m1, 0x33      ; t3a t7  t11 t15a
     ret
 ALIGN function_align
 .main_pass1_end:
@@ -1232,7 +1077,7 @@ ALIGN function_align
     packssdw             m1, m4     ; -out7   out4   out6  -out5
     ret
 
-INV_TXFM_4X16_FN flipadst, dct,      0
+INV_TXFM_4X16_FN flipadst, dct
 INV_TXFM_4X16_FN flipadst, adst
 INV_TXFM_4X16_FN flipadst, flipadst
 INV_TXFM_4X16_FN flipadst, identity
@@ -1264,9 +1109,9 @@ cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     vpbroadcastd         m6, [o(pw_2048)]
     pshufd               m1, m1, q1032
     vpblendd             m4, m0, m2, 0x33
-    vpblendd             m0, m0, m1, 0xcc
-    vpblendd             m1, m1, m3, 0xcc
-    vpblendd             m2, m2, m3, 0x33
+    vpblendd             m0, m1, 0xcc
+    vpblendd             m1, m3, 0xcc
+    vpblendd             m2, m3, 0x33
     vpermq               m0, m0, q3120
     vpermq               m1, m1, q0213
     vpermq               m2, m2, q2031
@@ -1274,7 +1119,7 @@ cglobal iflipadst_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     psubw                m5, m7, m6
     jmp m(iadst_4x16_internal).end
 
-INV_TXFM_4X16_FN identity, dct,      3
+INV_TXFM_4X16_FN identity, dct
 INV_TXFM_4X16_FN identity, adst
 INV_TXFM_4X16_FN identity, flipadst
 INV_TXFM_4X16_FN identity, identity
@@ -1325,7 +1170,7 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     paddsw               m3, m8
     jmp m(iadst_4x16_internal).end2
 
-%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3, ; coefs[1-2], tmp[1-2], off[1-3]
+%macro WRITE_8X4 4-7 strideq*1, strideq*2, r3 ; coefs[1-2], tmp[1-2], off[1-3]
     movq               xm%3, [dstq   ]
     movhps             xm%3, [dstq+%5]
     movq               xm%4, [dstq+%6]
@@ -1350,69 +1195,25 @@ cglobal iidentity_4x16_internal, 0, 5, 11, dst, stride, c, eob, tx2
     movhps        [dstq+%7], xm%4
 %endmacro
 
-%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x4
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    vpbroadcastd        xm0, [o(pw_2896x8)]
-    pmulhrsw            xm1, xm0, [cq]
-    vpbroadcastd        xm2, [o(pw_1697x8)]
-    vpbroadcastd        xm3, [o(pw_2048)]
-    pmulhrsw            xm1, xm0
-    pmulhrsw            xm2, xm1
-    paddsw              xm1, xm2
-    pmulhrsw            xm1, xm3
-    punpcklwd           xm1, xm1
-    punpckldq           xm0, xm1, xm1
-    punpckhdq           xm1, xm1
-    vpermq               m0, m0, q1100
-    vpermq               m1, m1, q1100
-%elifidn %1_%2, identity_dct
-    mova                xm0, [cq+16*0]
-    packusdw            xm0, [cq+16*1]
-    mova                xm1, [cq+16*2]
-    packusdw            xm1, [cq+16*3]
-    vpbroadcastd        xm2, [o(pw_2896x8)]
-    vpbroadcastd        xm3, [o(pw_2048)]
-    packusdw            xm0, xm1
-    pmulhrsw            xm0, xm2
-    paddsw              xm0, xm0
-    pmulhrsw            xm0, xm2
-    pmulhrsw            xm0, xm3
-    vinserti128          m0, m0, xm0, 1
-    mova                 m1, m0
-%else
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x4
+%ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
     pmulhrsw            xm0, xm1
-%ifidn %2, dct
     movd                xm2, [o(pw_2048)]
     pmulhrsw            xm0, xm1
     pmulhrsw            xm0, xm2
     vpbroadcastw         m0, xm0
     mova                 m1, m0
-%else ; adst / flipadst
-    vpbroadcastw         m0, xm0
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    vpbroadcastd         m1, [o(pw_2048)]
-    pmulhrsw             m1, m0
-%ifidn %2, adst
-    vpermq               m0, m1, q1100
-    vpermq               m1, m1, q3322
-%else ; flipadst
-    vpermq               m0, m1, q2233
-    vpermq               m1, m1, q0011
-%endif
-%endif
-%endif
     jmp m(iadst_8x4_internal).end3
 %endif
 %endmacro
 
-INV_TXFM_8X4_FN dct, dct,      0
-INV_TXFM_8X4_FN dct, adst,     0
-INV_TXFM_8X4_FN dct, flipadst, 0
-INV_TXFM_8X4_FN dct, identity, 3
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
 
 cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpbroadcastd        xm3, [o(pw_2896x8)]
@@ -1425,7 +1226,7 @@ cglobal idct_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vinserti128          m3, m1, xm3, 1
     vinserti128          m1, m0, xm2, 1
     shufps               m0, m1, m3, q0220
-    shufps               m1, m1, m3, q1331
+    shufps               m1, m3, q1331
     pshufb               m0, m4
     pshufb               m1, m4
     jmp                tx2q
@@ -1449,8 +1250,8 @@ cglobal iadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     pmulhrsw            xm4, xm0
     pmulhrsw            xm5, xm0
     call m(iadst_4x8_internal).main_pass1
-    vinserti128        m0, m0, xm2, 1
-    vinserti128        m1, m1, xm3, 1
+    vinserti128        m0, xm2, 1
+    vinserti128        m1, xm3, 1
     punpckhwd          m2, m0, m1
     punpcklwd          m0, m1
     pxor               m3, m3
@@ -1494,8 +1295,8 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     pmulhrsw            xm4, xm0
     pmulhrsw            xm5, xm0
     call m(iadst_4x8_internal).main_pass1
-    vinserti128          m3, m3, xm1, 1
-    vinserti128          m2, m2, xm0, 1
+    vinserti128          m3, xm1, 1
+    vinserti128          m2, xm0, 1
     punpckhwd            m1, m3, m2
     punpcklwd            m3, m2
     pxor                 m0, m0
@@ -1510,16 +1311,16 @@ cglobal iflipadst_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m0, m2, q2031
     jmp m(iadst_8x4_internal).end2
 
-INV_TXFM_8X4_FN identity, dct,      7
+INV_TXFM_8X4_FN identity, dct
 INV_TXFM_8X4_FN identity, adst
 INV_TXFM_8X4_FN identity, flipadst
 INV_TXFM_8X4_FN identity, identity
 
 cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
-    mova                xm2,     [cq+16*0]
-    mova                xm0,     [cq+16*1]
-    vinserti128          m2, m2, [cq+16*2], 1
-    vinserti128          m0, m0, [cq+16*3], 1
+    mova                xm2, [cq+16*0]
+    mova                xm0, [cq+16*1]
+    vinserti128          m2, [cq+16*2], 1
+    vinserti128          m0, [cq+16*3], 1
     vpbroadcastd         m3, [o(pw_2896x8)]
     punpcklwd            m1, m2, m0
     punpckhwd            m2, m0
@@ -1538,25 +1339,9 @@ cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     paddsw               m1, m3
     jmp m(iadst_8x4_internal).end
 
-%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x8
-%ifidn %1_%2, dct_identity
-    vpbroadcastd        xm0, [o(pw_2896x8)]
-    pmulhrsw            xm0, [cq]
-    vpbroadcastd        xm1, [o(pw_16384)]
-    pmulhrsw            xm0, xm1
-    psrlw               xm1, 2 ; pw_4096
-    pmulhrsw            xm0, xm1
-    pshufb              xm0, [o(deint_shuf)]
-    vpermq               m3, m0, q1100
-    punpcklwd            m3, m3
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
-    jmp m(iadst_8x8_internal).end4
-%elif %3 >= 0
-%ifidn %1, dct
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x8
+%ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
     movd                xm2, [o(pw_16384)]
@@ -1576,33 +1361,13 @@ cglobal iidentity_8x4_internal, 0, 5, 7, dst, stride, c, eob, tx2
     dec                 r2d
     jg .loop
     RET
-%else ; identity
-    mova                 m0, [cq+32*0]
-    punpcklwd            m0, [cq+32*1]
-    mova                 m1, [cq+32*2]
-    punpcklwd            m1, [cq+32*3]
-    vpbroadcastd         m2, [o(pw_2896x8)]
-    vpbroadcastd         m3, [o(pw_2048)]
-    pxor                 m4, m4
-    mova          [cq+32*0], m4
-    mova          [cq+32*1], m4
-    mova          [cq+32*2], m4
-    mova          [cq+32*3], m4
-    punpckldq            m0, m1
-    vpermq               m1, m0, q3232
-    vpermq               m0, m0, q1010
-    punpcklwd            m0, m1
-    pmulhrsw             m0, m2
-    pmulhrsw             m0, m3
-    jmp m(inv_txfm_add_dct_dct_8x8).end
-%endif
 %endif
 %endmacro
 
-INV_TXFM_8X8_FN dct, dct,      0
-INV_TXFM_8X8_FN dct, identity, 7
+INV_TXFM_8X8_FN dct, dct
 INV_TXFM_8X8_FN dct, adst
 INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
 
 cglobal idct_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpermq               m0, [cq+32*0], q3120 ; 0 1
@@ -1749,20 +1514,20 @@ cglobal iflipadst_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     pmulhrsw             m0, m5, m4
     jmp m(iadst_8x8_internal).end3
 
-INV_TXFM_8X8_FN identity, dct,      7
+INV_TXFM_8X8_FN identity, dct
 INV_TXFM_8X8_FN identity, adst
 INV_TXFM_8X8_FN identity, flipadst
 INV_TXFM_8X8_FN identity, identity
 
 cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
-    mova                xm3,     [cq+16*0]
-    mova                xm2,     [cq+16*1]
-    vinserti128          m3, m3, [cq+16*4], 1
-    vinserti128          m2, m2, [cq+16*5], 1
-    mova                xm4,     [cq+16*2]
-    mova                xm0,     [cq+16*3]
-    vinserti128          m4, m4, [cq+16*6], 1
-    vinserti128          m0, m0, [cq+16*7], 1
+    mova                xm3, [cq+16*0]
+    mova                xm2, [cq+16*1]
+    vinserti128          m3, [cq+16*4], 1
+    vinserti128          m2, [cq+16*5], 1
+    mova                xm4, [cq+16*2]
+    mova                xm0, [cq+16*3]
+    vinserti128          m4, [cq+16*6], 1
+    vinserti128          m0, [cq+16*7], 1
     punpcklwd            m1, m3, m2
     punpckhwd            m3, m2
     punpcklwd            m2, m4, m0
@@ -1776,8 +1541,8 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpbroadcastd         m4, [o(pw_4096)]
     jmp m(iadst_8x8_internal).end
 
-%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x16
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x16
 %ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
@@ -1791,66 +1556,6 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     vpbroadcastw         m0, xm0
     mov                 r2d, 4
     jmp m(inv_txfm_add_dct_dct_8x8).end2
-%elifidn %1_%2, dct_identity
-    WIN64_SPILL_XMM      13
-    vpbroadcastd         m0, [o(pw_2896x8)]
-    pmulhrsw             m7, m0, [cq]
-    vpbroadcastd         m1, [o(pw_16384)]
-    vpbroadcastd         m2, [o(pw_1697x16)]
-    pxor                 m3, m3
-    mova               [cq], m3
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m1
-    psrlw                m1, 3 ; pw_2048
-    pmulhrsw             m2, m7
-    paddsw               m7, m7
-    paddsw               m7, m2
-    pmulhrsw             m7, m1
-    punpcklwd            m5, m7, m7
-    punpckhwd            m7, m7
-    punpcklwd            m4, m5, m5
-    punpckhwd            m5, m5
-    punpcklwd            m6, m7, m7
-    punpckhwd            m7, m7
-    vpermq               m0, m4, q1100
-    vpermq               m1, m5, q1100
-    vpermq               m2, m6, q1100
-    vpermq               m3, m7, q1100
-    vpermq               m4, m4, q3322
-    vpermq               m5, m5, q3322
-    vpermq               m6, m6, q3322
-    vpermq               m7, m7, q3322
-    jmp m(idct_8x16_internal).end4
-%elifidn %1_%2, identity_dct
-    movd                xm0, [cq+32*0]
-    punpcklwd           xm0, [cq+32*1]
-    movd                xm2, [cq+32*2]
-    punpcklwd           xm2, [cq+32*3]
-    add                  cq, 32*4
-    movd                xm1, [cq+32*0]
-    punpcklwd           xm1, [cq+32*1]
-    movd                xm3, [cq+32*2]
-    punpcklwd           xm3, [cq+32*3]
-    vpbroadcastd        xm4, [o(pw_2896x8)]
-    vpbroadcastd        xm5, [o(pw_2048)]
-    xor                 eax, eax
-    mov           [cq-32*4], eax
-    mov           [cq-32*3], eax
-    mov           [cq-32*2], eax
-    mov           [cq-32*1], eax
-    punpckldq           xm0, xm2
-    punpckldq           xm1, xm3
-    punpcklqdq          xm0, xm1
-    pmulhrsw            xm0, xm4
-    pmulhrsw            xm0, xm4
-    pmulhrsw            xm0, xm5
-    mov           [cq+32*0], eax
-    mov           [cq+32*1], eax
-    mov           [cq+32*2], eax
-    mov           [cq+32*3], eax
-    vinserti128          m0, m0, xm0, 1
-    mov                 r2d, 4
-    jmp m(inv_txfm_add_dct_dct_8x8).end2
 %endif
 %endmacro
 
@@ -1867,10 +1572,10 @@ cglobal iidentity_8x8_internal, 0, 5, 7, dst, stride, c, eob, tx2
     pmulhrsw             m4,     [cq+32*0]
 %endmacro
 
-INV_TXFM_8X16_FN dct, dct,      0
-INV_TXFM_8X16_FN dct, identity, 15
+INV_TXFM_8X16_FN dct, dct
 INV_TXFM_8X16_FN dct, adst
 INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
 
 cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_8X16_LOAD_COEFS
@@ -1878,13 +1583,13 @@ cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     vpbroadcastd        m10, [o(pw_16384)]
 .pass1_end:
     vperm2i128           m9, m3, m7, 0x31
-    vinserti128          m3, m3, xm7, 1
+    vinserti128          m3, xm7, 1
     vperm2i128           m8, m2, m6, 0x31
-    vinserti128          m2, m2, xm6, 1
+    vinserti128          m2, xm6, 1
     vperm2i128           m6, m1, m5, 0x31
-    vinserti128          m1, m1, xm5, 1
+    vinserti128          m1, xm5, 1
     vperm2i128           m5, m0, m4, 0x31
-    vinserti128          m0, m0, xm4, 1
+    vinserti128          m0, xm4, 1
     punpckhwd            m4, m2, m3
     punpcklwd            m2, m3
     punpckhwd            m3, m0, m1
@@ -1915,7 +1620,6 @@ cglobal idct_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
 .end3:
     pxor                 m8, m8
     REPX {mova [cq+32*x], m8}, -4, -3, -2, -1, 0, 1, 2, 3
-.end4:
     lea                  r3, [strideq*3]
     WRITE_8X4             0, 1, 8, 9
     lea                dstq, [dstq+strideq*4]
@@ -2120,7 +1824,7 @@ cglobal iflipadst_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     pmulhrsw             m7, m9, m8
     jmp m(idct_8x16_internal).end3
 
-INV_TXFM_8X16_FN identity, dct,      7
+INV_TXFM_8X16_FN identity, dct
 INV_TXFM_8X16_FN identity, adst
 INV_TXFM_8X16_FN identity, flipadst
 INV_TXFM_8X16_FN identity, identity
@@ -2136,24 +1840,24 @@ INV_TXFM_8X16_FN identity, identity
 %endmacro
 
 cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
-    mova                xm3,     [cq+16*0]
-    mova                xm2,     [cq+16*2]
+    mova                xm3, [cq+16*0]
+    mova                xm2, [cq+16*2]
     add                  cq, 16*8
-    vinserti128          m3, m3, [cq+16*0], 1
-    vinserti128          m2, m2, [cq+16*2], 1
+    vinserti128          m3, [cq+16*0], 1
+    vinserti128          m2, [cq+16*2], 1
     vpbroadcastd         m9, [o(pw_2896x8)]
-    mova                xm4,     [cq-16*4]
-    mova                xm5,     [cq-16*2]
-    vinserti128          m4, m4, [cq+16*4], 1
-    vinserti128          m5, m5, [cq+16*6], 1
-    mova                xm7,     [cq-16*7]
-    mova                xm6,     [cq-16*5]
-    vinserti128          m7, m7, [cq+16*1], 1
-    vinserti128          m6, m6, [cq+16*3], 1
-    mova                xm8,     [cq-16*3]
-    mova                xm0,     [cq-16*1]
-    vinserti128          m8, m8, [cq+16*5], 1
-    vinserti128          m0, m0, [cq+16*7], 1
+    mova                xm4, [cq-16*4]
+    mova                xm5, [cq-16*2]
+    vinserti128          m4, [cq+16*4], 1
+    vinserti128          m5, [cq+16*6], 1
+    mova                xm7, [cq-16*7]
+    mova                xm6, [cq-16*5]
+    vinserti128          m7, [cq+16*1], 1
+    vinserti128          m6, [cq+16*3], 1
+    mova                xm8, [cq-16*3]
+    mova                xm0, [cq-16*1]
+    vinserti128          m8, [cq+16*5], 1
+    vinserti128          m0, [cq+16*7], 1
     punpcklwd            m1, m3, m2
     punpckhwd            m3, m2
     punpcklwd            m2, m4, m5
@@ -2197,64 +1901,11 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     vextracti128  [dstq+%6], m%3, 1
 %endmacro
 
-%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x4
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    vpbroadcastd        xm3, [o(pw_2896x8)]
-    pmulhrsw            xm3, [cq]
-    vpbroadcastd        xm0, [o(pw_16384)]
-    vpbroadcastd        xm1, [o(pw_1697x8)]
-    pmulhrsw            xm3, xm0
-    psrlw               xm0, 3 ; pw_2048
-    pmulhrsw            xm1, xm3
-    paddsw              xm3, xm1
-    pmulhrsw            xm3, xm0
-    punpcklwd           xm3, xm3
-    punpckldq           xm1, xm3, xm3
-    punpckhdq           xm3, xm3
-    vpbroadcastq         m0, xm1
-    vpermq               m1, m1, q1111
-    vpbroadcastq         m2, xm3
-    vpermq               m3, m3, q1111
-    jmp m(iadst_16x4_internal).end2
-%elifidn %1_%2, identity_dct
-    mova                xm0,     [cq+16*0]
-    mova                xm2,     [cq+16*1]
-    vinserti128          m0, m0, [cq+16*4], 1
-    vinserti128          m2, m2, [cq+16*5], 1
-    mova                xm1,     [cq+16*2]
-    mova                xm3,     [cq+16*3]
-    vinserti128          m1, m1, [cq+16*6], 1
-    vinserti128          m3, m3, [cq+16*7], 1
-    vpbroadcastd         m4, [o(pw_1697x16)]
-    vpbroadcastd         m5, [o(pw_16384)]
-    packusdw             m0, m2
-    packusdw             m1, m3
-    packusdw             m0, m1
-    vpbroadcastd         m1, [o(pw_2896x8)]
-    pmulhrsw             m4, m0
-    pmulhrsw             m4, m5
-    paddsw               m0, m4
-    psrlw                m5, 3 ; pw_2048
-    pmulhrsw             m0, m1
-    pmulhrsw             m0, m5
-    mov                 r3d, 2
-.end:
-    pxor                 m3, m3
-.end_loop:
-    mova          [cq+32*0], m3
-    mova          [cq+32*1], m3
-    add                  cq, 32*2
-    WRITE_16X2            0, 0, 1, 2, strideq*0, strideq*1
-    lea                dstq, [dstq+strideq*2]
-    dec                 r3d
-    jg .end_loop
-    RET
-%else
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x4
+%ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
-%ifidn %2, dct
     movd                xm2, [o(pw_16384)]
     mov                [cq], eobd
     mov                 r2d, 2
@@ -2267,7 +1918,7 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     pxor                 m3, m3
 .dconly_loop:
     mova                xm1, [dstq]
-    vinserti128          m1, m1, [dstq+strideq], 1
+    vinserti128          m1, [dstq+strideq], 1
     punpckhbw            m2, m1, m3
     punpcklbw            m1, m3
     paddw                m2, m0
@@ -2279,35 +1930,13 @@ cglobal iidentity_8x16_internal, 0, 5, 13, dst, stride, c, eob, tx2
     dec                 r2d
     jg .dconly_loop
     RET
-%else ; adst / flipadst
-    movd                xm2, [o(pw_16384)]
-    pmulhrsw            xm0, xm2
-    vpbroadcastw         m0, xm0
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    vpbroadcastd         m3, [o(pw_2048)]
-    mov                [cq], eobd
-    pmulhrsw             m3, m0
-%ifidn %2, adst
-    vpbroadcastq         m0, xm3
-    vpermq               m1, m3, q1111
-    vpermq               m2, m3, q2222
-    vpermq               m3, m3, q3333
-%else ; flipadst
-    vpermq               m0, m3, q3333
-    vpermq               m1, m3, q2222
-    vpermq               m2, m3, q1111
-    vpbroadcastq         m3, xm3
-%endif
-    jmp m(iadst_16x4_internal).end3
-%endif
-%endif
 %endif
 %endmacro
 
-INV_TXFM_16X4_FN dct, dct,      0
-INV_TXFM_16X4_FN dct, adst,     0
-INV_TXFM_16X4_FN dct, flipadst, 0
-INV_TXFM_16X4_FN dct, identity, 3
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
 
 cglobal idct_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
     mova                xm0, [cq+16*0]
@@ -2481,20 +2110,20 @@ ALIGN function_align
     WRITE_16X2            1, 0, 4, 5, strideq*0, strideq*1
     RET
 
-INV_TXFM_16X4_FN identity, dct,      15
+INV_TXFM_16X4_FN identity, dct
 INV_TXFM_16X4_FN identity, adst
 INV_TXFM_16X4_FN identity, flipadst
 INV_TXFM_16X4_FN identity, identity
 
 cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
-    mova                xm2,     [cq+16*0]
-    mova                xm4,     [cq+16*1]
-    vinserti128          m2, m2, [cq+16*4], 1
-    vinserti128          m4, m4, [cq+16*5], 1
-    mova                xm0,     [cq+16*2]
-    mova                xm1,     [cq+16*3]
-    vinserti128          m0, m0, [cq+16*6], 1
-    vinserti128          m1, m1, [cq+16*7], 1
+    mova                xm2, [cq+16*0]
+    mova                xm4, [cq+16*1]
+    vinserti128          m2, [cq+16*4], 1
+    vinserti128          m4, [cq+16*5], 1
+    mova                xm0, [cq+16*2]
+    mova                xm1, [cq+16*3]
+    vinserti128          m0, [cq+16*6], 1
+    vinserti128          m1, [cq+16*7], 1
     vpbroadcastd         m7, [o(pw_1697x16)]
     vpbroadcastd         m8, [o(pw_16384)]
     punpcklwd            m3, m2, m4
@@ -2531,8 +2160,8 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
     paddsw               m3, m7
     jmp m(iadst_16x4_internal).end
 
-%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x8
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x8
 %ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
@@ -2541,59 +2170,6 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
     pmulhrsw            xm0, xm1
     mov                 r2d, 4
     jmp m(inv_txfm_add_dct_dct_16x4).dconly
-%elifidn %1_%2, dct_identity
-    WIN64_SPILL_XMM      13
-    vbroadcasti128       m7, [cq]
-    vpbroadcastd         m0, [o(pw_2896x8)]
-    vpbroadcastd         m1, [o(pw_16384)]
-    pxor                xm2, xm2
-    mova               [cq], xm2
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m1
-    psrlw                m1, 2 ; pw_4096
-    pmulhrsw             m7, m1
-    punpcklwd            m3, m7, m7
-    punpckhwd            m7, m7
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
-    pshufd               m4, m7, q0000
-    pshufd               m5, m7, q1111
-    pshufd               m6, m7, q2222
-    pshufd               m7, m7, q3333
-    lea                  r3, [strideq*3]
-    WRITE_16X2            0, 1, 8, 0, strideq*0, strideq*1
-    WRITE_16X2            2, 3, 0, 1, strideq*2, r3
-    jmp m(idct_16x8_internal).end4
-%elifidn %1_%2, identity_dct
-    mova                 m0, [cq+32*0]
-    packusdw             m0, [cq+32*1]
-    mova                 m2, [cq+32*2]
-    packusdw             m2, [cq+32*3]
-    mova                 m1, [cq+32*4]
-    packusdw             m1, [cq+32*5]
-    mova                 m3, [cq+32*6]
-    packusdw             m3, [cq+32*7]
-    vpbroadcastd         m4, [o(pw_2896x8)]
-    vpbroadcastd         m5, [o(pw_1697x16)]
-    packusdw             m0, m2
-    packusdw             m1, m3
-    vpbroadcastd         m2, [o(pw_16384)]
-    packusdw             m0, m1
-    vpermq               m1, m0, q3322
-    vpermq               m0, m0, q1100
-    punpcklwd            m0, m1
-    pmulhrsw             m0, m4
-    pmulhrsw             m5, m0
-    pmulhrsw             m5, m2
-    paddsw               m0, m5
-    psrlw                m2, 3 ; pw_2048
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, m2
-    mov                 r3d, 4
-    jmp m(inv_txfm_add_identity_dct_16x4).end
 %endif
 %endmacro
 
@@ -2611,10 +2187,10 @@ cglobal iidentity_16x4_internal, 0, 5, 11, dst, stride, c, eob, tx2
     REPX   {pmulhrsw x, m8}, m0, m7, m1, m6, m2, m5, m3, m4
 %endmacro
 
-INV_TXFM_16X8_FN dct, dct,      0
-INV_TXFM_16X8_FN dct, identity, 7
+INV_TXFM_16X8_FN dct, dct
 INV_TXFM_16X8_FN dct, adst
 INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
 
 cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     ITX_16X8_LOAD_COEFS 3120
@@ -2648,13 +2224,13 @@ cglobal idct_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     punpckldq            m8, m9, m5
     punpckhdq            m9, m5
     vperm2i128           m4, m0, m6, 0x31
-    vinserti128          m0, m0, xm6, 1
+    vinserti128          m0, xm6, 1
     vperm2i128           m5, m1, m7, 0x31
-    vinserti128          m1, m1, xm7, 1
+    vinserti128          m1, xm7, 1
     vperm2i128           m6, m2, m8, 0x31
-    vinserti128          m2, m2, xm8, 1
+    vinserti128          m2, xm8, 1
     vperm2i128           m7, m3, m9, 0x31
-    vinserti128          m3, m3, xm9, 1
+    vinserti128          m3, xm9, 1
     jmp                tx2q
 .pass2:
     call .main
@@ -2811,13 +2387,13 @@ cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     punpckldq            m5, m8, m2
     punpckhdq            m8, m2
     vinserti128          m2, m6, xm5, 1
-    vperm2i128           m6, m6, m5, 0x31
+    vperm2i128           m6, m5, 0x31
     vperm2i128           m5, m1, m4, 0x31
-    vinserti128          m1, m1, xm4, 1
+    vinserti128          m1, xm4, 1
     vperm2i128           m4, m0, m3, 0x31
-    vinserti128          m0, m0, xm3, 1
+    vinserti128          m0, xm3, 1
     vinserti128          m3, m7, xm8, 1
-    vperm2i128           m7, m7, m8, 0x31
+    vperm2i128           m7, m8, 0x31
     jmp                tx2q
 .pass2:
     call m(iadst_16x8_internal).main
@@ -2837,30 +2413,30 @@ cglobal iflipadst_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     WRITE_16X2            1, 2, 0, 1, strideq*2, r3
     jmp m(idct_16x8_internal).end3
 
-INV_TXFM_16X8_FN identity, dct,      15
+INV_TXFM_16X8_FN identity, dct
 INV_TXFM_16X8_FN identity, adst
 INV_TXFM_16X8_FN identity, flipadst
 INV_TXFM_16X8_FN identity, identity
 
 cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
-    mova                xm7,     [cq+16*0]
-    mova                xm2,     [cq+16*1]
+    mova                xm7, [cq+16*0]
+    mova                xm2, [cq+16*1]
     add                  cq, 16*8
     vpbroadcastd         m3, [o(pw_2896x8)]
-    vinserti128          m7, m7, [cq+16*0], 1
-    vinserti128          m2, m2, [cq+16*1], 1
-    mova                xm6,     [cq-16*6]
-    mova                xm4,     [cq-16*5]
-    vinserti128          m6, m6, [cq+16*2], 1
-    vinserti128          m4, m4, [cq+16*3], 1
-    mova                xm8,     [cq-16*4]
-    mova                xm5,     [cq-16*3]
-    vinserti128          m8, m8, [cq+16*4], 1
-    vinserti128          m5, m5, [cq+16*5], 1
-    mova                xm0,     [cq-16*2]
-    mova                xm1,     [cq-16*1]
-    vinserti128          m0, m0, [cq+16*6], 1
-    vinserti128          m1, m1, [cq+16*7], 1
+    vinserti128          m7, [cq+16*0], 1
+    vinserti128          m2, [cq+16*1], 1
+    mova                xm6, [cq-16*6]
+    mova                xm4, [cq-16*5]
+    vinserti128          m6, [cq+16*2], 1
+    vinserti128          m4, [cq+16*3], 1
+    mova                xm8, [cq-16*4]
+    mova                xm5, [cq-16*3]
+    vinserti128          m8, [cq+16*4], 1
+    vinserti128          m5, [cq+16*5], 1
+    mova                xm0, [cq-16*2]
+    mova                xm1, [cq-16*1]
+    vinserti128          m0, [cq+16*6], 1
+    vinserti128          m1, [cq+16*7], 1
     vpbroadcastd        m10, [o(pw_1697x16)]
     vpbroadcastd        m11, [o(pw_16384)]
     REPX   {pmulhrsw x, m3}, m7, m2, m6, m4, m8, m5, m0, m1
@@ -2896,8 +2472,8 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
 
 %define o_base pw_5 + 128
 
-%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x16
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x16
 %ifidn %1_%2, dct_dct
     movd                xm1, [o(pw_2896x8)]
     pmulhrsw            xm0, xm1, [cq]
@@ -2905,72 +2481,6 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     mov                [cq], eobd
     mov                 r2d, 8
     jmp m(inv_txfm_add_dct_dct_16x4).dconly
-%elifidn %1_%2, dct_identity
-    WIN64_SPILL_XMM       7
-    vpbroadcastd         m3, [o(pw_2896x8)]
-    pmulhrsw             m3, [cq]
-    vpbroadcastd         m0, [o(pw_8192)]
-    vpbroadcastd         m1, [o(pw_1697x16)]
-    vpbroadcastw         m4, [o(deint_shuf)] ; pb_0_1
-    pcmpeqb              m5, m5
-    pxor                 m6, m6
-    mova               [cq], m6
-    paddb                m5, m5 ; pb_m2
-    pmulhrsw             m3, m0
-    psrlw                m0, 2  ; pw_2048
-    IDTX16                3, 1, 1
-    pmulhrsw             m3, m0
-    mov                 r3d, 8
-.loop:
-    mova                xm1, [dstq]
-    vinserti128          m1, m1, [dstq+strideq*8], 1
-    pshufb               m0, m3, m4
-    psubb                m4, m5 ; += 2
-    punpckhbw            m2, m1, m6
-    punpcklbw            m1, m6
-    paddw                m2, m0
-    paddw                m1, m0
-    packuswb             m1, m2
-    mova             [dstq], xm1
-    vextracti128 [dstq+strideq*8], m1, 1
-    add                dstq, strideq
-    dec                 r3d
-    jg .loop
-    RET
-%elifidn %1_%2, identity_dct
-    movd                xm0,     [cq+32*0 ]
-    movd                xm2,     [cq+32*1 ]
-    movd                xm1,     [cq+32*2 ]
-    movd                xm3,     [cq+32*3 ]
-    vinserti128          m0, m0, [cq+32*8 ], 1
-    vinserti128          m2, m2, [cq+32*9 ], 1
-    vinserti128          m1, m1, [cq+32*10], 1
-    vinserti128          m3, m3, [cq+32*11], 1
-    punpcklwd            m0, m2
-    punpcklwd            m1, m3
-    punpckldq            m0, m1
-    movd                xm1,     [cq+32*4 ]
-    movd                xm3,     [cq+32*5 ]
-    movd                xm2,     [cq+32*6 ]
-    movd                xm4,     [cq+32*7 ]
-    vinserti128          m1, m1, [cq+32*12], 1
-    vinserti128          m3, m3, [cq+32*13], 1
-    vinserti128          m2, m2, [cq+32*14], 1
-    vinserti128          m4, m4, [cq+32*15], 1
-    punpcklwd            m1, m3
-    vpbroadcastd         m3, [o(pw_1697x16)]
-    punpcklwd            m2, m4
-    vpbroadcastd         m4, [o(pw_2896x8)]
-    punpckldq            m1, m2
-    vpbroadcastd         m2, [o(pw_2048)]
-    punpcklqdq           m0, m1
-    pmulhrsw             m3, m0
-    psraw                m3, 1
-    pavgw                m0, m3
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, m2
-    mov                 r3d, 8
-    jmp m(inv_txfm_add_identity_dct_16x4).end
 %endif
 %endmacro
 
@@ -2995,10 +2505,10 @@ cglobal iidentity_16x8_internal, 0, 5, 13, dst, stride, c, eob, tx2
     mova              [rsp], m15
 %endmacro
 
-INV_TXFM_16X16_FN dct, dct,      0
-INV_TXFM_16X16_FN dct, identity, 15
+INV_TXFM_16X16_FN dct, dct
 INV_TXFM_16X16_FN dct, adst
 INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
 
 cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     ITX_16X16_LOAD_COEFS
@@ -3014,19 +2524,19 @@ cglobal idct_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     REPX   {pmulhrsw x, m1}, m3, m5, m7, m9, m11, m13, m15
     pmulhrsw             m1, [rsp+32*1]
     vperm2i128           m8, m1, m9, 0x31
-    vinserti128          m1, m1, xm9, 1
+    vinserti128          m1, xm9, 1
     vperm2i128           m9, m2, m10, 0x31
-    vinserti128          m2, m2, xm10, 1
+    vinserti128          m2, xm10, 1
     vperm2i128          m10, m3, m11, 0x31
-    vinserti128          m3, m3, xm11, 1
+    vinserti128          m3, xm11, 1
     vperm2i128          m11, m4, m12, 0x31
-    vinserti128          m4, m4, xm12, 1
+    vinserti128          m4, xm12, 1
     vperm2i128          m12, m5, m13, 0x31
-    vinserti128          m5, m5, xm13, 1
+    vinserti128          m5, xm13, 1
     vperm2i128          m13, m6, m14, 0x31
-    vinserti128          m6, m6, xm14, 1
+    vinserti128          m6, xm14, 1
     vperm2i128          m14, m7, m15, 0x31
-    vinserti128          m7, m7, xm15, 1
+    vinserti128          m7, xm15, 1
     mova                m15, [rsp+32*2]
 .pass1_end3:
     punpcklwd            m0, m9, m10
@@ -3395,7 +2905,7 @@ cglobal iflipadst_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
     pavgw               m%1, m%2 ; signs are guaranteed to be equal
 %endmacro
 
-INV_TXFM_16X16_FN identity, dct,      15
+INV_TXFM_16X16_FN identity, dct
 INV_TXFM_16X16_FN identity, identity
 
 cglobal iidentity_16x16_internal, 0, 5, 16, 32*3, dst, stride, c, eob, tx2
@@ -3456,7 +2966,7 @@ ALIGN function_align
     paddsw              m15, m1
     jmp m(idct_16x16_internal).end
 
-%define o_base iadst4_dconly2a + 128
+%define o_base deint_shuf + 128
 
 %macro LOAD_8ROWS 2-3 0 ; src, stride, is_rect2
 %if %3
@@ -3526,13 +3036,13 @@ cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob
     LOAD_8ROWS      cq+32*1, 32*2
     call m(idct_16x8_internal).main
     vperm2i128          m11, m0, m4, 0x31
-    vinserti128          m0, m0, xm4, 1
+    vinserti128          m0, xm4, 1
     vperm2i128           m4, m1, m5, 0x31
-    vinserti128          m1, m1, xm5, 1
+    vinserti128          m1, xm5, 1
     vperm2i128           m5, m2, m6, 0x31
-    vinserti128          m2, m2, xm6, 1
+    vinserti128          m2, xm6, 1
     vperm2i128           m6, m3, m7, 0x31
-    vinserti128          m3, m3, xm7, 1
+    vinserti128          m3, xm7, 1
     pxor                 m7, m7
     REPX {mova [cq+32*x], m7}, 1, 3, 5, 7, 9, 11, 13, 15
     punpckhwd            m7, m0, m1
@@ -3566,13 +3076,13 @@ cglobal inv_txfm_add_dct_dct_8x32, 4, 4, 0, dst, stride, c, eob
     LOAD_8ROWS      cq+32*0, 32*2
     call m(idct_16x8_internal).main
     vperm2i128           m8, m0, m4, 0x31
-    vinserti128          m0, m0, xm4, 1
+    vinserti128          m0, xm4, 1
     vperm2i128           m4, m1, m5, 0x31
-    vinserti128          m1, m1, xm5, 1
+    vinserti128          m1, xm5, 1
     vperm2i128           m5, m2, m6, 0x31
-    vinserti128          m2, m2, xm6, 1
+    vinserti128          m2, xm6, 1
     vperm2i128           m6, m3, m7, 0x31
-    vinserti128          m3, m3, xm7, 1
+    vinserti128          m3, xm7, 1
     vpbroadcastd         m9, [o(pw_8192)]
     pxor                 m7, m7
     REPX {mova [cq+32*x], m7}, 0, 2, 4, 6, 8, 10, 12, 14
@@ -3775,7 +3285,7 @@ ALIGN function_align
 %macro LOAD_PACKED_16X2 4 ; dst, tmp, row[1-2]
     vbroadcasti128      m%1, [cq+16*%3]
     vbroadcasti128      m%2, [cq+16*%4]
-    shufpd              m%1, m%1, m%2, 0x0c
+    shufpd              m%1, m%2, 0x0c
 %endmacro
 
 cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob
@@ -3877,13 +3387,13 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob
     pmulhrsw            m12, [rsp+32*0]
     mova         [rsp+32*0], m8
     vperm2i128           m4, m0, m6, 0x31
-    vinserti128          m0, m0, xm6, 1
+    vinserti128          m0, xm6, 1
     vperm2i128           m5, m1, m7, 0x31
-    vinserti128          m1, m1, xm7, 1
+    vinserti128          m1, xm7, 1
     vperm2i128           m6, m2, m9, 0x31
-    vinserti128          m2, m2, xm9, 1
+    vinserti128          m2, xm9, 1
     vperm2i128           m7, m3, m10, 0x31
-    vinserti128          m3, m3, xm10, 1
+    vinserti128          m3, xm10, 1
     call m(idct_16x8_internal).main
     vpbroadcastd         m8, [o(pw_2048)]
     REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
@@ -3922,13 +3432,13 @@ cglobal inv_txfm_add_dct_dct_32x8, 4, 4, 0, dst, stride, c, eob
     punpckldq            m9, m12, m5
     punpckhdq           m12, m5
     vperm2i128           m4, m0, m6, 0x31
-    vinserti128          m0, m0, xm6, 1
+    vinserti128          m0, xm6, 1
     vperm2i128           m5, m1, m7, 0x31
-    vinserti128          m1, m1, xm7, 1
+    vinserti128          m1, xm7, 1
     vperm2i128           m6, m2, m9, 0x31
-    vinserti128          m2, m2, xm9, 1
+    vinserti128          m2, xm9, 1
     vperm2i128           m7, m3, m12, 0x31
-    vinserti128          m3, m3, xm12, 1
+    vinserti128          m3, xm12, 1
     call m(idct_16x8_internal).main2
     vpbroadcastd         m8, [o(pw_2048)]
     REPX   {pmulhrsw x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
@@ -3947,26 +3457,26 @@ cglobal inv_txfm_add_identity_identity_8x32, 4, 5, 11, dst, stride, c, eob
     lea                  r4, [strideq*3]
     sub                eobd, 107 ; loop_iterations = 1 + (eobd >= 107)
 .loop:
-    mova                xm0,     [cq+16* 0]
-    mova                xm1,     [cq+16* 4]
-    vinserti128          m0, m0, [cq+16* 1], 1
-    vinserti128          m1, m1, [cq+16* 5], 1
+    mova                xm0, [cq+16* 0]
+    mova                xm1, [cq+16* 4]
+    vinserti128          m0, [cq+16* 1], 1
+    vinserti128          m1, [cq+16* 5], 1
     pxor                 m8, m8
     mova          [cq+32*0], m8
     mova          [cq+32*2], m8
     add                  cq, 16*16
-    mova                xm2,     [cq-16* 8]
-    mova                xm3,     [cq-16* 4]
-    vinserti128          m2, m2, [cq-16* 7], 1
-    vinserti128          m3, m3, [cq-16* 3], 1
-    mova                xm4,     [cq+16* 0]
-    mova                xm5,     [cq+16* 4]
-    vinserti128          m4, m4, [cq+16* 1], 1
-    vinserti128          m5, m5, [cq+16* 5], 1
-    mova                xm6,     [cq+16* 8]
-    mova                xm7,     [cq+16*12]
-    vinserti128          m6, m6, [cq+16* 9], 1
-    vinserti128          m7, m7, [cq+16*13], 1
+    mova                xm2, [cq-16* 8]
+    mova                xm3, [cq-16* 4]
+    vinserti128          m2, [cq-16* 7], 1
+    vinserti128          m3, [cq-16* 3], 1
+    mova                xm4, [cq+16* 0]
+    mova                xm5, [cq+16* 4]
+    vinserti128          m4, [cq+16* 1], 1
+    vinserti128          m5, [cq+16* 5], 1
+    mova                xm6, [cq+16* 8]
+    mova                xm7, [cq+16*12]
+    vinserti128          m6, [cq+16* 9], 1
+    vinserti128          m7, [cq+16*13], 1
     REPX {mova [cq+32*x], m8}, -4, -2,  0,  2,  4,  6
     REPX  {paddsw    x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
     call .transpose8x8
@@ -4019,22 +3529,22 @@ cglobal inv_txfm_add_identity_identity_32x8, 4, 6, 10, dst, stride, c, eob
     lea                  r5, [dstq+strideq*4]
     sub                eobd, 107
 .loop:
-    mova                xm0,     [cq-16*8]
-    mova                xm1,     [cq-16*7]
-    vinserti128          m0, m0, [cq+16*0], 1
-    vinserti128          m1, m1, [cq+16*1], 1
-    mova                xm2,     [cq-16*6]
-    mova                xm3,     [cq-16*5]
-    vinserti128          m2, m2, [cq+16*2], 1
-    vinserti128          m3, m3, [cq+16*3], 1
-    mova                xm4,     [cq-16*4]
-    mova                xm5,     [cq-16*3]
-    vinserti128          m4, m4, [cq+16*4], 1
-    vinserti128          m5, m5, [cq+16*5], 1
-    mova                xm6,     [cq-16*2]
-    mova                xm7,     [cq-16*1]
-    vinserti128          m6, m6, [cq+16*6], 1
-    vinserti128          m7, m7, [cq+16*7], 1
+    mova                xm0, [cq-16*8]
+    mova                xm1, [cq-16*7]
+    vinserti128          m0, [cq+16*0], 1
+    vinserti128          m1, [cq+16*1], 1
+    mova                xm2, [cq-16*6]
+    mova                xm3, [cq-16*5]
+    vinserti128          m2, [cq+16*2], 1
+    vinserti128          m3, [cq+16*3], 1
+    mova                xm4, [cq-16*4]
+    mova                xm5, [cq-16*3]
+    vinserti128          m4, [cq+16*4], 1
+    vinserti128          m5, [cq+16*5], 1
+    mova                xm6, [cq-16*2]
+    mova                xm7, [cq-16*1]
+    vinserti128          m6, [cq+16*6], 1
+    vinserti128          m7, [cq+16*7], 1
     pxor                 m8, m8
     REPX {mova [cq+32*x], m8}, -4, -3, -2, -1,  0,  1,  2,  3
     call m(inv_txfm_add_identity_identity_8x32).transpose8x8
@@ -4206,28 +3716,28 @@ cglobal inv_txfm_add_dct_dct_16x32, 4, 4, 0, dst, stride, c, eob
     vextracti128 [r2+32*3+16], m14, 1
     vinserti128          m8, m1, xm9, 1
     vperm2i128          m12, m1, m9, 0x31
-    mova                xm0,     [tmp1q-32*4]
-    mova                xm1,     [tmp1q-32*3]
-    vinserti128          m0, m0, [tmp1q+32*0], 1
-    vinserti128          m1, m1, [tmp1q+32*1], 1
+    mova                xm0, [tmp1q-32*4]
+    mova                xm1, [tmp1q-32*3]
+    vinserti128          m0, [tmp1q+32*0], 1
+    vinserti128          m1, [tmp1q+32*1], 1
     vinserti128         m10, m5, xm13, 1
     vperm2i128          m14, m5, m13, 0x31
-    mova                xm4,     [tmp1q-32*4+16]
-    mova                xm5,     [tmp1q-32*3+16]
-    vinserti128          m4, m4, [tmp1q+32*0+16], 1
-    vinserti128          m5, m5, [tmp1q+32*1+16], 1
+    mova                xm4, [tmp1q-32*4+16]
+    mova                xm5, [tmp1q-32*3+16]
+    vinserti128          m4, [tmp1q+32*0+16], 1
+    vinserti128          m5, [tmp1q+32*1+16], 1
     vinserti128          m9, m3, xm11, 1
     vperm2i128          m13, m3, m11, 0x31
-    mova                xm2,     [tmp1q-32*2]
-    mova                xm3,     [tmp1q-32*1]
-    vinserti128          m2, m2, [tmp1q+32*2], 1
-    vinserti128          m3, m3, [tmp1q+32*3], 1
+    mova                xm2, [tmp1q-32*2]
+    mova                xm3, [tmp1q-32*1]
+    vinserti128          m2, [tmp1q+32*2], 1
+    vinserti128          m3, [tmp1q+32*3], 1
     vinserti128         m11, m7, xm15, 1
     vperm2i128          m15, m7, m15, 0x31
-    mova                xm6,     [tmp1q-32*2+16]
-    mova                xm7,     [tmp1q-32*1+16]
-    vinserti128          m6, m6, [tmp1q+32*2+16], 1
-    vinserti128          m7, m7, [tmp1q+32*3+16], 1
+    mova                xm6, [tmp1q-32*2+16]
+    mova                xm7, [tmp1q-32*1+16]
+    vinserti128          m6, [tmp1q+32*2+16], 1
+    vinserti128          m7, [tmp1q+32*3+16], 1
     call .main_oddhalf
     LOAD_8ROWS_H    r2-32*4, 32
 .idct16:
@@ -4475,7 +3985,7 @@ ALIGN function_align
     mova         [tmp1q+32*(11-%2)], xm%2
     vextracti128 [tmp2q+32*( 3-%1)], m%2, 1
     vperm2i128          m%2, m%1, m%4, 0x31
-    vinserti128         m%1, m%1, xm%4, 1
+    vinserti128         m%1, xm%4, 1
 %endmacro
 
 cglobal inv_txfm_add_dct_dct_32x16, 4, 4, 0, dst, stride, c, eob
@@ -4593,22 +4103,22 @@ cglobal inv_txfm_add_identity_identity_16x32, 4, 5, 13, dst, stride, c, eob
     mov                 rax, cq
     paddw               m11, m12, m12 ; pw_16384
 .loop:
-    mova                xm0,     [cq+64* 0]
-    mova                xm1,     [cq+64* 1]
-    vinserti128          m0, m0, [cq+64* 8], 1
-    vinserti128          m1, m1, [cq+64* 9], 1
-    mova                xm2,     [cq+64* 2]
-    mova                xm3,     [cq+64* 3]
-    vinserti128          m2, m2, [cq+64*10], 1
-    vinserti128          m3, m3, [cq+64*11], 1
-    mova                xm4,     [cq+64* 4]
-    mova                xm5,     [cq+64* 5]
-    vinserti128          m4, m4, [cq+64*12], 1
-    vinserti128          m5, m5, [cq+64*13], 1
-    mova                xm6,     [cq+64* 6]
-    mova                xm7,     [cq+64* 7]
-    vinserti128          m6, m6, [cq+64*14], 1
-    vinserti128          m7, m7, [cq+64*15], 1
+    mova                xm0, [cq+64* 0]
+    mova                xm1, [cq+64* 1]
+    vinserti128          m0, [cq+64* 8], 1
+    vinserti128          m1, [cq+64* 9], 1
+    mova                xm2, [cq+64* 2]
+    mova                xm3, [cq+64* 3]
+    vinserti128          m2, [cq+64*10], 1
+    vinserti128          m3, [cq+64*11], 1
+    mova                xm4, [cq+64* 4]
+    mova                xm5, [cq+64* 5]
+    vinserti128          m4, [cq+64*12], 1
+    vinserti128          m5, [cq+64*13], 1
+    mova                xm6, [cq+64* 6]
+    mova                xm7, [cq+64* 7]
+    vinserti128          m6, [cq+64*14], 1
+    vinserti128          m7, [cq+64*15], 1
     REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
     REPX  {IDTX16 x, 8, 10, 11}, 0, 1, 2, 3, 4, 5, 6, 7
     call m(inv_txfm_add_identity_identity_8x32).transpose8x8
@@ -4661,22 +4171,22 @@ cglobal inv_txfm_add_identity_identity_32x16, 4, 6, 12, dst, stride, c, eob
     mov                  r5, dstq
     mov                 rax, cq
 .loop:
-    mova                xm0,     [cq+32* 0]
-    mova                xm1,     [cq+32* 1]
-    vinserti128          m0, m0, [cq+32* 8], 1
-    vinserti128          m1, m1, [cq+32* 9], 1
-    mova                xm2,     [cq+32* 2]
-    mova                xm3,     [cq+32* 3]
-    vinserti128          m2, m2, [cq+32*10], 1
-    vinserti128          m3, m3, [cq+32*11], 1
-    mova                xm4,     [cq+32* 4]
-    mova                xm5,     [cq+32* 5]
-    vinserti128          m4, m4, [cq+32*12], 1
-    vinserti128          m5, m5, [cq+32*13], 1
-    mova                xm6,     [cq+32* 6]
-    mova                xm7,     [cq+32* 7]
-    vinserti128          m6, m6, [cq+32*14], 1
-    vinserti128          m7, m7, [cq+32*15], 1
+    mova                xm0, [cq+32* 0]
+    mova                xm1, [cq+32* 1]
+    vinserti128          m0, [cq+32* 8], 1
+    vinserti128          m1, [cq+32* 9], 1
+    mova                xm2, [cq+32* 2]
+    mova                xm3, [cq+32* 3]
+    vinserti128          m2, [cq+32*10], 1
+    vinserti128          m3, [cq+32*11], 1
+    mova                xm4, [cq+32* 4]
+    mova                xm5, [cq+32* 5]
+    vinserti128          m4, [cq+32*12], 1
+    vinserti128          m5, [cq+32*13], 1
+    mova                xm6, [cq+32* 6]
+    mova                xm7, [cq+32* 7]
+    vinserti128          m6, [cq+32*14], 1
+    vinserti128          m7, [cq+32*15], 1
     REPX  {pmulhrsw x, m9 }, m0, m1, m2, m3, m4, m5, m6, m7
     REPX  {paddsw   x, x  }, m0, m1, m2, m3, m4, m5, m6, m7
     call m(inv_txfm_add_identity_identity_8x32).transpose8x8
@@ -4864,22 +4374,22 @@ cglobal inv_txfm_add_identity_identity_32x32, 4, 6, 10, dst, stride, c, eob
     mov                  r5, dstq
     lea                 rax, [cq+32]
 .loop:
-    mova                xm0,     [cq+64* 0]
-    mova                xm1,     [cq+64* 1]
-    vinserti128          m0, m0, [cq+64* 8], 1
-    vinserti128          m1, m1, [cq+64* 9], 1
-    mova                xm2,     [cq+64* 2]
-    mova                xm3,     [cq+64* 3]
-    vinserti128          m2, m2, [cq+64*10], 1
-    vinserti128          m3, m3, [cq+64*11], 1
-    mova                xm4,     [cq+64* 4]
-    mova                xm5,     [cq+64* 5]
-    vinserti128          m4, m4, [cq+64*12], 1
-    vinserti128          m5, m5, [cq+64*13], 1
-    mova                xm6,     [cq+64* 6]
-    mova                xm7,     [cq+64* 7]
-    vinserti128          m6, m6, [cq+64*14], 1
-    vinserti128          m7, m7, [cq+64*15], 1
+    mova                xm0, [cq+64* 0]
+    mova                xm1, [cq+64* 1]
+    vinserti128          m0, [cq+64* 8], 1
+    vinserti128          m1, [cq+64* 9], 1
+    mova                xm2, [cq+64* 2]
+    mova                xm3, [cq+64* 3]
+    vinserti128          m2, [cq+64*10], 1
+    vinserti128          m3, [cq+64*11], 1
+    mova                xm4, [cq+64* 4]
+    mova                xm5, [cq+64* 5]
+    vinserti128          m4, [cq+64*12], 1
+    vinserti128          m5, [cq+64*13], 1
+    mova                xm6, [cq+64* 6]
+    mova                xm7, [cq+64* 7]
+    vinserti128          m6, [cq+64*14], 1
+    vinserti128          m7, [cq+64*15], 1
     call m(inv_txfm_add_identity_identity_8x32).transpose8x8
     REPX   {pmulhrsw x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
     WRITE_16X2            0,  1,  8,  0, strideq*0, strideq*1
@@ -5022,27 +4532,27 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob
     add                eobd, 0x80000000
     jnc .pass1_loop
     lea                  r2, [rsp+32*23]
-    mova                xm0,     [r2-32*4+ 0]
-    mova                xm1,     [r2-32*2+ 0]
-    vinserti128          m0, m0, [r2+32*0+ 0], 1
-    vinserti128          m1, m1, [r2+32*2+ 0], 1
-    mova                xm2,     [r2-32*4+16]
-    mova                xm3,     [r2-32*2+16]
-    vinserti128          m2, m2, [r2+32*0+16], 1
-    vinserti128          m3, m3, [r2+32*2+16], 1
+    mova                xm0, [r2-32*4+ 0]
+    mova                xm1, [r2-32*2+ 0]
+    vinserti128          m0, [r2+32*0+ 0], 1
+    vinserti128          m1, [r2+32*2+ 0], 1
+    mova                xm2, [r2-32*4+16]
+    mova                xm3, [r2-32*2+16]
+    vinserti128          m2, [r2+32*0+16], 1
+    vinserti128          m3, [r2+32*2+16], 1
     pxor                 m4, m4
     REPX       {mova x, m4}, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14
     test                r7d, r7d
     jl .fast
     lea                  r3, [r2+32*8]
-    mova                xm4,     [r3-32*4+ 0]
-    mova                xm5,     [r3-32*2+ 0]
-    vinserti128          m4, m4, [r3+32*0+ 0], 1
-    vinserti128          m5, m5, [r3+32*2+ 0], 1
-    mova                xm6,     [r3-32*4+16]
-    mova                xm7,     [r3-32*2+16]
-    vinserti128          m6, m6, [r3+32*0+16], 1
-    vinserti128          m7, m7, [r3+32*2+16], 1
+    mova                xm4, [r3-32*4+ 0]
+    mova                xm5, [r3-32*2+ 0]
+    vinserti128          m4, [r3+32*0+ 0], 1
+    vinserti128          m5, [r3+32*2+ 0], 1
+    mova                xm6, [r3-32*4+16]
+    mova                xm7, [r3-32*2+16]
+    vinserti128          m6, [r3+32*0+16], 1
+    vinserti128          m7, [r3+32*2+16], 1
 .fast:
     mova              [rsp], m8
     lea               tmp1q, [rsp+32*7]
@@ -5065,26 +4575,26 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob
     mova       [tmp1q+32*1], m13
     mova       [tmp1q+32*2], m14
     mova       [tmp1q+32*3], m15
-    mova                xm0,     [r2-32*3+ 0]
-    mova                xm1,     [r2-32*1+ 0]
-    vinserti128          m0, m0, [r2+32*1+ 0], 1
-    vinserti128          m1, m1, [r2+32*3+ 0], 1
-    mova                xm2,     [r2-32*3+16]
-    mova                xm3,     [r2-32*1+16]
-    vinserti128          m2, m2, [r2+32*1+16], 1
-    vinserti128          m3, m3, [r2+32*3+16], 1
+    mova                xm0, [r2-32*3+ 0]
+    mova                xm1, [r2-32*1+ 0]
+    vinserti128          m0, [r2+32*1+ 0], 1
+    vinserti128          m1, [r2+32*3+ 0], 1
+    mova                xm2, [r2-32*3+16]
+    mova                xm3, [r2-32*1+16]
+    vinserti128          m2, [r2+32*1+16], 1
+    vinserti128          m3, [r2+32*3+16], 1
     pxor                 m4, m4
     REPX       {mova x, m4}, m5, m6, m7
     test                r7d, r7d
     jl .fast2
-    mova                xm4,     [r3-32*3+ 0]
-    mova                xm5,     [r3-32*1+ 0]
-    vinserti128          m4, m4, [r3+32*1+ 0], 1
-    vinserti128          m5, m5, [r3+32*3+ 0], 1
-    mova                xm6,     [r3-32*3+16]
-    mova                xm7,     [r3-32*1+16]
-    vinserti128          m6, m6, [r3+32*1+16], 1
-    vinserti128          m7, m7, [r3+32*3+16], 1
+    mova                xm4, [r3-32*3+ 0]
+    mova                xm5, [r3-32*1+ 0]
+    vinserti128          m4, [r3+32*1+ 0], 1
+    vinserti128          m5, [r3+32*3+ 0], 1
+    mova                xm6, [r3-32*3+16]
+    mova                xm7, [r3-32*1+16]
+    vinserti128          m6, [r3+32*1+16], 1
+    vinserti128          m7, [r3+32*3+16], 1
 .fast2:
     add               tmp1q, 32*8
     lea               tmp2q, [tmp1q+32*8]
@@ -5093,53 +4603,53 @@ cglobal inv_txfm_add_dct_dct_16x64, 4, 4, 0, dst, stride, c, eob
     vpbroadcastd        m15, [o(pd_2048)]
     add               tmp1q, 32*16
     add               tmp2q, 32*32
-    mova                xm0,     [r2-32*4+ 0]
-    mova                xm3,     [r2-32*1+16]
-    vinserti128          m0, m0, [r2+32*0+ 0], 1
-    vinserti128          m3, m3, [r2+32*3+16], 1
-    mova                xm4,     [r2-32*4+16]
-    mova                xm7,     [r2-32*1+ 0]
-    vinserti128          m4, m4, [r2+32*0+16], 1
-    vinserti128          m7, m7, [r2+32*3+ 0], 1
+    mova                xm0, [r2-32*4+ 0]
+    mova                xm3, [r2-32*1+16]
+    vinserti128          m0, [r2+32*0+ 0], 1
+    vinserti128          m3, [r2+32*3+16], 1
+    mova                xm4, [r2-32*4+16]
+    mova                xm7, [r2-32*1+ 0]
+    vinserti128          m4, [r2+32*0+16], 1
+    vinserti128          m7, [r2+32*3+ 0], 1
     pxor                 m1, m1
     REPX       {mova x, m1}, m2, m5, m6
     test                r7d, r7d
     jl .fast3
     add                  r3, 32*24
-    mova                xm1,     [r3-32*1+16]
-    mova                xm2,     [r3-32*4+ 0]
-    vinserti128          m1, m1, [r3+32*3+16], 1
-    vinserti128          m2, m2, [r3+32*0+ 0], 1
-    mova                xm5,     [r3-32*1+ 0]
-    mova                xm6,     [r3-32*4+16]
-    vinserti128          m5, m5, [r3+32*3+ 0], 1
-    vinserti128          m6, m6, [r3+32*0+16], 1
+    mova                xm1, [r3-32*1+16]
+    mova                xm2, [r3-32*4+ 0]
+    vinserti128          m1, [r3+32*3+16], 1
+    vinserti128          m2, [r3+32*0+ 0], 1
+    mova                xm5, [r3-32*1+ 0]
+    mova                xm6, [r3-32*4+16]
+    vinserti128          m5, [r3+32*3+ 0], 1
+    vinserti128          m6, [r3+32*0+16], 1
 .fast3:
     add                 rax, o_idct64_offset
     call m(inv_txfm_add_dct_dct_16x64).main_part1
     add                 rax, 8
     add               tmp1q, 32*8
     sub               tmp2q, 32*8
-    mova                xm0,     [r2-32*2+ 0]
-    mova                xm3,     [r2-32*3+16]
-    vinserti128          m0, m0, [r2+32*2+ 0], 1
-    vinserti128          m3, m3, [r2+32*1+16], 1
-    mova                xm4,     [r2-32*2+16]
-    mova                xm7,     [r2-32*3+ 0]
-    vinserti128          m4, m4, [r2+32*2+16], 1
-    vinserti128          m7, m7, [r2+32*1+ 0], 1
+    mova                xm0, [r2-32*2+ 0]
+    mova                xm3, [r2-32*3+16]
+    vinserti128          m0, [r2+32*2+ 0], 1
+    vinserti128          m3, [r2+32*1+16], 1
+    mova                xm4, [r2-32*2+16]
+    mova                xm7, [r2-32*3+ 0]
+    vinserti128          m4, [r2+32*2+16], 1
+    vinserti128          m7, [r2+32*1+ 0], 1
     pxor                 m1, m1
     REPX       {mova x, m1}, m2, m5, m6
     test                r7d, r7d
     jl .fast4
-    mova                xm1,     [r3-32*3+16]
-    mova                xm2,     [r3-32*2+ 0]
-    vinserti128          m1, m1, [r3+32*1+16], 1
-    vinserti128          m2, m2, [r3+32*2+ 0], 1
-    mova                xm5,     [r3-32*3+ 0]
-    mova                xm6,     [r3-32*2+16]
-    vinserti128          m5, m5, [r3+32*1+ 0], 1
-    vinserti128          m6, m6, [r3+32*2+16], 1
+    mova                xm1, [r3-32*3+16]
+    mova                xm2, [r3-32*2+ 0]
+    vinserti128          m1, [r3+32*1+16], 1
+    vinserti128          m2, [r3+32*2+ 0], 1
+    mova                xm5, [r3-32*3+ 0]
+    mova                xm6, [r3-32*2+16]
+    vinserti128          m5, [r3+32*1+ 0], 1
+    vinserti128          m6, [r3+32*2+16], 1
 .fast4:
     call m(inv_txfm_add_dct_dct_16x64).main_part1
     call m(inv_txfm_add_dct_dct_16x64).main_part2_pass2
@@ -5423,38 +4933,38 @@ cglobal inv_txfm_add_dct_dct_64x16, 4, 4, 0, dst, stride, c, eob
     mov               tmp2d, 4
 .pass2_loop:
     lea                  r3, [tmp1q-32*8]
-    mova                xm0,      [r3   -32*4]
-    mova                xm1,      [r3   -32*3]
-    vinserti128          m0, m0,  [tmp1q-32*4], 1
-    vinserti128          m1, m1,  [tmp1q-32*3], 1
-    mova                xm2,      [r3   -32*2]
-    mova                xm3,      [r3   -32*1]
-    vinserti128          m2, m2,  [tmp1q-32*2], 1
-    vinserti128          m3, m3,  [tmp1q-32*1], 1
-    mova                xm4,      [r3   +32*0]
-    mova                xm5,      [r3   +32*1]
-    vinserti128          m4, m4,  [tmp1q+32*0], 1
-    vinserti128          m5, m5,  [tmp1q+32*1], 1
-    mova                xm6,      [r3   +32*2]
-    mova                xm7,      [r3   +32*3]
-    vinserti128          m6, m6,  [tmp1q+32*2], 1
-    vinserti128          m7, m7,  [tmp1q+32*3], 1
-    mova                xm8,      [r3   -32*4+16]
-    mova                xm9,      [r3   -32*3+16]
-    vinserti128          m8, m8,  [tmp1q-32*4+16], 1
-    vinserti128          m9, m9,  [tmp1q-32*3+16], 1
-    mova               xm10,      [r3   -32*2+16]
-    mova               xm11,      [r3   -32*1+16]
-    vinserti128         m10, m10, [tmp1q-32*2+16], 1
-    vinserti128         m11, m11, [tmp1q-32*1+16], 1
-    mova               xm12,      [r3   +32*0+16]
-    mova               xm13,      [r3   +32*1+16]
-    vinserti128         m12, m12, [tmp1q+32*0+16], 1
-    vinserti128         m13, m13, [tmp1q+32*1+16], 1
-    mova               xm14,      [r3   +32*2+16]
-    mova               xm15,      [r3   +32*3+16]
-    vinserti128         m14, m14, [tmp1q+32*2+16], 1
-    vinserti128         m15, m15, [tmp1q+32*3+16], 1
+    mova                xm0, [r3   -32*4]
+    mova                xm1, [r3   -32*3]
+    vinserti128          m0, [tmp1q-32*4], 1
+    vinserti128          m1, [tmp1q-32*3], 1
+    mova                xm2, [r3   -32*2]
+    mova                xm3, [r3   -32*1]
+    vinserti128          m2, [tmp1q-32*2], 1
+    vinserti128          m3, [tmp1q-32*1], 1
+    mova                xm4, [r3   +32*0]
+    mova                xm5, [r3   +32*1]
+    vinserti128          m4, [tmp1q+32*0], 1
+    vinserti128          m5, [tmp1q+32*1], 1
+    mova                xm6, [r3   +32*2]
+    mova                xm7, [r3   +32*3]
+    vinserti128          m6, [tmp1q+32*2], 1
+    vinserti128          m7, [tmp1q+32*3], 1
+    mova                xm8, [r3   -32*4+16]
+    mova                xm9, [r3   -32*3+16]
+    vinserti128          m8, [tmp1q-32*4+16], 1
+    vinserti128          m9, [tmp1q-32*3+16], 1
+    mova               xm10, [r3   -32*2+16]
+    mova               xm11, [r3   -32*1+16]
+    vinserti128         m10, [tmp1q-32*2+16], 1
+    vinserti128         m11, [tmp1q-32*1+16], 1
+    mova               xm12, [r3   +32*0+16]
+    mova               xm13, [r3   +32*1+16]
+    vinserti128         m12, [tmp1q+32*0+16], 1
+    vinserti128         m13, [tmp1q+32*1+16], 1
+    mova               xm14, [r3   +32*2+16]
+    mova               xm15, [r3   +32*3+16]
+    vinserti128         m14, [tmp1q+32*2+16], 1
+    vinserti128         m15, [tmp1q+32*3+16], 1
     mova         [rsp+32*0], m6
     mova         [rsp+32*1], m7
     vpbroadcastd         m7, [o(pw_8192)]
@@ -5810,48 +5320,48 @@ ALIGN function_align
     mov               tmp3d, 4
 .loop:
     lea               tmp2q, [tmp1q+32*8]
-    mova                xm0,      [tmp1q-32*4]
-    mova                xm1,      [tmp1q-32*3]
-    vinserti128          m0, m0,  [tmp2q-32*4], 1
-    vinserti128          m1, m1,  [tmp2q-32*3], 1
-    mova                xm2,      [tmp1q-32*2]
-    mova                xm3,      [tmp1q-32*1]
-    vinserti128          m2, m2,  [tmp2q-32*2], 1
-    vinserti128          m3, m3,  [tmp2q-32*1], 1
-    mova                xm4,      [tmp1q+32*0]
-    mova                xm5,      [tmp1q+32*1]
-    vinserti128          m4, m4,  [tmp2q+32*0], 1
-    vinserti128          m5, m5,  [tmp2q+32*1], 1
-    mova                xm6,      [tmp1q+32*2]
-    mova                xm7,      [tmp1q+32*3]
-    vinserti128          m6, m6,  [tmp2q+32*2], 1
-    vinserti128          m7, m7,  [tmp2q+32*3], 1
+    mova                xm0, [tmp1q-32*4]
+    mova                xm1, [tmp1q-32*3]
+    vinserti128          m0, [tmp2q-32*4], 1
+    vinserti128          m1, [tmp2q-32*3], 1
+    mova                xm2, [tmp1q-32*2]
+    mova                xm3, [tmp1q-32*1]
+    vinserti128          m2, [tmp2q-32*2], 1
+    vinserti128          m3, [tmp2q-32*1], 1
+    mova                xm4, [tmp1q+32*0]
+    mova                xm5, [tmp1q+32*1]
+    vinserti128          m4, [tmp2q+32*0], 1
+    vinserti128          m5, [tmp2q+32*1], 1
+    mova                xm6, [tmp1q+32*2]
+    mova                xm7, [tmp1q+32*3]
+    vinserti128          m6, [tmp2q+32*2], 1
+    vinserti128          m7, [tmp2q+32*3], 1
     REPX  {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
     call m(inv_txfm_add_identity_identity_8x32).transpose8x8
-    mova                xm8,      [tmp1q-32*4+16]
-    mova                xm9,      [tmp1q-32*3+16]
-    vinserti128          m8, m8,  [tmp2q-32*4+16], 1
-    vinserti128          m9, m9,  [tmp2q-32*3+16], 1
+    mova                xm8, [tmp1q-32*4+16]
+    mova                xm9, [tmp1q-32*3+16]
+    vinserti128          m8, [tmp2q-32*4+16], 1
+    vinserti128          m9, [tmp2q-32*3+16], 1
     mova       [tmp1q-32*4], m0
     mova       [tmp2q-32*4], m1
     mova       [tmp1q-32*3], m2
     mova       [tmp2q-32*3], m3
-    mova                xm2,     [tmp1q-32*2+16]
-    mova                xm3,     [tmp1q-32*1+16]
-    vinserti128          m2, m2, [tmp2q-32*2+16], 1
-    vinserti128          m3, m3, [tmp2q-32*1+16], 1
+    mova                xm2, [tmp1q-32*2+16]
+    mova                xm3, [tmp1q-32*1+16]
+    vinserti128          m2, [tmp2q-32*2+16], 1
+    vinserti128          m3, [tmp2q-32*1+16], 1
     mova       [tmp1q-32*2], m4
     mova       [tmp2q-32*2], m5
     mova       [tmp1q-32*1], m6
     mova       [tmp2q-32*1], m7
-    mova                xm4,     [tmp1q+32*0+16]
-    mova                xm5,     [tmp1q+32*1+16]
-    vinserti128          m4, m4, [tmp2q+32*0+16], 1
-    vinserti128          m5, m5, [tmp2q+32*1+16], 1
-    mova                xm6,     [tmp1q+32*2+16]
-    mova                xm7,     [tmp1q+32*3+16]
-    vinserti128          m6, m6, [tmp2q+32*2+16], 1
-    vinserti128          m7, m7, [tmp2q+32*3+16], 1
+    mova                xm4, [tmp1q+32*0+16]
+    mova                xm5, [tmp1q+32*1+16]
+    vinserti128          m4, [tmp2q+32*0+16], 1
+    vinserti128          m5, [tmp2q+32*1+16], 1
+    mova                xm6, [tmp1q+32*2+16]
+    mova                xm7, [tmp1q+32*3+16]
+    vinserti128          m6, [tmp2q+32*2+16], 1
+    vinserti128          m7, [tmp2q+32*3+16], 1
     pmulhrsw             m0, m8, m10
     pmulhrsw             m1, m9, m10
     REPX  {pmulhrsw x, m10}, m2, m3, m4, m5, m6, m7
diff --git a/ffmpeg/JNI/dav1d/src/x86/itx_ssse3.asm b/ffmpeg/JNI/dav1d/src/x86/itx_ssse3.asm
index 9316981a5..91cf666b9 100644
--- a/ffmpeg/JNI/dav1d/src/x86/itx_ssse3.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/itx_ssse3.asm
@@ -139,11 +139,6 @@ pw_2675x8:      times 8 dw  2675*8
 pw_4085x8:      times 8 dw  4085*8
 pw_m301x8:      times 8 dw  -301*8
 
-iadst4_dconly1a: times 2 dw 10568, 19856, 26752, 30424
-iadst4_dconly1b: times 2 dw 30424, 26752, 19856, 10568
-iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
-iadst4_dconly2b: dw 26752, 26752, 26752, 26752, 30424, 30424, 30424, 30424
-
 SECTION .text
 
 %macro REPX 2-*
@@ -243,31 +238,24 @@ SECTION .text
     paddsw               m0, m2                ;high: out1 ;low: out0
 %endmacro
 
-%macro INV_TXFM_FN 5+ ; type1, type2, fast_thresh, size, xmm/stack
-cglobal inv_txfm_add_%1_%2_%4, 4, 6, %5, dst, stride, coeff, eob, tx2
-    %undef cmp
-    %define %%p1 m(i%1_%4_internal)
+%macro INV_TXFM_FN 4+ ; type1, type2, size, xmm/stack
+cglobal inv_txfm_add_%1_%2_%3, 4, 6, %4, dst, stride, coeff, eob, tx2
+    %define %%p1 m(i%1_%3_internal)
 %if ARCH_X86_32
     LEA                    r5, $$
 %endif
 %if has_epilogue
-%if %3 > 0
-    cmp                  eobd, %3
-    jle %%end
-%elif %3 == 0
+%ifidn %1_%2, dct_dct
     test                 eobd, eobd
     jz %%end
 %endif
-    lea                  tx2q, [o(m(i%2_%4_internal).pass2)]
+    lea                  tx2q, [o(m(i%2_%3_internal).pass2)]
     call %%p1
     RET
 %%end:
 %else
-    lea                  tx2q, [o(m(i%2_%4_internal).pass2)]
-%if %3 > 0
-    cmp                  eobd, %3
-    jg %%p1
-%elif %3 == 0
+    lea                  tx2q, [o(m(i%2_%3_internal).pass2)]
+%ifidn %1_%2, dct_dct
     test                 eobd, eobd
     jnz %%p1
 %else
@@ -278,63 +266,26 @@ ALIGN function_align
 %endif
 %endmacro
 
-%macro INV_TXFM_4X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x4, 6
-%ifidn %1_%2, dct_identity
-    mova                 m0, [o(pw_2896x8)]
-    pmulhrsw             m0, [coeffq]
-    pmulhrsw             m1, m0, [o(pw_1697x8)]
-    paddsw               m0, m1
-    punpcklwd            m0, m0
-    punpckhdq            m1, m0, m0
-    punpckldq            m0, m0
-    TAIL_CALL m(iadst_4x4_internal).end
-%elifidn %1_%2, identity_dct
-    mova                 m1, [coeffq+16*0]
-    mova                 m2, [coeffq+16*1]
-    punpcklwd            m0, m1, m2
-    punpckhwd            m1, m2
-    punpcklwd            m0, m1
-    punpcklqdq           m0, m0
-    pmulhrsw             m1, m0, [o(pw_1697x8)]
-    paddsw               m0, m1
-    pmulhrsw             m0, [o(pw_2896x8)]
-    mova                 m1, m0
-    TAIL_CALL m(iadst_4x4_internal).end
-%elif %3 >= 0
+%macro INV_TXFM_4X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x4, 6
+%ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
-%ifidn %1, dct
     mova                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1
-%elifidn %1, adst
-    pmulhrsw             m0, [o(iadst4_dconly1a)]
-%elifidn %1, flipadst
-    pmulhrsw             m0, [o(iadst4_dconly1b)]
-%endif
     mov            [coeffq], eobd                ;0
-%ifidn %2, dct
-%ifnidn %1, dct
-    pmulhrsw             m0, [o(pw_2896x8)]
-%else
     pmulhrsw             m0, m1
-%endif
     mova                 m1, m0
     TAIL_CALL m(iadst_4x4_internal).end2
-%else ; adst / flipadst
-    pmulhrsw             m1, m0, [o(iadst4_dconly2b)]
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    TAIL_CALL m(i%2_4x4_internal).end2
-%endif
 %endif
 %endmacro
 
 INIT_XMM ssse3
 
-INV_TXFM_4X4_FN dct, dct,      0
-INV_TXFM_4X4_FN dct, adst,     0
-INV_TXFM_4X4_FN dct, flipadst, 0
-INV_TXFM_4X4_FN dct, identity, 3
+INV_TXFM_4X4_FN dct, dct
+INV_TXFM_4X4_FN dct, adst
+INV_TXFM_4X4_FN dct, flipadst
+INV_TXFM_4X4_FN dct, identity
 
 cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m0, [coeffq+16*0]      ;high: in1 ;low: in0
@@ -358,9 +309,9 @@ cglobal idct_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
 
     ITX4_END     0, 1, 3, 2
 
-INV_TXFM_4X4_FN adst, dct,      0
-INV_TXFM_4X4_FN adst, adst,     0
-INV_TXFM_4X4_FN adst, flipadst, 0
+INV_TXFM_4X4_FN adst, dct
+INV_TXFM_4X4_FN adst, adst
+INV_TXFM_4X4_FN adst, flipadst
 INV_TXFM_4X4_FN adst, identity
 
 cglobal iadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
@@ -410,9 +361,9 @@ ALIGN function_align
     packssdw             m1, m2                    ;high: out3 ;low: out3
     ret
 
-INV_TXFM_4X4_FN flipadst, dct,      0
-INV_TXFM_4X4_FN flipadst, adst,     0
-INV_TXFM_4X4_FN flipadst, flipadst, 0
+INV_TXFM_4X4_FN flipadst, dct
+INV_TXFM_4X4_FN flipadst, adst
+INV_TXFM_4X4_FN flipadst, flipadst
 INV_TXFM_4X4_FN flipadst, identity
 
 cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
@@ -436,7 +387,7 @@ cglobal iflipadst_4x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
 .end2:
     ITX4_END              3, 2, 1, 0
 
-INV_TXFM_4X4_FN identity, dct,      3
+INV_TXFM_4X4_FN identity, dct
 INV_TXFM_4X4_FN identity, adst
 INV_TXFM_4X4_FN identity, flipadst
 INV_TXFM_4X4_FN identity, identity
@@ -595,39 +546,9 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
     punpckhdq            m3, m4                      ;low: in6 high: in7
 %endmacro
 
-%macro INV_TXFM_4X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x8, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    mova                 m1, [o(pw_2896x8)]
-    pmulhrsw             m0, m1, [coeffq]
-    pmulhrsw             m0, m1
-    pmulhrsw             m0, [o(pw_4096)]
-    punpckhwd            m2, m0, m0
-    punpcklwd            m0, m0
-    punpckhdq            m1, m0, m0
-    punpckldq            m0, m0
-    punpckhdq            m3, m2, m2
-    punpckldq            m2, m2
-    TAIL_CALL m(iadst_4x8_internal).end3
-%elifidn %1_%2, identity_dct
-    movd                 m0, [coeffq+16*0]
-    punpcklwd            m0, [coeffq+16*1]
-    movd                 m1, [coeffq+16*2]
-    punpcklwd            m1, [coeffq+16*3]
-    mova                 m2, [o(pw_2896x8)]
-    punpckldq            m0, m1
-    pmulhrsw             m0, m2
-    pmulhrsw             m1, m0, [o(pw_1697x8)]
-    paddsw               m0, m1
-    pmulhrsw             m0, m2
-    pmulhrsw             m0, [o(pw_2048)]
-    punpcklqdq           m0, m0
-    mova                 m1, m0
-    mova                 m2, m0
-    mova                 m3, m0
-    TAIL_CALL m(iadst_4x8_internal).end3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x8, 8
+%ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
     mova                 m1, [o(pw_2896x8)]
@@ -639,32 +560,14 @@ cglobal inv_txfm_add_wht_wht_4x4, 3, 3, 4, dst, stride, coeff
     mova                 m1, m0
     mova                 m2, m0
     mova                 m3, m0
-    TAIL_CALL m(iadst_4x8_internal).end4
-%else ; adst_dct / flipadst_dct
-    pshuflw              m0, [coeffq], q0000
-    punpcklqdq           m0, m0
-    mova                 m1, [o(pw_2896x8)]
-    pmulhrsw             m0, m1
-%ifidn %1, adst
-    pmulhrsw             m0, [o(iadst4_dconly1a)]
-%else ; flipadst
-    pmulhrsw             m0, [o(iadst4_dconly1b)]
-%endif
-    mov            [coeffq], eobd
-    pmulhrsw             m0, m1
-    pmulhrsw             m0, [o(pw_2048)]
-    mova                 m1, m0
-    mova                 m2, m0
-    mova                 m3, m0
-    TAIL_CALL m(iadst_4x8_internal).end4
-%endif
+    TAIL_CALL m(iadst_4x8_internal).end3
 %endif
 %endmacro
 
-INV_TXFM_4X8_FN dct, dct,      0
-INV_TXFM_4X8_FN dct, identity, 7
+INV_TXFM_4X8_FN dct, dct
 INV_TXFM_4X8_FN dct, adst
 INV_TXFM_4X8_FN dct, flipadst
+INV_TXFM_4X8_FN dct, identity
 
 cglobal idct_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m3, [o(pw_2896x8)]
@@ -690,7 +593,7 @@ ALIGN function_align
     ret
 
 
-INV_TXFM_4X8_FN adst, dct,      0
+INV_TXFM_4X8_FN adst, dct
 INV_TXFM_4X8_FN adst, adst
 INV_TXFM_4X8_FN adst, flipadst
 INV_TXFM_4X8_FN adst, identity
@@ -725,15 +628,13 @@ cglobal iadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     pmulhrsw             m1, m4
     pmulhrsw             m2, m4
     pmulhrsw             m3, m4
-
-.end3:
     pxor                 m5, m5
     mova      [coeffq+16*0], m5
     mova      [coeffq+16*1], m5
     mova      [coeffq+16*2], m5
     mova      [coeffq+16*3], m5
 
-.end4:
+.end3:
     WRITE_4X8             0, 1, 2, 3
     RET
 
@@ -783,7 +684,7 @@ ALIGN function_align
     packssdw             m2, m4                    ;low:  out4  high: -out5
     ret
 
-INV_TXFM_4X8_FN flipadst, dct,      0
+INV_TXFM_4X8_FN flipadst, dct
 INV_TXFM_4X8_FN flipadst, adst
 INV_TXFM_4X8_FN flipadst, flipadst
 INV_TXFM_4X8_FN flipadst, identity
@@ -824,7 +725,7 @@ cglobal iflipadst_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     psubw                m4, m5
     jmp m(iadst_4x8_internal).end
 
-INV_TXFM_4X8_FN identity, dct,      3
+INV_TXFM_4X8_FN identity, dct
 INV_TXFM_4X8_FN identity, adst
 INV_TXFM_4X8_FN identity, flipadst
 INV_TXFM_4X8_FN identity, identity
@@ -881,84 +782,28 @@ cglobal iidentity_4x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     WRITE_8X2             %3, %4, %5, %6, %7
 %endmacro
 
-%macro INV_TXFM_8X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x4, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    mova                 m0, [o(pw_2896x8)]
-    pmulhrsw             m1, m0, [coeffq]
-    pmulhrsw             m1, m0
-    pmulhrsw             m0, m1, [o(pw_1697x8)]
-    paddsw               m1, m0
-    pmulhrsw             m1, [o(pw_2048)]
-    punpcklwd            m1, m1
-    punpckhdq            m2, m1, m1
-    punpckldq            m1, m1
-    punpckhdq            m3, m2, m2
-    punpckldq            m2, m2
-    punpckldq            m0, m1, m1
-    punpckhdq            m1, m1
-%elifidn %1_%2, identity_dct
-    mova                 m0, [coeffq+16*0]
-    mova                 m1, [coeffq+16*1]
-    mova                 m2, [coeffq+16*2]
-    mova                 m3, [coeffq+16*3]
-    punpckhwd            m4, m0, m1
-    punpcklwd            m0, m1
-    punpckhwd            m5, m2, m3
-    punpcklwd            m2, m3
-    punpcklwd            m0, m4
-    punpcklwd            m2, m5
-    punpcklqdq           m0, m2
-    mova                 m4, [o(pw_2896x8)]
-    pmulhrsw             m0, m4
-    paddsw               m0, m0
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, [o(pw_2048)]
-    mova                 m1, m0
-    mova                 m2, m0
-    mova                 m3, m0
-%else
+%macro INV_TXFM_8X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x4, 8
+%ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklqdq           m0, m0
     mova                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1
     pmulhrsw             m0, m1
-%ifidn %2, dct
     mova                 m2, [o(pw_2048)]
     pmulhrsw             m0, m1
     pmulhrsw             m0, m2
     mova                 m1, m0
     mova                 m2, m0
     mova                 m3, m0
-%else ; adst / flipadst
-    pmulhrsw             m2, m0, [o(iadst4_dconly2b)]
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    mova                 m1, [o(pw_2048)]
-    pmulhrsw             m0, m1
-    pmulhrsw             m2, m1
-%ifidn %2, adst
-    punpckhqdq           m1, m0, m0
-    punpcklqdq           m0, m0
-    punpckhqdq           m3, m2, m2
-    punpcklqdq           m2, m2
-%else ; flipadst
-    mova                 m3, m0
-    punpckhqdq           m0, m2, m2
-    punpcklqdq           m1, m2, m2
-    punpckhqdq           m2, m3, m3
-    punpcklqdq           m3, m3
-%endif
-%endif
-%endif
     TAIL_CALL m(iadst_8x4_internal).end2
 %endif
 %endmacro
 
-INV_TXFM_8X4_FN dct, dct,      0
-INV_TXFM_8X4_FN dct, adst,     0
-INV_TXFM_8X4_FN dct, flipadst, 0
-INV_TXFM_8X4_FN dct, identity, 3
+INV_TXFM_8X4_FN dct, dct
+INV_TXFM_8X4_FN dct, adst
+INV_TXFM_8X4_FN dct, flipadst
+INV_TXFM_8X4_FN dct, identity
 
 cglobal idct_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m3, [o(pw_2896x8)]
@@ -1157,7 +1002,7 @@ cglobal iflipadst_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 m3, m4
     jmp m(iadst_8x4_internal).end
 
-INV_TXFM_8X4_FN identity, dct,      7
+INV_TXFM_8X4_FN identity, dct
 INV_TXFM_8X4_FN identity, adst
 INV_TXFM_8X4_FN identity, flipadst
 INV_TXFM_8X4_FN identity, identity
@@ -1199,30 +1044,9 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     paddsw               m3, m7
     jmp m(iadst_8x4_internal).end
 
-%macro INV_TXFM_8X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x8, 8, 16*4
-%ifidn %1_%2, dct_identity
-    mova                 m0, [o(pw_2896x8)]
-    pmulhrsw             m0, [coeffq]
-    mova                 m1, [o(pw_16384)]
-    pmulhrsw             m0, m1
-    psrlw                m1, 2
-    pmulhrsw             m0, m1
-    punpckhwd            m7, m0, m0
-    punpcklwd            m0, m0
-    pshufd               m3, m0, q3333
-    pshufd               m2, m0, q2222
-    pshufd               m1, m0, q1111
-    pshufd               m0, m0, q0000
-    call m(iadst_8x4_internal).end2
-    pshufd               m3, m7, q3333
-    pshufd               m2, m7, q2222
-    pshufd               m1, m7, q1111
-    pshufd               m0, m7, q0000
-    lea                dstq, [dstq+strideq*2]
-    TAIL_CALL m(iadst_8x4_internal).end3
-%elif %3 >= 0
-%ifidn %1, dct
+%macro INV_TXFM_8X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x8, 8, 16*4
+%ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklwd            m0, m0
     mova                 m1, [o(pw_2896x8)]
@@ -1244,24 +1068,6 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp                tx2q
 .end3:
     RET
-%else ; identity
-    mova                 m0, [coeffq+16*0]
-    mova                 m1, [coeffq+16*1]
-    mova                 m2, [coeffq+16*2]
-    mova                 m3, [coeffq+16*3]
-    punpcklwd            m0, [coeffq+16*4]
-    punpcklwd            m1, [coeffq+16*5]
-    punpcklwd            m2, [coeffq+16*6]
-    punpcklwd            m3, [coeffq+16*7]
-    punpcklwd            m0, m2
-    punpcklwd            m1, m3
-    punpcklwd            m0, m1
-    pmulhrsw             m0, [o(pw_2896x8)]
-    pmulhrsw             m0, [o(pw_2048)]
-    pxor                 m4, m4
-    REPX {mova [coeffq+16*x], m4}, 0,  1,  2,  3,  4,  5,  6,  7
-    jmp m(inv_txfm_add_dct_dct_8x8).end
-%endif
 %endif
 %endmacro
 
@@ -1298,10 +1104,10 @@ cglobal iidentity_8x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     ITX_MULSUB_2W         %2, %5, %3, %6, %7, 2896, 2896, 1 ;t5, t6
 %endmacro
 
-INV_TXFM_8X8_FN dct, dct,      0
-INV_TXFM_8X8_FN dct, identity, 7
+INV_TXFM_8X8_FN dct, dct
 INV_TXFM_8X8_FN dct, adst
 INV_TXFM_8X8_FN dct, flipadst
+INV_TXFM_8X8_FN dct, identity
 
 cglobal idct_8x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS          coeffq, 16
@@ -1610,7 +1416,7 @@ ALIGN function_align
     mova    [rsp+gprsize+16*0], m7
     jmp m(idct_8x8_internal).end3
 
-INV_TXFM_8X8_FN identity, dct,      7
+INV_TXFM_8X8_FN identity, dct
 INV_TXFM_8X8_FN identity, adst
 INV_TXFM_8X8_FN identity, flipadst
 INV_TXFM_8X8_FN identity, identity
@@ -1634,58 +1440,9 @@ ALIGN function_align
     jmp m(idct_8x8_internal).end3
 
 
-%macro INV_TXFM_4X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 4x16, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    mova                 m0, [o(pw_2896x8)]
-    mova                 m1, m0
-    pmulhrsw             m0, [coeffq+16*0]
-    pmulhrsw             m1, [coeffq+16*1]
-    mova                 m2, [o(pw_16384)]
-    mova                 m3, [o(pw_1697x16)]
-    mova                 m4, [o(pw_2048)]
-    pmulhrsw             m0, m2
-    pmulhrsw             m1, m2
-    pmulhrsw             m2, m3, m0
-    pmulhrsw             m3, m1
-    paddsw               m0, m0
-    paddsw               m1, m1
-    paddsw               m0, m2
-    paddsw               m1, m3
-    pmulhrsw             m0, m4
-    pmulhrsw             m4, m1
-    punpckhwd            m2, m0, m0
-    punpcklwd            m0, m0
-    punpckhwd            m6, m4, m4
-    punpcklwd            m4, m4
-    punpckhdq            m1, m0, m0
-    punpckldq            m0, m0
-    punpckhdq            m3, m2, m2
-    punpckldq            m2, m2
-    punpckhdq            m5, m4, m4
-    punpckldq            m4, m4
-    punpckhdq            m7, m6, m6
-    punpckldq            m6, m6
-    mova      [coeffq+16*4], m4
-    TAIL_CALL m(iadst_4x16_internal).end2
-%elifidn %1_%2, identity_dct
-    movd                  m0, [coeffq+32*0]
-    punpcklwd             m0, [coeffq+32*1]
-    movd                  m1, [coeffq+32*2]
-    punpcklwd             m1, [coeffq+32*3]
-    punpckldq             m0, m1
-    pmulhrsw              m1, m0, [o(pw_1697x8)]
-    pcmpeqw               m2, m2
-    pcmpeqw               m2, m0
-    pxor                  m0, m2
-    pavgw                 m0, m1
-    pmulhrsw              m0, [o(pw_2896x8)]
-    pmulhrsw              m0, [o(pw_2048)]
-    punpcklqdq            m0, m0
-    pxor                  m1, m1
-    REPX     {mova [coeffq+32*x], m1}, 0,  1,  2,  3
-%elifidn %1_%2, dct_dct
+%macro INV_TXFM_4X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 4x16, 8
+%ifidn %1_%2, dct_dct
     pshuflw               m0, [coeffq], q0000
     punpcklwd             m0, m0
     mova                  m1, [o(pw_2896x8)]
@@ -1694,21 +1451,6 @@ ALIGN function_align
     pmulhrsw              m0, [o(pw_16384)]
     pmulhrsw              m0, m1
     pmulhrsw              m0, [o(pw_2048)]
-%else ; adst_dct / flipadst_dct
-    pshuflw               m0, [coeffq], q0000
-    punpcklwd             m0, m0
-%ifidn %1, adst
-    pmulhrsw              m0, [o(iadst4_dconly1a)]
-%else ; flipadst
-    pmulhrsw              m0, [o(iadst4_dconly1b)]
-%endif
-    mova                  m1, [o(pw_16384)]
-    mov             [coeffq], eobd
-    pmulhrsw              m0, m1
-    psrlw                 m1, 3                ; pw_2048
-    pmulhrsw              m0, [o(pw_2896x8)]
-    pmulhrsw              m0, m1
-%endif
 .end:
     WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
     lea                dstq, [dstq+strideq*4]
@@ -1721,10 +1463,10 @@ ALIGN function_align
 %endif
 %endmacro
 
-INV_TXFM_4X16_FN dct, dct,      0
-INV_TXFM_4X16_FN dct, identity, 15
+INV_TXFM_4X16_FN dct, dct
 INV_TXFM_4X16_FN dct, adst
 INV_TXFM_4X16_FN dct, flipadst
+INV_TXFM_4X16_FN dct, identity
 
 cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     lea                  r3, [o(m(idct_4x8_internal).pass1)]
@@ -1790,7 +1532,7 @@ cglobal idct_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
     ret
 
-INV_TXFM_4X16_FN adst, dct,      0
+INV_TXFM_4X16_FN adst, dct
 INV_TXFM_4X16_FN adst, adst
 INV_TXFM_4X16_FN adst, flipadst
 INV_TXFM_4X16_FN adst, identity
@@ -1858,7 +1600,7 @@ cglobal iadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     ret
 
 
-INV_TXFM_4X16_FN flipadst, dct,      0
+INV_TXFM_4X16_FN flipadst, dct
 INV_TXFM_4X16_FN flipadst, adst
 INV_TXFM_4X16_FN flipadst, flipadst
 INV_TXFM_4X16_FN flipadst, identity
@@ -1888,7 +1630,7 @@ cglobal iflipadst_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp   m(iadst_4x16_internal).end1
 
 
-INV_TXFM_4X16_FN identity, dct,      3
+INV_TXFM_4X16_FN identity, dct
 INV_TXFM_4X16_FN identity, adst
 INV_TXFM_4X16_FN identity, flipadst
 INV_TXFM_4X16_FN identity, identity
@@ -1964,68 +1706,11 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(iadst_4x16_internal).end2
 
 
-%macro INV_TXFM_16X4_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x4, 8
-%if %3 >= 0
-%ifidn %1_%2, dct_identity
-    mova                 m3, [o(pw_2896x8)]
-    pmulhrsw             m3, [coeffq]
-    mova                 m0, [o(pw_16384)]
-    pmulhrsw             m3, m0
-    psrlw                m0, 3                ; pw_2048
-    pmulhrsw             m1, m3, [o(pw_1697x8)]
-    paddsw               m3, m1
-    pmulhrsw             m3, m0
-    punpcklwd            m3, m3
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
-    lea                tx2q, [dstq+8]
-    call m(iadst_8x4_internal).end2
-    add              coeffq, 16*4
-    mov                dstq, tx2q
-    TAIL_CALL m(iadst_8x4_internal).end2
-%elifidn %1_%2, identity_dct
-    mova                 m4, [o(pw_1697x16)]
-    mova                 m5, [o(pw_16384)]
-    mova                 m6, [o(pw_2896x8)]
-    mov                 r3d, 2
-    psrlw                m7, m5, 3 ; pw_2048
-.main_loop:
-    mova                 m0, [coeffq+16*0]
-    mova                 m1, [coeffq+16*1]
-    punpckhwd            m2, m0, m1
-    punpcklwd            m0, m1
-    punpcklwd            m0, m2
-    mova                 m1, [coeffq+16*2]
-    mova                 m2, [coeffq+16*3]
-    punpckhwd            m3, m1, m2
-    punpcklwd            m1, m2
-    punpcklwd            m1, m3
-    punpcklqdq           m0, m1
-    pmulhrsw             m1, m4, m0
-    pmulhrsw             m1, m5
-    paddsw               m0, m1
-    pmulhrsw             m0, m6
-    pmulhrsw             m0, m7
-.end:
-    pxor                 m3, m3
-    mova      [coeffq+16*0], m3
-    mova      [coeffq+16*1], m3
-    mova      [coeffq+16*2], m3
-    mova      [coeffq+16*3], m3
-    add              coeffq, 16*4
-    lea                tx2q, [dstq+8]
-    WRITE_8X4            0, 0, 0, 0, 1, 2, 3
-    mov                dstq, tx2q
-    dec                 r3d
-    jg .main_loop
-    RET
-%else
+%macro INV_TXFM_16X4_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x4, 8
+%ifidn %1_%2, dct_dct
     movd                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1, [coeffq]
-%ifidn %2, dct
     movd                 m2, [o(pw_16384)]
     mov            [coeffq], eobd
     mov                 r2d, 2
@@ -2059,35 +1744,6 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp                tx2q
 .end:
     RET
-%else ; adst / flipadst
-    movd                 m2, [o(pw_16384)]
-    pmulhrsw             m0, m2
-    pshuflw              m0, m0, q0000
-    punpcklwd            m0, m0
-    mov            [coeffq], eobd
-    pmulhrsw             m2, m0, [o(iadst4_dconly2b)]
-    pmulhrsw             m0, [o(iadst4_dconly2a)]
-    mova                 m1, [o(pw_2048)]
-    pmulhrsw             m0, m1
-    pmulhrsw             m2, m1
-%ifidn %2, adst
-    punpckhqdq           m1, m0, m0
-    punpcklqdq           m0, m0
-    punpckhqdq           m3, m2, m2
-    punpcklqdq           m2, m2
-%else ; flipadst
-    mova                 m3, m0
-    punpckhqdq           m0, m2, m2
-    punpcklqdq           m1, m2, m2
-    punpckhqdq           m2, m3, m3
-    punpcklqdq           m3, m3
-%endif
-    lea                tx2q, [dstq+8]
-    call m(iadst_8x4_internal).end3
-    mov                dstq, tx2q
-    TAIL_CALL m(iadst_8x4_internal).end3
-%endif
-%endif
 %endif
 %endmacro
 
@@ -2144,10 +1800,10 @@ cglobal iidentity_4x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     punpcklqdq           m%1, m%6                      ;low: t8a   high: t9
 %endmacro
 
-INV_TXFM_16X4_FN dct, dct,      0
-INV_TXFM_16X4_FN dct, adst,     0
-INV_TXFM_16X4_FN dct, flipadst, 0
-INV_TXFM_16X4_FN dct, identity, 3
+INV_TXFM_16X4_FN dct, dct
+INV_TXFM_16X4_FN dct, adst
+INV_TXFM_16X4_FN dct, flipadst
+INV_TXFM_16X4_FN dct, identity
 
 cglobal idct_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_7ROWS        coeffq, 16
@@ -2464,7 +2120,7 @@ cglobal iflipadst_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp   m(idct_16x4_internal).pass2_end
 
 
-INV_TXFM_16X4_FN identity, dct,      15
+INV_TXFM_16X4_FN identity, dct
 INV_TXFM_16X4_FN identity, adst
 INV_TXFM_16X4_FN identity, flipadst
 INV_TXFM_16X4_FN identity, identity
@@ -2537,8 +2193,8 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                 [%1+%2*7], m7
 %endmacro
 
-%macro INV_TXFM_8X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 8x16, 8, 16*16
+%macro INV_TXFM_8X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 8x16, 8, 16*16
 %ifidn %1_%2, dct_dct
     pshuflw              m0, [coeffq], q0000
     punpcklwd            m0, m0
@@ -2556,78 +2212,13 @@ cglobal iidentity_16x4_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(inv_txfm_add_dct_dct_8x8).loop
 .end:
     RET
-%elifidn %1_%2, dct_identity
-    mov                 r3d, 2
-.loop:
-    mova                 m0, [o(pw_2896x8)]
-    pmulhrsw             m7, m0, [coeffq]
-    mova                 m1, [o(pw_16384)]
-    pxor                 m2, m2
-    mova           [coeffq], m2
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m1
-    psrlw                m1, 3          ; pw_2048
-    pmulhrsw             m0, m7, [o(pw_1697x16)]
-    paddsw               m7, m7
-    paddsw               m7, m0
-    pmulhrsw             m7, m1
-    punpcklwd            m0, m7, m7
-    punpckhwd            m7, m7
-    pshufd               m3, m0, q3333
-    pshufd               m2, m0, q2222
-    pshufd               m1, m0, q1111
-    pshufd               m0, m0, q0000
-    call m(iadst_8x4_internal).end3
-    pshufd               m3, m7, q3333
-    pshufd               m2, m7, q2222
-    pshufd               m1, m7, q1111
-    pshufd               m0, m7, q0000
-    lea                dstq, [dstq+strideq*2]
-    call m(iadst_8x4_internal).end3
-
-    add              coeffq, 16
-    lea                dstq, [dstq+strideq*2]
-    dec                 r3d
-    jg .loop
-    RET
-%elifidn %1_%2, identity_dct
-    movd                 m0, [coeffq+32*0]
-    punpcklwd            m0, [coeffq+32*1]
-    movd                 m2, [coeffq+32*2]
-    punpcklwd            m2, [coeffq+32*3]
-    add              coeffq, 32*4
-    movd                 m1, [coeffq+32*0]
-    punpcklwd            m1, [coeffq+32*1]
-    movd                 m3, [coeffq+32*2]
-    punpcklwd            m3, [coeffq+32*3]
-    mova                 m4, [o(pw_2896x8)]
-    xor                eobd, eobd
-    mov       [coeffq-32*4], eobd
-    mov       [coeffq-32*3], eobd
-    mov       [coeffq-32*2], eobd
-    mov       [coeffq-32*1], eobd
-    punpckldq            m0, m2
-    punpckldq            m1, m3
-    punpcklqdq           m0, m1
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, [o(pw_2048)]
-    mov       [coeffq+32*0], eobd
-    mov       [coeffq+32*1], eobd
-    mov       [coeffq+32*2], eobd
-    mov       [coeffq+32*3], eobd
-    mov                 r3d, 4
-    lea                tx2q, [o(m(inv_txfm_add_identity_dct_8x16).end)]
-    jmp m(inv_txfm_add_dct_dct_8x8).loop
-.end:
-    RET
 %endif
 %endmacro
 
-INV_TXFM_8X16_FN dct, dct,      0
-INV_TXFM_8X16_FN dct, identity, 15
+INV_TXFM_8X16_FN dct, dct
 INV_TXFM_8X16_FN dct, adst
 INV_TXFM_8X16_FN dct, flipadst
+INV_TXFM_8X16_FN dct, identity
 
 cglobal idct_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     lea                    r3, [o(m(idct_8x8_internal).pass1)]
@@ -2790,7 +2381,7 @@ cglobal iflipadst_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp  m(iflipadst_8x8_internal).end
 
 
-INV_TXFM_8X16_FN identity, dct,      7
+INV_TXFM_8X16_FN identity, dct
 INV_TXFM_8X16_FN identity, adst
 INV_TXFM_8X16_FN identity, flipadst
 INV_TXFM_8X16_FN identity, identity
@@ -2837,8 +2428,8 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp .end
 
 
-%macro INV_TXFM_16X8_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x8, 8, 16*16
+%macro INV_TXFM_16X8_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x8, 8, 16*16
 %ifidn %1_%2, dct_dct
     movd                 m1, [o(pw_2896x8)]
     pmulhrsw             m0, m1, [coeffq]
@@ -2850,83 +2441,13 @@ cglobal iidentity_8x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(inv_txfm_add_dct_dct_16x4).dconly
 .end:
     RET
-%elifidn %1_%2, dct_identity
-    mova                 m7, [coeffq]
-    mova                 m0, [o(pw_2896x8)]
-    mova                 m1, [o(pw_16384)]
-    pxor                 m2, m2
-    mova           [coeffq], m2
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m0
-    pmulhrsw             m7, m1
-    psrlw                m1, 2               ; pw_4096
-    pmulhrsw             m7, m1
-    punpcklwd            m3, m7, m7
-    punpckhwd            m7, m7
-    pshufd               m0, m3, q0000
-    pshufd               m1, m3, q1111
-    pshufd               m2, m3, q2222
-    pshufd               m3, m3, q3333
-    lea                  r3, [dstq+strideq*4]
-    lea                tx2q, [dstq+8]
-    call m(iadst_8x4_internal).end2
-    add              coeffq, 16*4
-    mov                dstq, tx2q
-    call m(iadst_8x4_internal).end2
-    mov                dstq, r3
-    add              coeffq, 16*4
-    pshufd               m0, m7, q0000
-    pshufd               m1, m7, q1111
-    pshufd               m2, m7, q2222
-    pshufd               m3, m7, q3333
-    lea                tx2q, [dstq+8]
-    call m(iadst_8x4_internal).end2
-    add              coeffq, 16*4
-    mov                dstq, tx2q
-    TAIL_CALL m(iadst_8x4_internal).end2
-%elifidn %1_%2, identity_dct
-    mova                 m4, [o(pw_2896x8)]
-    mova                 m5, [o(pw_1697x16)]
-    mova                 m6, [o(pw_16384)]
-    psrlw                m7, m6, 3 ; pw_2048
-    mov                 r3d, 2
-.main_loop:
-    mova                 m0, [coeffq+16*0]
-    punpcklwd            m0, [coeffq+16*1]
-    mova                 m1, [coeffq+16*2]
-    punpcklwd            m1, [coeffq+16*3]
-    punpckldq            m0, m1
-    mova                 m1, [coeffq+16*4]
-    punpcklwd            m1, [coeffq+16*5]
-    mova                 m2, [coeffq+16*6]
-    punpcklwd            m2, [coeffq+16*7]
-    punpckldq            m1, m2
-    punpcklqdq           m0, m1
-    pmulhrsw             m0, m4
-    pmulhrsw             m1, m5, m0
-    pmulhrsw             m1, m6
-    paddsw               m0, m1
-    pmulhrsw             m0, m4
-    pmulhrsw             m0, m7
-.end:
-    pxor                 m1, m1
-    REPX {mova [coeffq+16*x], m1}, 0, 1, 2, 3, 4, 5, 6, 7
-    add              coeffq, 16*8
-    lea                tx2q, [dstq+8]
-    WRITE_8X4             0, 0, 0, 0, 1, 2, 3
-    lea                dstq, [dstq+strideq*2]
-    WRITE_8X4             0, 0, 0, 0, 1, 2, 3
-    mov                dstq, tx2q
-    dec                 r3d
-    jg .main_loop
-    RET
 %endif
 %endmacro
 
-INV_TXFM_16X8_FN dct, dct,      0
-INV_TXFM_16X8_FN dct, identity, 7
+INV_TXFM_16X8_FN dct, dct
 INV_TXFM_16X8_FN dct, adst
 INV_TXFM_16X8_FN dct, flipadst
+INV_TXFM_16X8_FN dct, identity
 
 cglobal idct_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    coeffq+16*0, 32, 1
@@ -3382,7 +2903,7 @@ cglobal iflipadst_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(iflipadst_8x8_internal).pass2_main
 
 
-INV_TXFM_16X8_FN identity, dct,      15
+INV_TXFM_16X8_FN identity, dct
 INV_TXFM_16X8_FN identity, adst
 INV_TXFM_16X8_FN identity, flipadst
 INV_TXFM_16X8_FN identity, identity
@@ -3463,8 +2984,8 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp  m(iidentity_8x8_internal).end
 
 
-%macro INV_TXFM_16X16_FN 2-3 -1 ; type1, type2, fast_thresh
-    INV_TXFM_FN          %1, %2, %3, 16x16, 8, 16*16
+%macro INV_TXFM_16X16_FN 2 ; type1, type2
+    INV_TXFM_FN          %1, %2, 16x16, 8, 16*16
 %ifidn %1_%2, dct_dct
     movd                   m1, [o(pw_2896x8)]
     pmulhrsw               m0, m1, [coeffq]
@@ -3475,104 +2996,13 @@ cglobal iidentity_16x8_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(inv_txfm_add_dct_dct_16x4).dconly
 .end:
     RET
-%elifidn %1_%2, dct_identity
-    mova                   m3, [o(pw_2896x8)]
-    pmulhrsw               m2, m3, [coeffq+16*0]
-    pmulhrsw               m3, [coeffq+16*1]
-    mova                   m0, [o(pw_8192)]
-    mova                   m1, [o(pw_1697x16)]
-    pshuflw                m4, [o(deint_shuf)], q0000 ;pb_0_1
-    punpcklwd              m4, m4
-    pcmpeqb                m5, m5
-    pxor                   m6, m6
-    mova        [coeffq+16*0], m6
-    mova        [coeffq+16*1], m6
-    paddb                  m5, m5                     ;pb_m2
-    pmulhrsw               m2, m0
-    pmulhrsw               m3, m0
-    psrlw                  m0, 2                      ;pw_2048
-    pmulhrsw               m7, m1, m2
-    pmulhrsw               m1, m3
-    paddsw                 m2, m2
-    paddsw                 m3, m3
-    paddsw                 m2, m7
-    paddsw                 m3, m1
-    pmulhrsw               m2, m0
-    pmulhrsw               m3, m0
-    mov                   r3d, 8
-.loop:
-    mova                   m1, [dstq]
-    pshufb                 m0, m2, m4
-    punpckhbw              m7, m1, m6
-    punpcklbw              m1, m6
-    paddw                  m7, m0
-    paddw                  m1, m0
-    packuswb               m1, m7
-    mova               [dstq], m1
-    mova                   m1, [dstq+strideq*8]
-    pshufb                 m0, m3, m4
-    psubb                  m4, m5 ; += 2
-    punpckhbw              m7, m1, m6
-    punpcklbw              m1, m6
-    paddw                  m7, m0
-    paddw                  m1, m0
-    packuswb               m1, m7
-    mova     [dstq+strideq*8], m1
-    add                  dstq, strideq
-    dec                   r3d
-    jg .loop
-    RET
-%elifidn %1_%2, identity_dct
-    mova                   m4, [o(pw_1697x16)]
-    mova                   m5, [o(pw_2896x8)]
-    mova                   m6, [o(pw_2048)]
-    xor                  eobd, eobd
-    lea                  tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end)]
-    lea                    r3, [dstq+8]
-    mov            [rsp+16*0], r3
-.main:
-    movd                   m0, [coeffq+32*0]
-    punpcklwd              m0, [coeffq+32*1]
-    movd                   m1, [coeffq+32*2]
-    punpcklwd              m1, [coeffq+32*3]
-    add                coeffq, 32*4
-    punpckldq              m0, m1
-    movd                   m1, [coeffq+32*0]
-    punpcklwd              m1, [coeffq+32*1]
-    movd                   m2, [coeffq+32*2]
-    punpcklwd              m2, [coeffq+32*3]
-    xor                  eobd, eobd
-    mov         [coeffq-32*4], eobd
-    mov         [coeffq-32*3], eobd
-    mov         [coeffq-32*2], eobd
-    mov         [coeffq-32*1], eobd
-    punpckldq              m1, m2
-    punpcklqdq             m0, m1
-    pmulhrsw               m1, m4, m0
-    psraw                  m1, 1
-    pavgw                  m0, m1
-    pmulhrsw               m0, m5
-    pmulhrsw               m0, m6
-    mov         [coeffq+32*0], eobd
-    mov         [coeffq+32*1], eobd
-    mov         [coeffq+32*2], eobd
-    mov         [coeffq+32*3], eobd
-    mov                   r3d, 4
-    jmp m(inv_txfm_add_dct_dct_8x8).loop
-.end:
-    lea                  tx2q, [o(m(inv_txfm_add_identity_dct_16x16).end1)]
-    add                coeffq, 32*4
-    mov                  dstq, [rsp+16*0]
-    jmp .main
-.end1:
-    RET
 %endif
 %endmacro
 
-INV_TXFM_16X16_FN dct, dct,      0
-INV_TXFM_16X16_FN dct, identity, 15
+INV_TXFM_16X16_FN dct, dct
 INV_TXFM_16X16_FN dct, adst
 INV_TXFM_16X16_FN dct, flipadst
+INV_TXFM_16X16_FN dct, identity
 
 cglobal idct_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+16*1, 64
@@ -3865,7 +3295,7 @@ cglobal iflipadst_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
     pavgw               m%1, m%2
 %endmacro
 
-INV_TXFM_16X16_FN identity, dct,      15
+INV_TXFM_16X16_FN identity, dct
 INV_TXFM_16X16_FN identity, identity
 
 cglobal iidentity_16x16_internal, 0, 0, 0, dst, stride, coeff, eob, tx2
diff --git a/ffmpeg/JNI/dav1d/src/x86/looprestoration.asm b/ffmpeg/JNI/dav1d/src/x86/looprestoration.asm
index e2acae7b2..3e3c35c34 100644
--- a/ffmpeg/JNI/dav1d/src/x86/looprestoration.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/looprestoration.asm
@@ -51,9 +51,12 @@ cextern sgr_x_by_x
 SECTION .text
 
 INIT_YMM avx2
-cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
+cglobal wiener_filter_h, 5, 12, 16, dst, left, src, stride, fh, w, h, edge
+    mov        edged, edgem
     vpbroadcastb m15, [fhq+0]
+    movifnidn     wd, wm
     vpbroadcastb m14, [fhq+2]
+    mov           hd, hm
     vpbroadcastb m13, [fhq+4]
     vpbroadcastw m12, [fhq+6]
     vpbroadcastd m11, [pw_2048]
@@ -64,7 +67,7 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
 
     ; if (edge & has_right) align_w_to_32
     ; else w -= 32, and use that as limit in x loop
-    test       edged, 2 ; has_right
+    test       edgeb, 2 ; has_right
     jnz .align
     mov        xlimq, -3
     jmp .loop
@@ -80,7 +83,7 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
     lea           xq, [wq+xlimq]
 
     ; load left edge pixels
-    test       edged, 1 ; have_left
+    test       edgeb, 1 ; have_left
     jz .emu_left
     test       leftq, leftq ; left == NULL for the edge-extended bottom/top
     jz .load_left_combined
@@ -169,14 +172,21 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
     paddw         m2, m4
     paddw         m0, m6
     paddw         m2, m5
-    paddsw        m0, m8
+    ; for a signed overflow to happen we need filter and pixels as follow:
+    ; filter => -5,-23,-17,90,-17,-23,-5
+    ; pixels => 255,255,255,0,255,255,255 or 0,0,0,255,0,0,0
+    ; m0 would fall in the range [-59A6;+59A6] = [A65A;59A6]
+    ; m8 would fall in the range [-3FFC;+3F84] = [C004;3F84]
+    ;  32-bit arithmetic m0+m8 = [-99A2;+992A] = [FFFF665E;992A]
+    ; => signed 16-bit overflow occurs
+    paddsw        m0, m8  ; paddsw clips this range to [-8000;+7FFF]
     paddsw        m2, m3
-    psraw         m0, 3
+    psraw         m0, 3   ; shift changes the range to [-1000;+FFF]
     psraw         m2, 3
-    paddw         m0, m11
-    paddw         m2, m11
-    mova   [dstptrq], xm0
-    mova [dstptrq+16], xm2
+    paddw         m0, m11 ; adding back 800 (removed in m8) changes the
+    paddw         m2, m11 ; range to [-800;+17FF] as defined in the spec
+    mova   [dstptrq], xm0 ; (note that adding another 800 would give us
+    mova [dstptrq+16], xm2;  the same range as in the C code => [0;1FFF])
     vextracti128 [dstptrq+32], m0, 1
     vextracti128 [dstptrq+48], m2, 1
     vextracti128 xm0, m1, 1
@@ -196,17 +206,19 @@ cglobal wiener_filter_h, 8, 12, 16, dst, left, src, stride, fh, w, h, edge
     jg .loop
     RET
 
-cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
-    vpbroadcastd m14, [fvq+4]
-    vpbroadcastd m15, [fvq]
-    vpbroadcastd m13, [pw_0_128]
-    paddw        m14, m13
+cglobal wiener_filter_v, 4, 10, 13, dst, stride, mid, w, h, fv, edge
+    movifnidn    fvq, fvmp
+    mov        edged, edgem
+    movifnidn     hd, hm
+    vpbroadcastd m10, [fvq]
+    vpbroadcastd m11, [fvq+4]
+    vpbroadcastd  m0, [pw_0_128]
     vpbroadcastd m12, [pd_1024]
 
     DEFINE_ARGS dst, stride, mid, w, h, ylim, edge, y, mptr, dstptr
-    mov        ylimd, edged
-    and        ylimd, 8 ; have_bottom
-    shr        ylimd, 2
+    rorx       ylimd, edged, 2
+    paddw        m11, m0
+    and        ylimd, 2 ; have_bottom
     sub        ylimd, 3
 
     ; main x loop for vertical filter, does one column of 16 pixels
@@ -214,7 +226,7 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
     mova          m3, [midq] ; middle line
 
     ; load top pixels
-    test       edged, 4 ; have_top
+    test       edgeb, 4 ; have_top
     jz .emu_top
     mova          m0, [midq-384*4]
     mova          m2, [midq-384*2]
@@ -269,27 +281,28 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
     ; try to structure the loop so that the common case is evaluated fastest
     mova          m6, [mptrq+384*6]
 .loop:
-    paddw         m7, m0, m6
-    paddw         m8, m1, m5
-    paddw         m9, m2, m4
-    punpcklwd    m10, m7, m8
-    punpckhwd     m7, m8
-    punpcklwd    m11, m9, m3
-    punpckhwd     m9, m3
-    pmaddwd      m10, m15
-    pmaddwd       m7, m15
-    pmaddwd      m11, m14
-    pmaddwd       m9, m14
-    paddd        m10, m11
+    paddw         m0, m6
+    paddw         m7, m1, m5
+    paddw         m8, m2, m4
+    punpcklwd     m9, m0, m7
+    punpckhwd     m0, m7
+    punpcklwd     m7, m8, m3
+    punpckhwd     m8, m3
+    pmaddwd       m9, m10
+    pmaddwd       m0, m10
+    pmaddwd       m7, m11
+    pmaddwd       m8, m11
+    add        mptrq, 384*2
     paddd         m7, m9
-    paddd        m10, m12
+    paddd         m0, m8
     paddd         m7, m12
-    psrad        m10, 11
+    paddd         m0, m12
     psrad         m7, 11
-    packssdw     m10, m7
-    packuswb     m10, m10
-    vpermq       m10, m10, q3120
-    mova   [dstptrq], xm10
+    psrad         m0, 11
+    packssdw      m7, m0
+    vextracti128 xm0, m7, 1
+    packuswb     xm7, xm0
+    mova   [dstptrq], xm7
     ; shift pixels one position
     mova          m0, m1
     mova          m1, m2
@@ -298,51 +311,51 @@ cglobal wiener_filter_v, 7, 10, 16, dst, stride, mid, w, h, fv, edge
     mova          m4, m5
     mova          m5, m6
     add      dstptrq, strideq
-    add        mptrq, 384*2
     dec           yd
     jg .loop_load
     ; for the bottom pixels, continue using m6 (as extended edge)
     cmp           yd, ylimd
     jg .loop
-
-    add         dstq, 16
     add         midq, 32
+    add         dstq, 16
     sub           wd, 16
     jg .loop_x
     RET
 
 INIT_YMM avx2
-cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
-    mov        xlimd, edged
+cglobal sgr_box3_h, 5, 11, 7, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+    mov        xlimd, edgem
+    movifnidn     wd, wm
+    mov           hd, hm
+    mov        edged, xlimd
     and        xlimd, 2                             ; have_right
-    add           wd, xlimd
-    xor        xlimd, 2                             ; 2*!have_right
-    jnz .no_right
-    add           wd, 15
+    jz .no_right
+    add           wd, 2+15
     and           wd, ~15
 .no_right:
+    lea          r10, [pb_right_ext_mask+32]
+    xor        xlimd, 2                             ; 2*!have_right
     pxor          m1, m1
-    lea         srcq, [srcq+wq]
+    add         srcq, wq
     lea         sumq, [sumq+wq*2-2]
     lea       sumsqq, [sumsqq+wq*4-4]
     neg           wq
-    lea          r10, [pb_right_ext_mask+32]
 .loop_y:
     mov           xq, wq
 
     ; load left
-    test       edged, 1                             ; have_left
+    test       edgeb, 1                             ; have_left
     jz .no_left
     test       leftq, leftq
     jz .load_left_from_main
-    pinsrw       xm0, [leftq+2], 7
+    vpbroadcastw xm0, [leftq+2]
     add        leftq, 4
     jmp .expand_x
 .no_left:
     vpbroadcastb xm0, [srcq+xq]
     jmp .expand_x
 .load_left_from_main:
-    pinsrw       xm0, [srcq+xq-2], 7
+    vpbroadcastw xm0, [srcq+xq-2]
 .expand_x:
     punpckhbw    xm0, xm1
 
@@ -352,8 +365,8 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
 .partial_load_and_extend:
     vpbroadcastb  m3, [srcq-1]
     pmovzxbw      m2, [srcq+xq]
-    punpcklbw     m3, m1
     movu          m4, [r10+xq*2]
+    punpcklbw     m3, m1
     pand          m2, m4
     pandn         m4, m3
     por           m2, m4
@@ -373,22 +386,21 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
     punpcklwd     m5, m3, m2
     punpckhwd     m6, m3, m2
     paddw         m3, m4
-    punpcklwd     m7, m4, m1
+    punpcklwd     m0, m4, m1
     punpckhwd     m4, m1
     pmaddwd       m5, m5
     pmaddwd       m6, m6
-    pmaddwd       m7, m7
+    pmaddwd       m0, m0
     pmaddwd       m4, m4
-    paddd         m5, m7
-    paddd         m6, m4
     paddw         m3, m2
+    paddd         m5, m0
+    vextracti128 xm0, m2, 1
+    paddd         m6, m4
     movu [sumq+xq*2], m3
-    movu [sumsqq+xq*4+ 0], xm5
-    movu [sumsqq+xq*4+16], xm6
+    movu         [sumsqq+xq*4+ 0], xm5
+    movu         [sumsqq+xq*4+16], xm6
     vextracti128 [sumsqq+xq*4+32], m5, 1
     vextracti128 [sumsqq+xq*4+48], m6, 1
-
-    vextracti128 xm0, m2, 1
     add           xq, 16
 
     ; if x <= -16 we can reload more pixels
@@ -411,25 +423,25 @@ cglobal sgr_box3_h, 8, 11, 8, sumsq, sum, left, src, stride, w, h, edge, x, xlim
     RET
 
 INIT_YMM avx2
-cglobal sgr_box3_v, 5, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+    movifnidn  edged, edgem
     mov           xq, -2
-    mov        ylimd, edged
-    and        ylimd, 8                             ; have_bottom
-    shr        ylimd, 2
+    rorx       ylimd, edged, 2
+    and        ylimd, 2                             ; have_bottom
     sub        ylimd, 2                             ; -2 if have_bottom=0, else 0
 .loop_x:
     lea           yd, [hq+ylimq+2]
     lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
     lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
-    test       edged, 4                             ; have_top
+    test       edgeb, 4                             ; have_top
     jnz .load_top
     movu          m0, [sumsq_ptrq+(384+16)*4*1]
     movu          m1, [sumsq_ptrq+(384+16)*4*1+32]
+    movu          m6, [sum_ptrq+(384+16)*2*1]
     mova          m2, m0
     mova          m3, m1
     mova          m4, m0
     mova          m5, m1
-    movu          m6, [sum_ptrq+(384+16)*2*1]
     mova          m7, m6
     mova          m8, m6
     jmp .loop_y_noload
@@ -543,8 +555,10 @@ cglobal sgr_calc_ab1, 4, 6, 11, a, b, w, h, s
     RET
 
 INIT_YMM avx2
-cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \
+cglobal sgr_finish_filter1, 5, 13, 16, t, src, stride, a, b, w, h, \
                                        tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+    movifnidn     wd, wm
+    mov           hd, hm
     vpbroadcastd m15, [pw_16]
     xor           xd, xd
 .loop_x:
@@ -647,75 +661,83 @@ cglobal sgr_finish_filter1, 7, 13, 16, t, src, stride, a, b, w, h, \
     RET
 
 INIT_YMM avx2
-cglobal sgr_weighted1, 6, 6, 7, dst, stride, t, w, h, wt
-    movd         xm0, wtd
-    vpbroadcastw  m0, xm0
-    psllw         m0, 4
+cglobal sgr_weighted1, 4, 6, 6, dst, stride, t, w, h, wt
+%ifidn wtd, wtm
+    shl          wtd, 4
+    movd         xm5, wtd
+    vpbroadcastw  m5, xm5
+%else
+    vpbroadcastw  m5, wtm
+    mov           hd, hm
+    psllw         m5, 4
+%endif
     DEFINE_ARGS dst, stride, t, w, h, idx
 .loop_y:
     xor         idxd, idxd
 .loop_x:
-    mova          m1, [tq+idxq*2+ 0]
-    mova          m4, [tq+idxq*2+32]
+    mova          m0, [tq+idxq*2+ 0]
+    mova          m1, [tq+idxq*2+32]
     pmovzxbw      m2, [dstq+idxq+ 0]
-    pmovzxbw      m5, [dstq+idxq+16]
-    psllw         m3, m2, 4
-    psllw         m6, m5, 4
-    psubw         m1, m3
-    psubw         m4, m6
-    pmulhrsw      m1, m0
-    pmulhrsw      m4, m0
-    paddw         m1, m2
-    paddw         m4, m5
-    packuswb      m1, m4
-    vpermq        m1, m1, q3120
-    mova [dstq+idxq], m1
+    pmovzxbw      m3, [dstq+idxq+16]
+    psllw         m4, m2, 4
+    psubw         m0, m4
+    psllw         m4, m3, 4
+    psubw         m1, m4
+    pmulhrsw      m0, m5
+    pmulhrsw      m1, m5
+    paddw         m0, m2
+    paddw         m1, m3
+    packuswb      m0, m1
+    vpermq        m0, m0, q3120
+    mova [dstq+idxq], m0
     add         idxd, 32
     cmp         idxd, wd
     jl .loop_x
+    add           tq, 384*2
     add         dstq, strideq
-    add           tq, 384 * 2
     dec           hd
     jg .loop_y
     RET
 
 INIT_YMM avx2
-cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
-    test       edged, 2                             ; have_right
+cglobal sgr_box5_h, 5, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xlim
+    mov        edged, edgem
+    movifnidn     wd, wm
+    mov           hd, hm
+    test       edgeb, 2                             ; have_right
     jz .no_right
     xor        xlimd, xlimd
-    add           wd, 2
-    add           wd, 15
+    add           wd, 2+15
     and           wd, ~15
     jmp .right_done
 .no_right:
     mov        xlimd, 3
     sub           wd, 1
 .right_done:
+    lea          r10, [pb_right_ext_mask+32]
     pxor          m1, m1
     lea         srcq, [srcq+wq+1]
     lea         sumq, [sumq+wq*2-2]
     lea       sumsqq, [sumsqq+wq*4-4]
     neg           wq
-    lea          r10, [pb_right_ext_mask+32]
 .loop_y:
     mov           xq, wq
 
     ; load left
-    test       edged, 1                             ; have_left
+    test       edgeb, 1                             ; have_left
     jz .no_left
     test       leftq, leftq
     jz .load_left_from_main
-    movd         xm0, [leftq]
-    pinsrd       xm0, [srcq+xq-1], 1
-    pslldq       xm0, 11
+    vpbroadcastd xm2, [leftq]
+    movd         xm0, [srcq+xq-1]
     add        leftq, 4
+    palignr      xm0, xm2, 1
     jmp .expand_x
 .no_left:
     vpbroadcastb xm0, [srcq+xq-1]
     jmp .expand_x
 .load_left_from_main:
-    pinsrd       xm0, [srcq+xq-4], 3
+    vpbroadcastd xm0, [srcq+xq-4]
 .expand_x:
     punpckhbw    xm0, xm1
 
@@ -727,8 +749,8 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli
 .partial_load_and_extend:
     vpbroadcastb  m3, [srcq-1]
     pmovzxbw      m2, [srcq+xq]
-    punpcklbw     m3, m1
     movu          m4, [r10+xq*2]
+    punpcklbw     m3, m1
     pand          m2, m4
     pandn         m4, m3
     por           m2, m4
@@ -768,8 +790,8 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli
     paddd         m7, m9
     paddd         m3, m5
     movu [sumq+xq*2], m0
-    movu [sumsqq+xq*4+ 0], xm7
-    movu [sumsqq+xq*4+16], xm3
+    movu         [sumsqq+xq*4+ 0], xm7
+    movu         [sumsqq+xq*4+16], xm3
     vextracti128 [sumsqq+xq*4+32], m7, 1
     vextracti128 [sumsqq+xq*4+48], m3, 1
 
@@ -788,35 +810,35 @@ cglobal sgr_box5_h, 8, 11, 10, sumsq, sum, left, src, stride, w, h, edge, x, xli
     cmp           xd, xlimd
     jl .right_extend
 
+    add         srcq, strideq
     add       sumsqq, (384+16)*4
     add         sumq, (384+16)*2
-    add         srcq, strideq
     dec hd
     jg .loop_y
     RET
 
 INIT_YMM avx2
-cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+cglobal sgr_box5_v, 4, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr, ylim
+    movifnidn  edged, edgem
     mov           xq, -2
-    mov        ylimd, edged
-    and        ylimd, 8                             ; have_bottom
-    shr        ylimd, 2
+    rorx       ylimd, edged, 2
+    and        ylimd, 2                             ; have_bottom
     sub        ylimd, 3                             ; -3 if have_bottom=0, else -1
 .loop_x:
     lea           yd, [hq+ylimq+2]
     lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
     lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
-    test       edged, 4                             ; have_top
+    test       edgeb, 4                             ; have_top
     jnz .load_top
     movu          m0, [sumsq_ptrq+(384+16)*4*1]
     movu          m1, [sumsq_ptrq+(384+16)*4*1+32]
+    movu         m10, [sum_ptrq+(384+16)*2*1]
     mova          m2, m0
     mova          m3, m1
     mova          m4, m0
     mova          m5, m1
     mova          m6, m0
     mova          m7, m1
-    movu         m10, [sum_ptrq+(384+16)*2*1]
     mova         m11, m10
     mova         m12, m10
     mova         m13, m10
@@ -826,10 +848,10 @@ cglobal sgr_box5_v, 5, 10, 15, sumsq, sum, w, h, edge, x, y, sumsq_ptr, sum_ptr,
     movu          m1, [sumsq_ptrq-(384+16)*4*1+32]   ; l3/4sq [right]
     movu          m4, [sumsq_ptrq-(384+16)*4*0]      ; l2sq [left]
     movu          m5, [sumsq_ptrq-(384+16)*4*0+32]   ; l2sq [right]
-    mova          m2, m0
-    mova          m3, m1
     movu         m10, [sum_ptrq-(384+16)*2*1]        ; l3/4
     movu         m12, [sum_ptrq-(384+16)*2*0]        ; l2
+    mova          m2, m0
+    mova          m3, m1
     mova         m11, m10
 .loop_y:
     movu          m6, [sumsq_ptrq+(384+16)*4*1]      ; l1sq [left]
@@ -960,8 +982,10 @@ cglobal sgr_calc_ab2, 4, 6, 11, a, b, w, h, s
     RET
 
 INIT_YMM avx2
-cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \
+cglobal sgr_finish_filter2, 5, 13, 13, t, src, stride, a, b, w, h, \
                                        tmp_ptr, src_ptr, a_ptr, b_ptr, x, y
+    movifnidn     wd, wm
+    mov           hd, hm
     vpbroadcastd  m9, [pw_5_6]
     vpbroadcastd m12, [pw_256]
     psrlw        m11, m12, 1                    ; pw_128
@@ -1077,8 +1101,10 @@ cglobal sgr_finish_filter2, 7, 13, 13, t, src, stride, a, b, w, h, \
     RET
 
 INIT_YMM avx2
-cglobal sgr_weighted2, 7, 7, 11, dst, stride, t1, t2, w, h, wt
-    vpbroadcastd  m0, [wtq]
+cglobal sgr_weighted2, 4, 7, 11, dst, stride, t1, t2, w, h, wt
+    movifnidn     wd, wm
+    movifnidn     hd, hm
+    vpbroadcastd  m0, wtm
     vpbroadcastd m10, [pd_1024]
     DEFINE_ARGS dst, stride, t1, t2, w, h, idx
 .loop_y:
diff --git a/ffmpeg/JNI/dav1d/src/x86/looprestoration_init_tmpl.c b/ffmpeg/JNI/dav1d/src/x86/looprestoration_init_tmpl.c
index a1b25a90c..b0201ce3d 100644
--- a/ffmpeg/JNI/dav1d/src/x86/looprestoration_init_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/x86/looprestoration_init_tmpl.c
@@ -169,7 +169,7 @@ void dav1d_sgr_weighted1_##ext(pixel *dst, const ptrdiff_t stride, \
 void dav1d_sgr_weighted2_##ext(pixel *dst, const ptrdiff_t stride, \
                                const coef *t1, const coef *t2, \
                                const int w, const int h, \
-                               const int16_t wt[2]); \
+                               const uint32_t wt); \
 \
 static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
                              const pixel (*const left)[4], \
@@ -194,7 +194,7 @@ static void sgr_filter_##ext(pixel *const dst, const ptrdiff_t dst_stride, \
                                w, h, dav1d_sgr_params[sgr_idx][2], edges); \
         dav1d_sgr_filter1_##ext(tmp2, dst, dst_stride, left, lpf, lpf_stride, \
                                w, h, dav1d_sgr_params[sgr_idx][3], edges); \
-        const int16_t wt[2] = { sgr_wt[0], 128 - sgr_wt[0] - sgr_wt[1] }; \
+        const uint32_t wt = ((128 - sgr_wt[0] - sgr_wt[1]) << 16) | (uint16_t) sgr_wt[0]; \
         dav1d_sgr_weighted2_##ext(dst, dst_stride, tmp1, tmp2, w, h, wt); \
     } \
 }
diff --git a/ffmpeg/JNI/dav1d/src/x86/looprestoration_ssse3.asm b/ffmpeg/JNI/dav1d/src/x86/looprestoration_ssse3.asm
index df72d26ff..aaaea7835 100644
--- a/ffmpeg/JNI/dav1d/src/x86/looprestoration_ssse3.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/looprestoration_ssse3.asm
@@ -188,13 +188,13 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
  %define srcptrq    srcq
  %define dstptrq    dstq
  %define hd         dword [esp+ 0]
- %define edged      dword [esp+12]
+ %define edgeb      byte  [esp+12]
  %define xlimd      dword [esp+16]
 %endif
 
     ; if (edge & has_right) align_w_to_16
     ; else w -= 3, and use that as limit in x loop
-    test       edged, 2 ; has_right
+    test       edgeb, 2 ; has_right
     jnz .align
     mov        xlimd, -3
     jmp .loop
@@ -221,7 +221,7 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
 %endif
 
     ; load left edge pixels
-    test       edged, 1 ; have_left
+    test       edgeb, 1 ; have_left
     jz .emu_left
     test       leftq, leftq ; left == NULL for the edge-extended bottom/top
     jz .load_left_combined
@@ -359,8 +359,8 @@ cglobal wiener_filter_h, 5, 7, 8, -84, dst, left, src, stride, fh, w, h, edge
     paddw         m2, m4
     paddw         m0, m3
     paddw         m2, m5
-    paddsw        m0, m8
-    paddsw        m2, m6
+    paddsw        m0, m8 ; see the avx2 for an explanation
+    paddsw        m2, m6 ; of how the clipping works here
     psraw         m0, 3
     psraw         m2, 3
     paddw         m0, m11
@@ -477,7 +477,7 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
     DEFINE_ARGS dst, stride, mid, w, h, y, edge
  %define mptrq      midq
  %define dstptrq    dstq
- %define edged      dword [esp]
+ %define edgeb      byte [esp]
 %endif
 
     ; main x loop for vertical filter, does one column of 16 pixels
@@ -485,7 +485,7 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
     mova          m3, [midq] ; middle line
 
     ; load top pixels
-    test       edged, 4 ; have_top
+    test       edgeb, 4 ; have_top
     jz .emu_top
     mova          m0, [midq-384*4]
     mova          m2, [midq-384*2]
@@ -604,8 +604,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
     mova          m3, m4
     mova          m4, m5
     mova          m5, m6
-    add      dstptrq, strideq
     add        mptrq, 384*2
+    add      dstptrq, strideq
     dec           yd
     jg .loop_load
     ; for the bottom pixels, continue using m6 (as extended edge)
@@ -616,8 +616,8 @@ cglobal wiener_filter_v, 5, 7, 8, -96, dst, stride, mid, w, h, fv, edge
     mov         midq, [esp+8]
     mov         dstq, [esp+4]
 %endif
-    add         dstq, 8
     add         midq, 16
+    add         dstq, 8
     sub           wd, 8
     jg .loop_x
     RET
@@ -679,7 +679,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
  %define wq     r0m
  %define xlimd  r1m
  %define hd     hmp
- %define edged  edgemp
+ %define edgeb  byte edgem
 
     mov           r6, edgem
     and           r6, 2                             ; have_right
@@ -706,7 +706,7 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
     mov           xq, wq
 
     ; load left
-    test       edged, 1                             ; have_left
+    test       edgeb, 1                             ; have_left
     jz .no_left
     test       leftq, leftq
     jz .load_left_from_main
@@ -795,11 +795,13 @@ cglobal sgr_box3_h, 6, 7, 8, sumsq, sum, left, src, stride, x, h, edge, w, xlim
 cglobal sgr_box3_v, 4, 10, 9, sumsq, sum, w, h, edge, x, y, sumsq_base, sum_base, ylim
     movifnidn  edged, edgem
 %else
-cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y
+cglobal sgr_box3_v, 3, 7, 8, -28, sumsq, sum, w, edge, h, x, y
  %define sumsq_baseq dword [esp+0]
  %define sum_baseq   dword [esp+4]
  %define ylimd       dword [esp+8]
  %define m8          [esp+12]
+    mov        edged, r4m
+    mov           hd, r3m
 %endif
     mov           xq, -2
 %if ARCH_X86_64
@@ -812,7 +814,7 @@ cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y
 .loop_x:
     mov       sumsqq, sumsq_baseq
     mov         sumq, sum_baseq
-    lea           yd, [hd+ylimd+2]
+    lea           yd, [hq+ylimq+2]
 %else
     mov           yd, edged
     and           yd, 8                             ; have_bottom
@@ -824,12 +826,12 @@ cglobal sgr_box3_v, 5, 7, 8, -28, sumsq, sum, w, h, edge, x, y
 .loop_x:
     mov       sumsqd, sumsq_baseq
     mov         sumd, sum_baseq
-    lea           yd, [hd+2]
+    lea           yd, [hq+2]
     add           yd, ylimd
 %endif
     lea       sumsqq, [sumsqq+xq*4+4-(384+16)*4]
     lea         sumq, [sumq+xq*2+2-(384+16)*2]
-    test       edged, 4                             ; have_top
+    test       edgeb, 4                             ; have_top
     jnz .load_top
     movu          m0, [sumsqq+(384+16)*4*1]
     movu          m1, [sumsqq+(384+16)*4*1+16]
@@ -1180,10 +1182,10 @@ cglobal sgr_finish_filter1, 7, 7, 8, -144, t, src, stride, a, b, x, y
     psubd         m3, [aq-(384+16)*4*2+16]          ; a:ctr+bottom [second half]
 %endif
 
+    add         srcq, strideq
     add           aq, (384+16)*4
     add           bq, (384+16)*2
     add           tq, 384*2
-    add         srcq, strideq
     dec           yd
     jg .loop_y
     add           xd, 8
@@ -1237,7 +1239,7 @@ cglobal sgr_box5_h, 5, 11, 12, sumsq, sum, left, src, stride, w, h, edge, x, xli
     mova         m11, [pb_0_1]
 %else
 cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
- %define edged      edgemp
+ %define edgeb      byte edgem
  %define wd         xd
  %define wq         wd
  %define wm         r5m
@@ -1249,7 +1251,7 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
  %define m11    [PIC_sym(pb_0_1)]
 %endif
 
-    test       edged, 2                             ; have_right
+    test       edgeb, 2                             ; have_right
     jz .no_right
     xor        xlimd, xlimd
     add           wd, 2
@@ -1275,7 +1277,7 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
 .loop_y:
     mov           xq, wq
     ; load left
-    test       edged, 1                             ; have_left
+    test       edgeb, 1                             ; have_left
     jz .no_left
     test       leftq, leftq
     jz .load_left_from_main
@@ -1401,9 +1403,9 @@ cglobal sgr_box5_h, 7, 7, 8, sumsq, sum, left, src, xlim, x, h, edge
     cmp           xd, xlimd
     jl .right_extend
 
+    add         srcq, strideq
     add       sumsqq, (384+16)*4
     add         sumq, (384+16)*2
-    add         srcq, strideq
     dec           hd
     jg .loop_y
 %if ARCH_X86_32
@@ -1434,7 +1436,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
     lea           yd, [hd+ylimd+2]
     lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
     lea     sum_ptrq, [  sumq+xq*2+2-(384+16)*2]
-    test       edged, 4                             ; have_top
+    test       edgeb, 4                             ; have_top
     jnz .load_top
     movu          m0, [sumsq_ptrq+(384+16)*4*1]
     movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
@@ -1520,7 +1522,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
     lea           yd, [ylimd+2]
     add           yd, hm
     lea   sumsq_ptrq, [sumsqq+xq*4+4-(384+16)*4]
-    test dword edgem, 4                             ; have_top
+    test  byte edgem, 4                             ; have_top
     jnz .sumsq_load_top
     movu          m0, [sumsq_ptrq+(384+16)*4*1]
     movu          m1, [sumsq_ptrq+(384+16)*4*1+16]
@@ -1582,7 +1584,7 @@ cglobal sgr_box5_v, 5, 7, 8, -44, sumsq, sum, x, y, ylim, sumsq_ptr, sum_ptr
     lea           yd, [ylimd+2]
     add           yd, hm
     lea     sum_ptrq, [sumq+xq*2+2-(384+16)*2]
-    test dword edgem, 4                             ; have_top
+    test  byte edgem, 4                             ; have_top
     jnz .sum_load_top
     movu          m0, [sum_ptrq+(384+16)*2*1]
     mova          m1, m0
@@ -1882,7 +1884,7 @@ cglobal sgr_finish_filter2, 6, 7, 8, t, src, stride, a, b, x, y
 
 cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
     movifnidn     wd, wm
-    mov          wtq, wtmp
+    movd          m0, wtm
 %if ARCH_X86_64
     movifnidn     hd, hm
     mova         m10, [pd_1024]
@@ -1892,7 +1894,6 @@ cglobal sgr_weighted2, 4, 7, 12, dst, stride, t1, t2, w, h, wt
  %define m10    [PIC_sym(pd_1024)]
  %define m11    m7
 %endif
-    movd          m0, [wtq]
     pshufd        m0, m0, 0
     DEFINE_ARGS dst, stride, t1, t2, w, h, idx
 %if ARCH_X86_32
diff --git a/ffmpeg/JNI/dav1d/src/x86/mc.asm b/ffmpeg/JNI/dav1d/src/x86/mc.asm
index 773546957..5d769df8d 100644
--- a/ffmpeg/JNI/dav1d/src/x86/mc.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/mc.asm
@@ -133,18 +133,39 @@ subpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,
 subpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
 subpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 subpel_v_shuf4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+subpel_s_shuf2: db  0,  1,  2,  3,  0,  1,  2,  3,  8,  9, 10, 11,  8,  9, 10, 11
+subpel_s_shuf8: db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
 bilin_h_shuf4:  db  1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
 bilin_h_shuf8:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
 bilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
 deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
 blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
+wswap:          db  2,  3,  0,  1,  6,  7,  4,  5, 10, 11,  8,  9, 14, 15, 12, 13
+pb_8x0_8x8: times 8 db 0
+            times 8 db 8
+bdct_lb_dw: times 4 db 0
+            times 4 db 4
+            times 4 db 8
+            times 4 db 12
 
+ALIGN 32
+rescale_mul:    dd  0,  1,  2,  3, 4, 5, 6, 7
+resize_shuf:    times 5 db 0
+                db  1, 2, 3, 4, 5, 6
+                times 5+8 db 7
+
+ALIGN 8
 wm_420_perm64:  dq 0xfedcba9876543210
 wm_420_sign:    dd 0x01020102, 0x01010101
 wm_422_sign:    dd 0x80808080, 0x7f7f7f7f
 wm_sign_avx512: dd 0x40804080, 0xc0c0c0c0, 0x40404040
 
+ALIGN 4
+pb_0123: db 0, 1, 2, 3
+pb_4567: db 4, 5, 6, 7
 pw_m128  times 2 dw -128
+pw_m256: times 2 dw -256
+pw_32:   times 2 dw 32
 pw_34:   times 2 dw 34
 pw_258:  times 2 dw 258
 pw_512:  times 2 dw 512
@@ -152,10 +173,14 @@ pw_1024: times 2 dw 1024
 pw_2048: times 2 dw 2048
 pw_6903: times 2 dw 6903
 pw_8192: times 2 dw 8192
-pd_2:     dd 2
-pd_32:    dd 32
-pd_512:   dd 512
-pd_32768: dd 32768
+pd_2:            dd 2
+pd_32:           dd 32
+pd_63:           dd 63
+pd_512:          dd 512
+pd_32768:        dd 32768
+pd_0x3ff:        dd 0x3ff
+pd_0x4000:       dd 0x4000
+pq_0x40000000:   dq 0x40000000
 
 %define pb_m64 (wm_sign_avx512+4)
 %define pb_64  (wm_sign_avx512+8)
@@ -218,28 +243,55 @@ cextern mc_warp_filter
     %endrep
 %endmacro
 
+%macro SCALED_JMP_TABLE 1-* ; function name, then the list of block widths
+    %xdefine %1_table (%%table - %2)
+    %xdefine %%base mangle(private_prefix %+ _%1)
+%%table: ; the _table symbol is biased by the first width, matching the tzcnt(w)*2 index used at dispatch (2 for w=2, 4 for w=4)
+    %rep %0 - 1 ; one 16-bit entry per width
+        dw %%base %+ .w%2 - %%base ; offset of the .w label for this width from the function start
+        %rotate 1 ; advance to the next width
+    %endrep
+    %rotate 1 ; completes the cycle: name and first width are the first two arguments again
+%%dy_1024: ; second table, same layout, for the .dy1_w labels
+    %xdefine %1_dy1_table (%%dy_1024 - %2)
+    %rep %0 - 1
+        dw %%base %+ .dy1_w%2 - %%base
+        %rotate 1
+    %endrep
+    %rotate 1 ; restore the argument order again before the third table
+%%dy_2048: ; third table, same layout, for the .dy2_w labels
+    %xdefine %1_dy2_table (%%dy_2048 - %2)
+    %rep %0 - 1
+        dw %%base %+ .dy2_w%2 - %%base
+        %rotate 1
+    %endrep
+%endmacro
+
 %xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
 %xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
 %xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep)
 
 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
 
-BASE_JMP_TABLE put,  avx2,         2, 4, 8, 16, 32, 64, 128
-BASE_JMP_TABLE prep, avx2,            4, 8, 16, 32, 64, 128
-HV_JMP_TABLE put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128
-HV_JMP_TABLE put,  8tap,  avx2, 3, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, 8tap,  avx2, 1,    4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE avg_avx2,             4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_avg_avx2,           4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE mask_avx2,            4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_420_avx2,      4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_422_avx2,      4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_444_avx2,      4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE blend_avx2,           4, 8, 16, 32
-BIDIR_JMP_TABLE blend_v_avx2,      2, 4, 8, 16, 32
-BIDIR_JMP_TABLE blend_h_avx2,      2, 4, 8, 16, 32, 32, 32
+BASE_JMP_TABLE   put,  avx2,           2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE   prep, avx2,              4, 8, 16, 32, 64, 128
+HV_JMP_TABLE     put,  bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE     prep, bilin, avx2, 7,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE     put,  8tap,  avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE     prep, 8tap,  avx2, 1,    4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled_avx2,   4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  avg_avx2,                4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  w_avg_avx2,              4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  mask_avx2,               4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  w_mask_420_avx2,         4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  w_mask_422_avx2,         4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  w_mask_444_avx2,         4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE  blend_avx2,              4, 8, 16, 32
+BIDIR_JMP_TABLE  blend_v_avx2,         2, 4, 8, 16, 32
+BIDIR_JMP_TABLE  blend_h_avx2,         2, 4, 8, 16, 32, 32, 32
 
+%if HAVE_AVX512ICL
 BASE_JMP_TABLE prep, avx512icl,            4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep, 8tap,  avx512icl, 7,    4, 8, 16, 32, 64, 128
@@ -249,6 +301,7 @@ BIDIR_JMP_TABLE mask_avx512icl,            4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_mask_420_avx512icl,      4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_mask_422_avx512icl,      4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE w_mask_444_avx512icl,      4, 8, 16, 32, 64, 128
+%endif ; HAVE_AVX512ICL
 
 SECTION .text
 
@@ -1929,19 +1982,22 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15
 %assign FILTER_SHARP   (2*15 << 16) | 3*15
 
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2 ; entry point named fn_type
+    mov                 t0d, FILTER_%3 ; t0 = filter constant for type_h
+    mov                 t1d, FILTER_%4 ; t1 = filter constant for type_v
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter -- NOTE(review): the first argument is now fn (e.g. put_8tap), not type, so this test appears to always pass and the jump is never skipped; confirm whether the type argument was intended
+    jmp mangle(private_prefix %+ _%1 %+ SUFFIX) ; continue in the shared fn body
+%endif
+%endmacro
+
 %if WIN64
 DECLARE_REG_TMP 4, 5
 %else
 DECLARE_REG_TMP 7, 8
 %endif
-%macro PUT_8TAP_FN 3 ; type, type_h, type_v
-cglobal put_8tap_%1
-    mov                 t0d, FILTER_%2
-    mov                 t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
-    jmp mangle(private_prefix %+ _put_8tap %+ SUFFIX)
-%endif
-%endmacro
+
+%define PUT_8TAP_FN FN put_8tap,
 
 PUT_8TAP_FN regular,        REGULAR, REGULAR
 PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
@@ -3859,6 +3915,1853 @@ cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
     RET
 %endmacro
 
+%macro movifprep 2 ; dst, src: plain mov that is assembled only when isprep is nonzero
+ %if isprep
+    mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2 ; alias index, target index: the r/rq/rd names of the first index now expand to what the second index currently expands to
+ %xdefine r%1  r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 ; prep only: shift register names down by one (r14 names r13, ..., r1 names r0); r14 is saved in r14_save
+ %if isprep ; presumably lets code written with put register numbers be shared, since prep has one argument fewer before src -- confirm
+  %xdefine r14_save r14
+  %assign %%i 14
+  %rep 14 ; descending order, so each alias captures a lower register that has not been shifted yet
+   %assign %%j %%i-1
+   REMAP_REG %%i, %%j
+   %assign %%i %%i-1
+  %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 ; prep only: undo MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %if isprep
+  %assign %%i 1
+  %rep 13 ; ascending order, so each name is rebuilt from the still-shifted alias one index above it
+   %assign %%j %%i+1
+   REMAP_REG %%i, %%j
+   %assign %%i %%i+1
+  %endrep
+  %xdefine r14 r14_save
+  %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged (defaults to 1)
+    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT ; RET is expanded with the default register names in effect
+    RET
+ %if %1 ; nonzero: re-apply the shifted names so the code after this point assembles as before
+    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]: horizontally filters two source rows into dst and advances srcq by two rows
+    movq               xm%1, [srcq+ r4] ; first row: 8 source bytes at each of the eight offsets r4 r6 r7 r9 r10 r11 r13 rX
+    movq               xm%2, [srcq+ r6]
+    movhps             xm%1, [srcq+ r7]
+    movhps             xm%2, [srcq+ r9]
+    vinserti128         m%1, [srcq+r10], 1
+    vinserti128         m%2, [srcq+r11], 1
+    vpbroadcastq        m%5, [srcq+r13]
+    vpbroadcastq        m%6, [srcq+ rX]
+    add                srcq, ssq ; next source row
+    movq               xm%3, [srcq+ r4] ; second row, same eight offsets
+    movq               xm%4, [srcq+ r6]
+    movhps             xm%3, [srcq+ r7]
+    movhps             xm%4, [srcq+ r9]
+    vinserti128         m%3, [srcq+r10], 1
+    vinserti128         m%4, [srcq+r11], 1
+    vpbroadcastq        m%7, [srcq+r13]
+    vpbroadcastq        m%8, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd            m%1, m%5, 0xc0 ; mask 0xc0 takes dwords 6-7 from the broadcast register, filling the last qword
+    vpblendd            m%2, m%6, 0xc0
+    vpblendd            m%3, m%7, 0xc0
+    vpblendd            m%4, m%8, 0xc0
+    pmaddubsw           m%1, m15 ; m15 and m10 hold filter bytes that the caller loads from subpel_filters
+    pmaddubsw           m%2, m10
+    pmaddubsw           m%3, m15
+    pmaddubsw           m%4, m10
+    phaddw              m%1, m%2 ; horizontal adds reduce the pairwise products to one sum per pixel
+    phaddw              m%3, m%4
+    phaddw              m%1, m%3 ; both rows end up combined in dst
+    pmulhrsw            m%1, m12 ; the caller sets m12 to pw_8192, which makes this a rounded shift right by 2
+%endmacro
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+  %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+  %define tmp_stridem qword [rsp+104]
+ %endif
+ %xdefine base_reg r11
+ %define rndshift 6
+%endif
+    lea            base_reg, [%1_8tap_scaled_avx2]
+%define base base_reg-%1_8tap_scaled_avx2
+    tzcnt                wd, wm
+    vpbroadcastd         m8, dxm
+%if isprep && UNIX64
+    movd               xm14, mxd
+    vpbroadcastd        m14, xm14
+    mov                 r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%else
+    vpbroadcastd        m14, mxm
+%endif
+    mov                 dyd, dym
+%ifidn %1, put
+ %if WIN64
+    mov                 r8d, hm
+  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+  %define hm r5m
+  %define dxm r8m
+ %else
+  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+  %define hm r6m
+ %endif
+ %if required_stack_alignment > STACK_ALIGNMENT
+  %define dsm [rsp+96]
+  %define rX r1
+  %define rXd r1d
+ %else
+  %define dsm dsq
+  %define rX r14
+  %define rXd r14d
+ %endif
+%else ; prep
+ %if WIN64
+    mov                 r7d, hm
+  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+  %define hm r4m
+  %define dxm r7m
+ %else
+  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+  %define hm [rsp+96]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+    vpbroadcastd        m10, [base+pd_0x3ff]
+    vpbroadcastd        m12, [base+pw_8192]
+%ifidn %1, put
+    vpbroadcastd        m13, [base+pd_512]
+%else
+    vpbroadcastd        m13, [base+pd_32]
+%endif
+    pxor                 m9, m9
+    lea                ss3q, [ssq*3]
+    movzx               r7d, t1b
+    shr                 t1d, 16
+    cmp                  hd, 6
+    cmovs               t1d, r7d
+    sub                srcq, ss3q
+    cmp                 dyd, 1024
+    je .dy1
+    cmp                 dyd, 2048
+    je .dy2
+    movzx                wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+    add                  wq, base_reg
+    jmp                  wq
+%ifidn %1, put
+.w2:
+    mov                 myd, mym
+    movzx               t0d, t0b
+    dec                srcq
+    movd               xm15, t0d
+    punpckldq            m8, m9, m8
+    paddd               m14, m8 ; mx+dx*[0-1]
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd              xm15, xm8
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 1
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vbroadcasti128       m6, [base+subpel_s_shuf2]
+    vpbroadcastd        m15, [base+subpel_filters+r4*8+2]
+    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movq                xm0, [srcq+ssq*0]
+    movq                xm1, [srcq+ssq*2]
+    movhps              xm0, [srcq+ssq*1]
+    movhps              xm1, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pshufb              m14, m5
+    paddb               m14, m6
+    vinserti128          m0, [srcq+ssq*0], 1
+    vinserti128          m1, [srcq+ssq*2], 1
+    vpbroadcastq         m2, [srcq+ssq*1]
+    vpbroadcastq         m3, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    vpblendd            m15, m7, 0xaa
+    vpblendd             m0, m2, 0xc0       ; 0 1  4 5
+    vpblendd             m1, m3, 0xc0       ; 2 3  6 7
+    pblendvb            m15, m11, m8
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pmaddubsw            m0, m15
+    pmaddubsw            m1, m15
+    phaddw               m0, m1
+    pmulhrsw             m0, m12            ; 0 1 2 3  4 5 6 7
+    vextracti128        xm1, m0, 1          ; 4 5 6 7
+    palignr             xm2, xm1, xm0, 4    ; 1 2 3 4
+    punpcklwd           xm3, xm0, xm2       ; 01 12
+    punpckhwd           xm0, xm2            ; 23 34
+    pshufd              xm4, xm1, q0321     ; 5 6 7 _
+    punpcklwd           xm2, xm1, xm4       ; 45 56
+    punpckhwd           xm4, xm1, xm4       ; 67 __
+.w2_loop:
+    and                 myd, 0x3ff
+    mov                 r6d, 64 << 24
+    mov                 r4d, myd
+    shr                 r4d, 6
+    lea                 r4d, [t1+r4]
+    cmovnz              r6q, [base+subpel_filters+r4*8]
+    movq               xm11, r6q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    pshufd              xm8, xm11, q0000
+    pshufd              xm9, xm11, q1111
+    pshufd             xm10, xm11, q2222
+    pshufd             xm11, xm11, q3333
+    pmaddwd             xm5, xm3, xm8
+    pmaddwd             xm6, xm0, xm9
+    pmaddwd             xm7, xm2, xm10
+    pmaddwd             xm8, xm4, xm11
+    paddd               xm5, xm6
+    paddd               xm7, xm8
+    paddd               xm5, xm13
+    paddd               xm5, xm7
+    psrad               xm5, 10
+    packssdw            xm5, xm5
+    packuswb            xm5, xm5
+    pextrw           [dstq], xm5, 0
+    add                dstq, dsq
+    dec                  hd
+    jz .ret
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .w2_loop
+    movq                xm5, [srcq]
+    test                myd, 0x400
+    jz .w2_skip_line
+    add                srcq, ssq
+    shufps              xm3, xm0, q1032     ; 01 12
+    shufps              xm0, xm2, q1032     ; 23 34
+    shufps              xm2, xm4, q1032     ; 45 56
+    pshufb              xm5, xm14
+    pmaddubsw           xm5, xm15
+    phaddw              xm5, xm5
+    pmulhrsw            xm5, xm12
+    palignr             xm1, xm5, xm1, 12
+    punpcklqdq          xm1, xm1            ; 6 7 6 7
+    punpcklwd           xm4, xm1, xm5       ; 67 __
+    jmp .w2_loop
+.w2_skip_line:
+    movhps              xm5, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    mova                xm3, xm0            ; 01 12
+    mova                xm0, xm2            ; 23 34
+    pshufb              xm5, xm14
+    pmaddubsw           xm5, xm15
+    phaddw              xm5, xm5
+    pmulhrsw            xm5, xm12           ; 6 7 6 7
+    palignr             xm1, xm5, xm1, 8    ; 4 5 6 7
+    pshufd              xm5, xm1, q0321     ; 5 6 7 _
+    punpcklwd           xm2, xm1, xm5       ; 45 56
+    punpckhwd           xm4, xm1, xm5       ; 67 __
+    jmp .w2_loop
+%endif
+.w4:
+    mov                 myd, mym
+    vbroadcasti128       m7, [base+rescale_mul]
+    movzx               t0d, t0b
+    dec                srcq
+    movd               xm15, t0d
+    pmaddwd              m8, m7
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    paddd               m14, m8 ; mx+dx*[0-3]
+    pand                 m0, m14, m10
+    psrld                m0, 6
+    paddd              xm15, xm0
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 1
+    pextrd             r11d, xm15, 2
+    pextrd             r13d, xm15, 3
+    movd               xm15, [base+subpel_filters+r4*8+2]
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vpbroadcastq         m6, [base+subpel_s_shuf2]
+    pinsrd             xm15, [base+subpel_filters+r6*8+2], 1
+    pcmpeqd              m0, m9
+    psrld               m14, 10
+    movu                xm7, [srcq+ssq*0]
+    movu                xm9, [srcq+ssq*1]
+    pinsrd             xm15, [base+subpel_filters+r11*8+2], 2
+    movu                xm8, [srcq+ssq*2]
+    movu               xm10, [srcq+ss3q ]
+    pinsrd             xm15, [base+subpel_filters+r13*8+2], 3
+    lea                srcq, [srcq+ssq*4]
+    pshufb              m14, m5
+    paddb               m14, m6
+    vinserti128          m7, [srcq+ssq*0], 1
+    vinserti128          m9, [srcq+ssq*1], 1
+    vinserti128         m15, xm15, 1
+    vinserti128          m8, [srcq+ssq*2], 1
+    vinserti128         m10, [srcq+ss3q ], 1
+    lea                srcq, [srcq+ssq*4]
+    pblendvb            m15, m11, m0
+    pshufb               m7, m14
+    pshufb               m9, m14
+    pshufb               m8, m14
+    pshufb              m10, m14
+    pmaddubsw            m7, m15
+    pmaddubsw            m9, m15
+    pmaddubsw            m8, m15
+    pmaddubsw           m10, m15
+    phaddw               m7, m9
+    phaddw               m8, m10
+    pmulhrsw             m7, m12                ; 0 1  4 5
+    pmulhrsw             m8, m12                ; 2 3  6 7
+    vextracti128        xm9, m7, 1              ; 4 5
+    vextracti128        xm3, m8, 1              ; 6 7
+    shufps              xm4, xm7, xm8, q1032    ; 1 2
+    shufps              xm5, xm8, xm9, q1032    ; 3 4
+    shufps              xm6, xm9, xm3, q1032    ; 5 6
+    psrldq             xm11, xm3, 8             ; 7 _
+    punpcklwd           xm0, xm7, xm4   ; 01
+    punpckhwd           xm7, xm4        ; 12
+    punpcklwd           xm1, xm8, xm5   ; 23
+    punpckhwd           xm8, xm5        ; 34
+    punpcklwd           xm2, xm9, xm6   ; 45
+    punpckhwd           xm9, xm6        ; 56
+    punpcklwd           xm3, xm11       ; 67
+    mova         [rsp+0x00], xm7
+    mova         [rsp+0x10], xm8
+    mova         [rsp+0x20], xm9
+.w4_loop:
+    and                 myd, 0x3ff
+    mov                 r6d, 64 << 24
+    mov                 r4d, myd
+    shr                 r4d, 6
+    lea                 r4d, [t1+r4]
+    cmovnz              r6q, [base+subpel_filters+r4*8]
+    movq               xm10, r6q
+    punpcklbw          xm10, xm10
+    psraw              xm10, 8
+    pshufd              xm7, xm10, q0000
+    pshufd              xm8, xm10, q1111
+    pshufd              xm9, xm10, q2222
+    pshufd             xm10, xm10, q3333
+    pmaddwd             xm4, xm0, xm7
+    pmaddwd             xm5, xm1, xm8
+    pmaddwd             xm6, xm2, xm9
+    pmaddwd             xm7, xm3, xm10
+    paddd               xm4, xm5
+    paddd               xm6, xm7
+    paddd               xm4, xm13
+    paddd               xm4, xm6
+    psrad               xm4, rndshift
+    packssdw            xm4, xm4
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movd             [dstq], xm4
+    add                dstq, dsq
+%else
+    movq             [tmpq], xm4
+    add                tmpq, 8
+%endif
+    dec                  hd
+    jz .ret
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .w4_loop
+    movu                xm4, [srcq]
+    test                myd, 0x400
+    jz .w4_skip_line
+    mova                xm0, [rsp+0x00]
+    mova         [rsp+0x00], xm1
+    mova                xm1, [rsp+0x10]
+    mova         [rsp+0x10], xm2
+    mova                xm2, [rsp+0x20]
+    mova         [rsp+0x20], xm3
+    pshufb              xm4, xm14
+    pmaddubsw           xm4, xm15
+    phaddw              xm4, xm4
+    pmulhrsw            xm4, xm12
+    punpcklwd           xm3, xm11, xm4
+    mova               xm11, xm4
+    add                srcq, ssq
+    jmp .w4_loop
+.w4_skip_line:
+    movu                xm5, [srcq+ssq*1]
+    movu                 m6, [rsp+0x10]
+    pshufb              xm4, xm14
+    pshufb              xm5, xm14
+    pmaddubsw           xm4, xm15
+    pmaddubsw           xm5, xm15
+    movu         [rsp+0x00], m6
+    phaddw              xm4, xm5
+    pmulhrsw            xm4, xm12
+    punpcklwd           xm9, xm11, xm4
+    mova         [rsp+0x20], xm9
+    psrldq             xm11, xm4, 8
+    mova                xm0, xm1
+    mova                xm1, xm2
+    mova                xm2, xm3
+    punpcklwd           xm3, xm4, xm11
+    lea                srcq, [srcq+ssq*2]
+    jmp .w4_loop
+.w8:
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    movd               xm15, t0d
+    pmaddwd              m8, [base+rescale_mul]
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd               m15, m6
+    pcmpeqd              m6, m9
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movq               xm15, [base+subpel_filters+r4*8]
+    movq               xm10, [base+subpel_filters+r6*8]
+    movhps             xm15, [base+subpel_filters+r7*8]
+    movhps             xm10, [base+subpel_filters+r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+rX*8]
+    psrld               m14, 10
+    mova              [rsp], xm14
+    vextracti128        xm7, m14, 1
+    movd                r4d, xm14
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    pshufd               m5, m6, q1100
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    mov                 dyd, dym
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    vbroadcasti128      m14, [base+wswap]
+.w8_loop:
+    and                 myd, 0x3ff
+    mov                 r6d, 64 << 24
+    mov                 r4d, myd
+    shr                 r4d, 6
+    lea                 r4d, [t1+r4]
+    cmovnz              r6q, [base+subpel_filters+r4*8]
+    movq               xm11, r6q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    vinserti128         m11, xm11, 1
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pshufd               m8, m11, q2222
+    pshufd              m11, m11, q3333
+    pmaddwd              m6, m2, m8
+    pmaddwd              m7, m3, m11
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
+    dec                  hd
+    jz .ret
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .w8_loop
+    test                myd, 0x400
+    mov            [rsp+16], myd
+    mov                 r4d, [rsp+ 0]
+    mov                 r6d, [rsp+ 8]
+    mov                 r7d, [rsp+ 4]
+    mov                 r9d, [rsp+12]
+    jz .w8_skip_line
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    add                srcq, ssq
+    mov                 myd, [rsp+16]
+    mov                 dyd, dym
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pshufb               m2, m14
+    pshufb               m3, m14
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m10
+    phaddw               m4, m5
+    pslld                m5, m4, 16
+    paddw                m4, m5
+    pmulhrsw             m4, m12
+    pblendw              m0, m1, 0xaa
+    pblendw              m1, m2, 0xaa
+    pblendw              m2, m3, 0xaa
+    pblendw              m3, m4, 0xaa
+    jmp .w8_loop
+.w8_skip_line:
+    mova                 m0, m1
+    mova                 m1, m2
+    mova                 m2, m3
+    vpbroadcastq         m7, [srcq+r13]
+    vpbroadcastq         m8, [srcq+ rX]
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
+    vinserti128          m3, [srcq+r10], 1
+    vinserti128          m4, [srcq+r11], 1
+    add                srcq, ssq
+    movq                xm5, [srcq+ r4]
+    movq                xm6, [srcq+ r6]
+    movhps              xm5, [srcq+ r7]
+    movhps              xm6, [srcq+ r9]
+    vinserti128          m5, [srcq+r10], 1
+    vinserti128          m6, [srcq+r11], 1
+    vpbroadcastq         m9, [srcq+r13]
+    vpbroadcastq        m11, [srcq+ rX]
+    add                srcq, ssq
+    mov                 myd, [rsp+16]
+    mov                 dyd, dym
+    vpblendd             m3, m7, 0xc0
+    vpblendd             m4, m8, 0xc0
+    vpblendd             m5, m9, 0xc0
+    vpblendd             m6, m11, 0xc0
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m10
+    pmaddubsw            m5, m15
+    pmaddubsw            m6, m10
+    phaddw               m3, m4
+    phaddw               m5, m6
+    psrld                m4, m3, 16
+    pslld                m6, m5, 16
+    paddw                m3, m4
+    paddw                m5, m6
+    pblendw              m3, m5, 0xaa
+    pmulhrsw             m3, m12
+    jmp .w8_loop
+.w16: ; entry points for w >= 16: set up the pass counter at [rsp+48], then share .w_start
+    mov      dword [rsp+48], 2 ; counter decremented in .hloop_prep; each pass advances the output by 8 columns
+    movifprep   tmp_stridem, 32 ; value later added to tmpq per row in .vloop (prep path)
+    jmp .w_start
+.w32:
+    mov      dword [rsp+48], 4
+    movifprep   tmp_stridem, 64
+    jmp .w_start
+.w64:
+    mov      dword [rsp+48], 8
+    movifprep   tmp_stridem, 128
+    jmp .w_start
+.w128:
+    mov      dword [rsp+48], 16
+    movifprep   tmp_stridem, 256
+.w_start: ; common setup: spill the state that .hloop_prep restores for every later pass
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3 ; back up 3 bytes; presumably the left-hand taps of the 8-tap filter
+    pmaddwd              m8, [base+rescale_mul]
+    movd               xm15, t0d
+    mov            [rsp+72], t0d ; re-broadcast into m15 by .hloop_prep
+    mov            [rsp+56], srcq ; source pointer to restart from on each pass
+    mov            [rsp+64], r0q ; dstq / tmpq
+%if UNIX64
+    mov                  hm, hd
+%endif
+    shl           dword dxm, 3 ; dx*8
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    jmp .hloop
+.hloop_prep: ; one 8-column pass finished: either return or restore state for the next pass
+    dec      dword [rsp+48] ; remaining passes (set at .w16/.w32/.w64/.w128)
+    jz .ret
+    add      qword [rsp+64], 8*(isprep+1) ; step the saved output pointer; isprep is defined outside this chunk
+    mov                  hd, hm
+    vpbroadcastd         m8, dxm ; dxm was multiplied by 8 in .w_start
+    vpbroadcastd        m10, [base+pd_0x3ff]
+    paddd               m14, m8, [rsp+16] ; previous pass's m14 (saved in .hloop) plus dx*8
+    vpbroadcastd        m15, [rsp+72] ; t0d value saved in .w_start
+    pxor                 m9, m9
+    mov                srcq, [rsp+56] ; restart from the saved source pointer
+    mov                 r0q, [rsp+64] ; dstq / tmpq
+.hloop: ; per-pass setup: pick 8 per-column filters and source offsets, then filter the initial rows
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    pand                 m6, m14, m10 ; m10 is pd_0x3ff when arriving from .hloop_prep
+    psrld                m6, 6
+    paddd               m15, m6 ; per-column index into subpel_filters
+    pcmpeqd              m6, m9 ; mask of lanes whose shifted value equals m9 (zero when coming from .hloop_prep)
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15 ; move the eight filter indices to GPRs
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movu           [rsp+16], m14 ; saved so .hloop_prep can add dx*8 for the next pass
+    movq               xm15, [base+subpel_filters+ r4*8] ; load one 8-byte filter per column
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    vextracti128        xm7, m14, 1
+    mova              [rsp], xm14 ; lanes 0-3 are reloaded from here in .vloop
+    movd                r4d, xm14 ; the same GPRs now hold per-column byte offsets used with srcq
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    pshufd               m5, m6, q1100 ; widen each dword mask to cover a whole 8-byte filter
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5 ; masked columns get the pq_0x40000000 constant instead of a table filter
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    mov                 dyd, dym
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    vbroadcasti128      m14, [base+wswap] ; shuffle applied to m0-m3 when .vloop steps one row
+.vloop: ; emit one output row of 8 columns, then step myd by dyd and fetch new source rows as needed
+    and                 myd, 0x3ff ; keep only the low 10 bits
+    mov                 r6d, 64 << 24 ; default coefficients, used when the cmovnz below is not taken
+    mov                 r4d, myd
+    shr                 r4d, 6 ; sets ZF for the cmovnz below
+    lea                 r4d, [t1+r4] ; lea leaves the flags from shr untouched
+    cmovnz              r6q, [base+subpel_filters+r4*8] ; load a table filter only if (myd >> 6) != 0
+    movq               xm11, r6q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8 ; sign-extend the 8 byte coefficients to words
+    vinserti128         m11, xm11, 1
+    pshufd               m8, m11, q0000 ; broadcast one coefficient pair per pshufd
+    pshufd               m9, m11, q1111
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pshufd               m8, m11, q2222
+    pshufd              m11, m11, q3333
+    pmaddwd              m6, m2, m8
+    pmaddwd              m7, m3, m11
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, tmp_stridem
+%endif
+    dec                  hd
+    jz .hloop_prep ; all rows of this pass are done
+    add                 myd, dyd
+    test                myd, ~0x3ff
+    jz .vloop ; nothing above the low 10 bits: reuse the rows already in m0-m3
+    test                myd, 0x400 ; ZF from this test is consumed by the jz .skip_line below
+    mov            [rsp+52], myd
+    mov                 r4d, [rsp+ 0] ; reload offsets saved by .hloop (r4d/r6d were overwritten above)
+    mov                 r6d, [rsp+ 8]
+    mov                 r7d, [rsp+ 4]
+    mov                 r9d, [rsp+12]
+    jz .skip_line ; bit 0x400 clear: take the path that loads two rows
+    vpbroadcastq         m6, [srcq+r13] ; bit 0x400 set: gather one new source row
+    vpbroadcastq         m7, [srcq+ rX]
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    add                srcq, ssq
+    mov                 myd, [rsp+52]
+    mov                 dyd, dym
+    pshufb               m0, m14 ; m14 holds the wswap shuffle loaded in .hloop
+    pshufb               m1, m14
+    pshufb               m2, m14
+    pshufb               m3, m14
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15 ; horizontal filter with the per-column coefficients from .hloop
+    pmaddubsw            m5, m10
+    phaddw               m4, m5
+    pslld                m5, m4, 16
+    paddw                m4, m5 ; the odd word of each pair now holds the full sum
+    pmulhrsw             m4, m12
+    pblendw              m0, m1, 0xaa ; take odd words from the next register: the window moves down one row
+    pblendw              m1, m2, 0xaa
+    pblendw              m2, m3, 0xaa
+    pblendw              m3, m4, 0xaa ; the new row enters in the odd words of m3
+    jmp .vloop
+.skip_line: ; two-row step for the w >= 16 path: slide the window by one register and filter two new rows
+    mova                 m0, m1 ; drop the oldest pair; m3 is rebuilt from the two rows loaded below
+    mova                 m1, m2
+    mova                 m2, m3
+    vpbroadcastq         m7, [srcq+r13] ; gather 8 bytes per column offset for the first new row
+    vpbroadcastq         m8, [srcq+ rX]
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
+    vinserti128          m3, [srcq+r10], 1
+    vinserti128          m4, [srcq+r11], 1
+    add                srcq, ssq ; advance to the second new row
+    movq                xm5, [srcq+ r4]
+    movq                xm6, [srcq+ r6]
+    movhps              xm5, [srcq+ r7]
+    movhps              xm6, [srcq+ r9]
+    vinserti128          m5, [srcq+r10], 1
+    vinserti128          m6, [srcq+r11], 1
+    vpbroadcastq         m9, [srcq+r13]
+    vpbroadcastq        m11, [srcq+ rX]
+    add                srcq, ssq
+    mov                 myd, [rsp+52] ; myd was stored here by .vloop before branching
+    mov                 dyd, dym
+    vpblendd             m3, m7, 0xc0 ; top qword of each register comes from the broadcast load
+    vpblendd             m4, m8, 0xc0
+    vpblendd             m5, m9, 0xc0
+    vpblendd             m6, m11, 0xc0
+    pmaddubsw            m3, m15 ; horizontal filter with the per-column coefficients from .hloop
+    pmaddubsw            m4, m10
+    pmaddubsw            m5, m15
+    pmaddubsw            m6, m10
+    phaddw               m3, m4
+    phaddw               m5, m6
+    psrld                m4, m3, 16 ; first row: fold each word pair into the even word
+    pslld                m6, m5, 16 ; second row: fold each word pair into the odd word
+    paddw                m3, m4
+    paddw                m5, m6
+    pblendw              m3, m5, 0xaa ; even words from the first row, odd words from the second
+    pmulhrsw             m3, m12
+    jmp .vloop
+.dy1: ; dispatch to the .dy1_w* handler through a table of 16-bit offsets
+    movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2] ; wq indexes the table on entry
+    add                  wq, base_reg ; table entries are relative to base_reg
+    jmp                  wq
+%ifidn %1, put
+.dy1_w2: ; put only: 2-column case where each output row consumes exactly one new source row
+    mov                 myd, mym
+    movzx               t0d, t0b ; keep only the low byte of t0
+    dec                srcq
+    movd               xm15, t0d
+    punpckldq            m8, m9, m8
+    paddd               m14, m8 ; mx+dx*[0-1]
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd              xm15, xm8 ; per-column index into subpel_filters
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 1
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vbroadcasti128       m6, [base+subpel_s_shuf2]
+    vpbroadcastd        m15, [base+subpel_filters+r4*8+2] ; bytes 2-5 of the 8-byte table entry
+    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movq                xm0, [srcq+ssq*0] ; xm0 = rows 0,1 and xm1 = rows 2,3
+    movq                xm1, [srcq+ssq*2]
+    movhps              xm0, [srcq+ssq*1]
+    movhps              xm1, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    shr                 myd, 6 ; sets ZF for the cmovnz below
+    mov                 r4d, 64 << 24 ; default coefficients, used when the cmovnz is not taken
+    lea                 myd, [t1+myq] ; mov and lea leave the flags from shr untouched
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    pshufb              m14, m5
+    paddb               m14, m6 ; m14 becomes the pshufb control applied to the source rows
+    vinserti128          m0, [srcq+ssq*0], 1 ; rows 4 and 6 go to the high lanes, row 5 to m2
+    vinserti128          m1, [srcq+ssq*2], 1
+    vpbroadcastq         m2, [srcq+ssq*1]
+    add                srcq, ss3q ; seven rows consumed before the loop
+    movq               xm10, r4q
+    punpcklbw          xm10, xm10
+    psraw              xm10, 8 ; sign-extend the 8 byte coefficients to words
+    vpblendd            m15, m7, 0xaa ; interleave the two columns' horizontal coefficients
+    pblendvb            m15, m11, m8 ; masked columns get the pd_0x4000 constant instead
+    pshufd              xm8, xm10, q0000 ; xm8-xm11: one vertical coefficient pair each
+    pshufd              xm9, xm10, q1111
+    pshufd             xm11, xm10, q3333
+    pshufd             xm10, xm10, q2222
+    vpblendd             m0, m2, 0xc0
+    pshufb               m1, m14
+    pshufb               m0, m14
+    pmaddubsw            m1, m15
+    pmaddubsw            m0, m15
+    phaddw               m0, m1
+    pmulhrsw             m0, m12
+    vextracti128        xm1, m0, 1
+    palignr             xm2, xm1, xm0, 4
+    pshufd              xm4, xm1, q2121
+    punpcklwd           xm3, xm0, xm2       ; 01 12
+    punpckhwd           xm0, xm2            ; 23 34
+    punpcklwd           xm2, xm1, xm4       ; 45 56
+.dy1_w2_loop: ; two output rows per iteration
+    movq                xm1, [srcq+ssq*0]
+    movhps              xm1, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2]
+    pmaddwd             xm5, xm3, xm8
+    pmaddwd             xm6, xm0, xm9
+    pmaddwd             xm7, xm2, xm10
+    mova                xm3, xm0 ; slide the row-pair window for the next iteration
+    mova                xm0, xm2
+    paddd               xm5, xm13
+    paddd               xm6, xm7
+    pshufb              xm1, xm14
+    pmaddubsw           xm1, xm15
+    phaddw              xm1, xm1
+    pmulhrsw            xm1, xm12
+    palignr             xm7, xm1, xm4, 12
+    punpcklwd           xm2, xm7, xm1     ; 67 78
+    pmaddwd             xm7, xm2, xm11
+    mova                xm4, xm1
+    paddd               xm5, xm6
+    paddd               xm5, xm7
+    psrad               xm5, rndshift
+    packssdw            xm5, xm5
+    packuswb            xm5, xm5
+    pextrw     [dstq+dsq*0], xm5, 0 ; store 2 bytes per output row
+    pextrw     [dstq+dsq*1], xm5, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .dy1_w2_loop
+    RET
+%endif
+.dy1_w4: ; 4-column case where each output row consumes exactly one new source row
+    mov                 myd, mym
+    vbroadcasti128       m7, [base+rescale_mul]
+    movzx               t0d, t0b ; keep only the low byte of t0
+    dec                srcq
+    movd               xm15, t0d
+    pmaddwd              m8, m7
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    paddd               m14, m8 ; mx+dx*[0-3]
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd              xm15, xm8 ; per-column index into subpel_filters
+    vpermq               m8, m8, q3120
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd             r11d, xm15, 1
+    pextrd             r13d, xm15, 3
+    movd               xm15, [base+subpel_filters+r4*8+2] ; bytes 2-5 of the 8-byte table entry
+    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
+    movu                xm2, [srcq+ssq*0]
+    movu                xm3, [srcq+ssq*2]
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vpbroadcastq         m6, [base+subpel_s_shuf2]
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    pinsrd             xm15, [base+subpel_filters+r11*8+2], 1
+    vpblendd             m7, [base+subpel_filters+r13*8+2-20], 0x20 ; mask 0x20 picks dword 5 (byte 20), so -20 lands it on the entry
+    vinserti128          m2, [srcq+ssq*1], 1 ; m2 = rows 0,1 and m3 = rows 2,3
+    vinserti128          m3, [srcq+ss3q ], 1
+    lea                srcq, [srcq+ssq*4]
+    shr                 myd, 6 ; sets ZF for the cmovnz below
+    mov                 r4d, 64 << 24 ; default coefficients, used when the cmovnz is not taken
+    lea                 myd, [t1+myq] ; mov and lea leave the flags from shr untouched
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    pshufb              m14, m5
+    paddb               m14, m6 ; m14 becomes the pshufb control applied to the source rows
+    movu                xm4, [srcq+ssq*0] ; m4 = rows 4,5 and xm5 = row 6
+    movu                xm5, [srcq+ssq*2]
+    vinserti128          m4, [srcq+ssq*1], 1
+    add                srcq, ss3q ; seven rows consumed before the loop
+    vpblendd            m15, m7, 0x30
+    punpcklqdq          m15, m15
+    pblendvb            m15, m11, m8 ; masked columns get the pd_0x4000 constant instead
+    movq               xm10, r4q
+    punpcklbw          xm10, xm10
+    psraw              xm10, 8 ; sign-extend the 8 byte coefficients to words
+    vinserti128         m10, xm10, 1
+    pshufb               m2, m14
+    pshufb               m3, m14
+    pshufb               m4, m14
+    pshufb              xm5, xm14
+    vpermq               m2, m2, q3120
+    vpermq               m3, m3, q3120
+    vpermq               m4, m4, q3120
+    vpermq               m5, m5, q3120
+    pshufd               m7, m10, q0000 ; m7-m10: one vertical coefficient pair each
+    pshufd               m8, m10, q1111
+    pshufd               m9, m10, q2222
+    pshufd              m10, m10, q3333
+    pmaddubsw            m2, m15
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m15
+    phaddw               m2, m3
+    phaddw               m4, m5
+    pmulhrsw             m2, m12
+    pmulhrsw             m4, m12
+    palignr              m5, m4, m2, 4
+    pshufd               m3, m4, q2121
+    punpcklwd            m0, m2, m5     ; 01 12
+    punpckhwd            m1, m2, m5     ; 23 34
+    punpcklwd            m2, m4, m3     ; 45 56
+.dy1_w4_loop: ; two output rows per iteration
+    movu               xm11, [srcq+ssq*0]
+    vinserti128         m11, [srcq+ssq*1], 1
+    lea                srcq, [srcq+ssq*2]
+    pmaddwd              m4, m0, m7
+    pmaddwd              m5, m1, m8
+    pmaddwd              m6, m2, m9
+    mova                 m0, m1 ; slide the row-pair window for the next iteration
+    mova                 m1, m2
+    paddd                m4, m13
+    paddd                m5, m6
+    pshufb              m11, m14
+    vpermq              m11, m11, q3120
+    pmaddubsw           m11, m15
+    phaddw              m11, m11
+    pmulhrsw            m11, m12
+    palignr              m6, m11, m3, 12
+    punpcklwd            m2, m6, m11    ; 67 78
+    mova                 m3, m11
+    pmaddwd              m6, m2, m10
+    paddd                m4, m5
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    pshuflw             xm4, xm4, q3120
+    movd       [dstq+dsq*0], xm4
+    pextrd     [dstq+dsq*1], xm4, 1
+    lea                dstq, [dstq+dsq*2]
+%else
+    pshufd              xm4, xm4, q3120
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
+    sub                  hd, 2
+    jg .dy1_w4_loop
+    MC_8TAP_SCALED_RET
+.dy1_w8: ; 8-column case where each output row consumes exactly one new source row
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3 ; back up 3 bytes; presumably the left-hand taps of the 8-tap filter
+    movd               xm15, t0d
+    pmaddwd              m8, [base+rescale_mul]
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd               m15, m6 ; per-column index into subpel_filters
+    pcmpeqd              m6, m9
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15 ; move the eight filter indices to GPRs
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movq               xm15, [base+subpel_filters+ r4*8] ; load one 8-byte filter per column
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    vextracti128        xm7, m14, 1
+    movd                r4d, xm14 ; the same GPRs now hold per-column byte offsets used with srcq
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    mov            [rsp+32], r7d ; spilled here, reloaded just before .dy1_w8_loop
+    pshufd               m5, m6, q1100 ; widen each dword mask to cover a whole 8-byte filter
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5 ; masked columns get the pq_0x40000000 constant instead of a table filter
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    movu              [rsp], m10 ; m10 is reused for vertical coefficients; the loop reads this copy from memory
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    shr                 myd, 6 ; sets ZF for the cmovnz below
+    lea                 myd, [t1+myq] ; lea and mov leave the flags from shr untouched
+    mov                 t1d, 64 << 24 ; default coefficients, used when the cmovnz is not taken
+    cmovnz              t1q, [base+subpel_filters+myq*8]
+    vbroadcasti128      m14, [base+wswap]
+    movq               xm11, t1q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8 ; sign-extend the 8 byte coefficients to words
+    vinserti128         m11, xm11, 1
+    mov                 r7d, [rsp+32]
+    pshufd               m8, m11, q0000 ; m8-m11: one vertical coefficient pair each, constant for the whole loop
+    pshufd               m9, m11, q1111
+    pshufd              m10, m11, q2222
+    pshufd              m11, m11, q3333
+.dy1_w8_loop: ; one output row per iteration
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pmaddwd              m6, m2, m10
+    pmaddwd              m7, m3, m11
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
+    dec                  hd
+    jz .ret
+    movq                xm4, [srcq+ r4] ; gather the next source row, 8 bytes per column offset
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    add                srcq, ssq
+    pshufb               m0, m14 ; m14 holds the wswap shuffle
+    pshufb               m1, m14
+    pshufb               m2, m14
+    pshufb               m3, m14
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, [rsp] ; horizontal coefficients saved from m10 before the loop
+    phaddw               m4, m5
+    pslld                m5, m4, 16
+    paddw                m4, m5 ; the odd word of each pair now holds the full sum
+    pmulhrsw             m4, m12
+    pblendw              m0, m1, 0xaa ; take odd words from the next register: the window moves down one row
+    pblendw              m1, m2, 0xaa
+    pblendw              m2, m3, 0xaa
+    pblendw              m3, m4, 0xaa ; the new row enters in the odd words of m3
+    jmp .dy1_w8_loop
+.dy1_w16: ; dy1 entry points for w >= 16: set up the pass counter at [rsp+72], then share .dy1_w_start
+    mov      dword [rsp+72], 2 ; counter decremented in .dy1_hloop_prep; each pass advances the output by 8 columns
+    movifprep   tmp_stridem, 32 ; value later added to tmpq per row in .dy1_vloop (prep path)
+    jmp .dy1_w_start
+.dy1_w32:
+    mov      dword [rsp+72], 4
+    movifprep   tmp_stridem, 64
+    jmp .dy1_w_start
+.dy1_w64:
+    mov      dword [rsp+72], 8
+    movifprep   tmp_stridem, 128
+    jmp .dy1_w_start
+.dy1_w128:
+    mov      dword [rsp+72], 16
+    movifprep   tmp_stridem, 256
+.dy1_w_start: ; common setup: spill the state that .dy1_hloop_prep restores for every later pass
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3 ; back up 3 bytes; presumably the left-hand taps of the 8-tap filter
+    pmaddwd              m8, [base+rescale_mul]
+    movd               xm15, t0d
+    mov            [rsp+76], t0d ; re-broadcast into m15 by .dy1_hloop_prep
+    mov            [rsp+80], srcq ; source pointer to restart from on each pass
+    mov            [rsp+88], r0q ; dstq / tmpq
+%if UNIX64
+    mov                  hm, hd
+%endif
+    shl           dword dxm, 3 ; dx*8
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    jmp .dy1_hloop
+.dy1_hloop_prep: ; one 8-column pass finished: either return or restore state for the next pass
+    dec      dword [rsp+72] ; remaining passes (set at .dy1_w16/.dy1_w32/.dy1_w64/.dy1_w128)
+    jz .ret
+    add      qword [rsp+88], 8*(isprep+1) ; step the saved output pointer; isprep is defined outside this chunk
+    mov                  hd, hm
+    vpbroadcastd         m8, dxm ; dxm was multiplied by 8 in .dy1_w_start
+    vpbroadcastd        m10, [base+pd_0x3ff]
+    paddd               m14, m8, [rsp+32] ; previous pass's m14 (saved in .dy1_hloop) plus dx*8
+    vpbroadcastd        m15, [rsp+76] ; t0d value saved in .dy1_w_start
+    pxor                 m9, m9
+    mov                srcq, [rsp+80] ; restart from the saved source pointer
+    mov                 r0q, [rsp+88] ; dstq / tmpq
+.dy1_hloop: ; per-pass setup: per-column filters and offsets, initial rows, and the fixed vertical coefficients
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    pand                 m6, m14, m10 ; m10 is pd_0x3ff when arriving from .dy1_hloop_prep
+    psrld                m6, 6
+    paddd               m15, m6 ; per-column index into subpel_filters
+    pcmpeqd              m6, m9 ; mask of lanes whose shifted value equals m9 (zero when coming from .dy1_hloop_prep)
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15 ; move the eight filter indices to GPRs
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movu           [rsp+32], m14 ; saved so .dy1_hloop_prep can add dx*8 for the next pass
+    movq               xm15, [base+subpel_filters+ r4*8] ; load one 8-byte filter per column
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    vextracti128        xm7, m14, 1
+    movq           [rsp+64], xm14 ; lanes 0-1 are reloaded into r4d/r7d before .dy1_vloop
+    movd                r4d, xm14 ; the same GPRs now hold per-column byte offsets used with srcq
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    pshufd               m5, m6, q1100 ; widen each dword mask to cover a whole 8-byte filter
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5 ; masked columns get the pq_0x40000000 constant instead of a table filter
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    movu              [rsp], m10 ; m10 is reused for vertical coefficients; .dy1_vloop reads this copy from memory
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    shr                 myd, 6 ; sets ZF for the cmovnz below
+    mov                 r4d, 64 << 24 ; default coefficients, used when the cmovnz is not taken
+    lea                 myd, [t1+myq] ; mov and lea leave the flags from shr untouched
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    vbroadcasti128      m14, [base+wswap]
+    movq               xm11, r4q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8 ; sign-extend the 8 byte coefficients to words
+    vinserti128         m11, xm11, 1
+    mov                 r4d, [rsp+64] ; restore the lane 0 offset (r4 was used for the filter above)
+    mov                 r7d, [rsp+68] ; reload the lane 1 offset
+    pshufd               m8, m11, q0000 ; m8-m11: one vertical coefficient pair each, constant for the whole pass
+    pshufd               m9, m11, q1111
+    pshufd              m10, m11, q2222
+    pshufd              m11, m11, q3333
+.dy1_vloop: ; one output row per iteration; exactly one new source row is filtered each time
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pmaddwd              m6, m2, m10
+    pmaddwd              m7, m3, m11
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, tmp_stridem
+%endif
+    dec                  hd
+    jz .dy1_hloop_prep ; all rows of this pass are done
+    movq                xm4, [srcq+ r4] ; gather the next source row, 8 bytes per column offset
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    add                srcq, ssq
+    pshufb               m0, m14 ; m14 holds the wswap shuffle loaded in .dy1_hloop
+    pshufb               m1, m14
+    pshufb               m2, m14
+    pshufb               m3, m14
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, [rsp] ; horizontal coefficients saved from m10 in .dy1_hloop
+    phaddw               m4, m5
+    pslld                m5, m4, 16
+    paddw                m4, m5 ; the odd word of each pair now holds the full sum
+    pmulhrsw             m4, m12
+    pblendw              m0, m1, 0xaa ; take odd words from the next register: the window moves down one row
+    pblendw              m1, m2, 0xaa
+    pblendw              m2, m3, 0xaa
+    pblendw              m3, m4, 0xaa ; the new row enters in the odd words of m3
+    jmp .dy1_vloop
+.dy2: ; dispatch to the .dy2_w* handler through a table of 16-bit offsets
+    movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2] ; wq indexes the table on entry
+    add                  wq, base_reg ; table entries are relative to base_reg
+    jmp                  wq
+%ifidn %1, put
+.dy2_w2: ; put only: 2-column case where each output row consumes two new source rows
+    mov                 myd, mym
+    movzx               t0d, t0b ; keep only the low byte of t0
+    dec                srcq
+    movd               xm15, t0d
+    punpckldq            m8, m9, m8
+    paddd               m14, m8 ; mx+dx*[0-1]
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd              xm15, xm8 ; per-column index into subpel_filters
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 1
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vbroadcasti128       m6, [base+subpel_s_shuf2]
+    vpbroadcastd        m15, [base+subpel_filters+r4*8+2] ; bytes 2-5 of the 8-byte table entry
+    vpbroadcastd         m7, [base+subpel_filters+r6*8+2]
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movq                xm0, [srcq+ssq*0] ; rows 0 and 2 go to xm0, rows 1 and 3 are broadcast for blending
+    vpbroadcastq         m2, [srcq+ssq*1]
+    movhps              xm0, [srcq+ssq*2]
+    vpbroadcastq         m3, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pshufb              m14, m5
+    paddb               m14, m6 ; m14 becomes the pshufb control applied to the source rows
+    vpblendd            m15, m7, 0xaa ; interleave the two columns' horizontal coefficients
+    pblendvb            m15, m11, m8 ; masked columns get the pd_0x4000 constant instead
+    movhps              xm1, [srcq+ssq*0] ; row 4; the low half of xm1 is not written here
+    vpbroadcastq         m4, [srcq+ssq*1]
+    lea                srcq, [srcq+ssq*2] ; six rows consumed before the loop
+    shr                 myd, 6 ; sets ZF for the cmovnz below
+    mov                 r4d, 64 << 24 ; default coefficients, used when the cmovnz is not taken
+    lea                 myd, [t1+myq] ; mov and lea leave the flags from shr untouched
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    vpblendd             m0, m2, 0x30
+    vpblendd             m1, m4, 0xc0
+    vpblendd             m0, m3, 0xc0
+    pshufb               m0, m14
+    pshufb               m1, m14
+    pmaddubsw            m0, m15
+    pmaddubsw            m1, m15
+    movq               xm11, r4q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8 ; sign-extend the 8 byte coefficients to words
+    phaddw               m0, m1
+    pmulhrsw             m0, m12            ; 0 2 _ 4  1 3 _ 5
+    pshufd              xm8, xm11, q0000 ; xm8-xm11: one vertical coefficient pair each
+    pshufd              xm9, xm11, q1111
+    pshufd             xm10, xm11, q2222
+    pshufd             xm11, xm11, q3333
+    pshufd               m2, m0, q3110      ; 0 2 2 4  1 3 3 5
+    vextracti128        xm1, m2, 1
+    punpcklwd           xm3, xm2, xm1       ; 01 23
+    punpckhwd           xm2, xm1            ; 23 45
+.dy2_w2_loop: ; four source rows in, two output rows out per iteration
+    movq                xm6, [srcq+ssq*0]
+    vpbroadcastq         m7, [srcq+ssq*1]
+    movhps              xm6, [srcq+ssq*2]
+    vpbroadcastq         m1, [srcq+ss3q ]
+    lea                srcq, [srcq+ssq*4]
+    pmaddwd             xm4, xm3, xm8
+    pmaddwd             xm5, xm2, xm9
+    vpblendd             m6, m7, 0x30
+    vpblendd             m6, m1, 0xc0
+    pshufb               m6, m14
+    pmaddubsw            m6, m15
+    phaddw               m6, m6
+    pmulhrsw             m6, m12
+    palignr              m0, m6, m0, 8 ; shift the new rows into the running row history kept in m0
+    pshufd               m2, m0, q3221
+    vextracti128        xm1, m2, 1
+    punpcklwd           xm3, xm2, xm1       ; 45 67
+    punpckhwd           xm2, xm1            ; 67 89
+    pmaddwd             xm6, xm3, xm10
+    pmaddwd             xm7, xm2, xm11
+    paddd               xm4, xm5
+    paddd               xm4, xm13
+    paddd               xm6, xm7
+    paddd               xm4, xm6
+    psrad               xm4, rndshift
+    packssdw            xm4, xm4
+    packuswb            xm4, xm4
+    pextrw     [dstq+dsq*0], xm4, 0 ; store 2 bytes per output row
+    pextrw     [dstq+dsq*1], xm4, 1
+    lea                dstq, [dstq+dsq*2]
+    sub                  hd, 2
+    jg .dy2_w2_loop
+    RET
+%endif
+.dy2_w4: ; 4-column case where each output row consumes two new source rows
+    mov                 myd, mym
+    vbroadcasti128       m7, [base+rescale_mul]
+    movzx               t0d, t0b ; keep only the low byte of t0
+    dec                srcq
+    movd               xm15, t0d
+    pmaddwd              m8, m7
+    vpbroadcastd        m11, [base+pd_0x4000]
+    vpbroadcastd       xm15, xm15
+    paddd               m14, m8 ; mx+dx*[0-3]
+    pand                 m8, m14, m10
+    psrld                m8, 6
+    paddd              xm15, xm8 ; per-column index into subpel_filters
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 1
+    pextrd             r11d, xm15, 2
+    pextrd             r13d, xm15, 3
+    movd               xm15, [base+subpel_filters+r4*8+2] ; bytes 2-5 of the 8-byte table entry, one dword per column
+    vbroadcasti128       m5, [base+bdct_lb_dw]
+    vpbroadcastq         m6, [base+subpel_s_shuf2]
+    pinsrd             xm15, [base+subpel_filters+r6*8+2], 1
+    pcmpeqd              m8, m9
+    psrld               m14, 10
+    movu                xm0, [srcq+ssq*0] ; rows 0-3 go to xm0, xm1, xm2, xm3 (row 1 in xm1, row 2 in xm2)
+    movu                xm2, [srcq+ssq*2]
+    pinsrd             xm15, [base+subpel_filters+r11*8+2], 2
+    movu                xm1, [srcq+ssq*1]
+    movu                xm3, [srcq+ss3q ]
+    pinsrd             xm15, [base+subpel_filters+r13*8+2], 3
+    lea                srcq, [srcq+ssq*4]
+    shr                 myd, 6 ; sets ZF for the cmovnz below
+    mov                 r4d, 64 << 24 ; default coefficients, used when the cmovnz is not taken
+    lea                 myd, [t1+myq] ; mov and lea leave the flags from shr untouched
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    vinserti128         m15, xm15, 1 ; same four column filters in both lanes
+    pshufb              m14, m5
+    paddb               m14, m6 ; m14 becomes the pshufb control applied to the source rows
+    vinserti128          m2, [srcq+ssq*0], 1 ; rows 4 and 5 go to the high lanes of m2 and m3
+    vinserti128          m3, [srcq+ssq*1], 1
+    lea                srcq, [srcq+ssq*2] ; six rows consumed before the loop
+    pblendvb            m15, m11, m8 ; masked columns get the pd_0x4000 constant instead
+    pshufb              xm0, xm14
+    pshufb               m2, m14
+    pshufb              xm1, xm14
+    pshufb               m3, m14
+    pmaddubsw           xm0, xm15
+    pmaddubsw            m2, m15
+    pmaddubsw           xm1, xm15
+    pmaddubsw            m3, m15
+    movq               xm11, r4q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8 ; sign-extend the 8 byte coefficients to words
+    vinserti128         m11, xm11, 1
+    phaddw               m0, m2
+    phaddw               m1, m3
+    pmulhrsw             m0, m12    ; 0 2  _ 4
+    pmulhrsw             m1, m12    ; 1 3  _ 5
+    pshufd               m8, m11, q0000 ; m8-m11: one vertical coefficient pair each
+    pshufd               m9, m11, q1111
+    pshufd              m10, m11, q2222
+    pshufd              m11, m11, q3333
+    punpcklwd           xm2, xm0, xm1
+    punpckhwd            m1, m0, m1     ; 23 45
+    vinserti128          m0, m2, xm1, 1 ; 01 23
+.dy2_w4_loop: ; four source rows in, two output rows out per iteration
+    movu                xm6, [srcq+ssq*0]
+    movu                xm7, [srcq+ssq*1]
+    vinserti128          m6, [srcq+ssq*2], 1
+    vinserti128          m7, [srcq+ss3q ], 1
+    lea                srcq, [srcq+ssq*4]
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pshufb               m6, m14
+    pshufb               m7, m14
+    pmaddubsw            m6, m15
+    pmaddubsw            m7, m15
+    psrld                m2, m6, 16 ; m6 rows: fold each word pair into the even word
+    pslld                m3, m7, 16 ; m7 rows: fold each word pair into the odd word
+    paddw                m6, m2
+    paddw                m7, m3
+    pblendw              m6, m7, 0xaa   ; 67 89
+    pmulhrsw             m6, m12
+    paddd                m4, m5
+    vpblendd             m0, m1, m6, 0x0f
+    mova                 m1, m6
+    vpermq               m0, m0, q1032  ; 45 67
+    pmaddwd              m6, m0, m10
+    pmaddwd              m7, m1, m11
+    paddd                m4, m13
+    paddd                m6, m7
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movd       [dstq+dsq*0], xm4
+    pextrd     [dstq+dsq*1], xm4, 1
+    lea                dstq, [dstq+dsq*2]
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
+    sub                  hd, 2
+    jg .dy2_w4_loop
+    MC_8TAP_SCALED_RET
+.dy2_w8:
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    movd               xm15, t0d
+    pmaddwd              m8, [base+rescale_mul]
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd               m15, m6
+    pcmpeqd              m6, m9
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    vextracti128        xm7, m14, 1
+    movd                r4d, xm14
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    mov               [rsp], r7d
+    pshufd               m5, m6, q1100
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    shr                 myd, 6
+    lea                 myd, [t1+myq]
+    mov                 t1d, 64 << 24
+    cmovnz              t1q, [base+subpel_filters+myq*8]
+    movq               xm11, t1q
+    punpcklbw          xm11, xm11
+    psraw              xm11, 8
+    vinserti128         m11, xm11, 1
+    mov                 r7d, [rsp]
+    pshufd               m8, m11, q0000
+    pshufd               m9, m11, q1111
+    pshufd              m14, m11, q2222
+    pshufd              m11, m11, q3333
+.dy2_w8_loop:
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pmaddwd              m6, m2, m14
+    pmaddwd              m7, m3, m11
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
+    dec                  hd
+    jz .ret
+    mova                 m0, m1
+    mova                 m1, m2
+    mova                 m2, m3
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
+    vinserti128          m3, [srcq+r10], 1
+    vinserti128          m4, [srcq+r11], 1
+    vpbroadcastq         m5, [srcq+r13]
+    vpbroadcastq         m6, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd             m3, m5, 0xc0
+    vpblendd             m4, m6, 0xc0
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m10
+    phaddw               m3, m4
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m10
+    phaddw               m4, m5
+    psrld                m5, m3, 16
+    pslld                m6, m4, 16
+    paddw                m3, m5
+    paddw                m4, m6
+    pblendw              m3, m4, 0xaa
+    pmulhrsw             m3, m12
+    jmp .dy2_w8_loop
+.dy2_w16:
+    mov      dword [rsp+40], 2
+    movifprep   tmp_stridem, 32
+    jmp .dy2_w_start
+.dy2_w32:
+    mov      dword [rsp+40], 4
+    movifprep   tmp_stridem, 64
+    jmp .dy2_w_start
+.dy2_w64:
+    mov      dword [rsp+40], 8
+    movifprep   tmp_stridem, 128
+    jmp .dy2_w_start
+.dy2_w128:
+    mov      dword [rsp+40], 16
+    movifprep   tmp_stridem, 256
+.dy2_w_start:
+%ifidn %1, put
+    movifnidn           dsm, dsq
+%endif
+    shr                 t0d, 16
+    sub                srcq, 3
+    pmaddwd              m8, [base+rescale_mul]
+    movd               xm15, t0d
+    mov            [rsp+64], t0d
+    mov            [rsp+48], srcq
+    mov            [rsp+56], r0q ; dstq / tmpq
+%if UNIX64
+    mov                  hm, hd
+%endif
+    shl           dword dxm, 3 ; dx*8
+    vpbroadcastd        m15, xm15
+    paddd               m14, m8 ; mx+dx*[0-7]
+    jmp .dy2_hloop
+.dy2_hloop_prep:
+    dec      dword [rsp+40]
+    jz .ret
+    add      qword [rsp+56], 8*(isprep+1)
+    mov                  hd, hm
+    vpbroadcastd         m8, dxm
+    vpbroadcastd        m10, [base+pd_0x3ff]
+    paddd               m14, m8, [rsp]
+    vpbroadcastd        m15, [rsp+64]
+    pxor                 m9, m9
+    mov                srcq, [rsp+48]
+    mov                 r0q, [rsp+56] ; dstq / tmpq
+.dy2_hloop:
+    vpbroadcastq        m11, [base+pq_0x40000000]
+    pand                 m6, m14, m10
+    psrld                m6, 6
+    paddd               m15, m6
+    pcmpeqd              m6, m9
+    vextracti128        xm7, m15, 1
+    movd                r4d, xm15
+    pextrd              r6d, xm15, 2
+    pextrd              r7d, xm15, 1
+    pextrd              r9d, xm15, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    movu              [rsp], m14
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
+    vinserti128         m15, [base+subpel_filters+r10*8], 1
+    vinserti128         m10, [base+subpel_filters+r11*8], 1
+    vpbroadcastq         m9, [base+subpel_filters+r13*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
+    psrld               m14, 10
+    vextracti128        xm7, m14, 1
+    movq           [rsp+32], xm14
+    movd                r4d, xm14
+    pextrd              r6d, xm14, 2
+    pextrd              r7d, xm14, 1
+    pextrd              r9d, xm14, 3
+    movd               r10d, xm7
+    pextrd             r11d, xm7, 2
+    pextrd             r13d, xm7, 1
+    pextrd              rXd, xm7, 3
+    pshufd               m5, m6, q1100
+    pshufd               m6, m6, q3322
+    vpblendd            m15, m9, 0xc0
+    vpblendd            m10, m8, 0xc0
+    pblendvb            m15, m11, m5
+    pblendvb            m10, m11, m6
+    vbroadcasti128      m14, [base+subpel_s_shuf8]
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    mov                 myd, mym
+    pshufb               m0, m14    ; 01a 01b
+    pshufb               m1, m14    ; 23a 23b
+    pshufb               m2, m14    ; 45a 45b
+    pshufb               m3, m14    ; 67a 67b
+    shr                 myd, 6
+    mov                 r4d, 64 << 24
+    lea                 myd, [t1+myq]
+    cmovnz              r4q, [base+subpel_filters+myq*8]
+    movq               xm14, r4q
+    punpcklbw          xm14, xm14
+    psraw              xm14, 8
+    vinserti128         m14, xm14, 1
+    mov                 r4d, [rsp+32]
+    mov                 r7d, [rsp+36]
+    pshufd               m8, m14, q0000
+    pshufd               m9, m14, q1111
+    pshufd              m11, m14, q2222
+    pshufd              m14, m14, q3333
+.dy2_vloop:
+    pmaddwd              m4, m0, m8
+    pmaddwd              m5, m1, m9
+    pmaddwd              m6, m2, m11
+    pmaddwd              m7, m3, m14
+    paddd                m4, m5
+    paddd                m6, m7
+    paddd                m4, m13
+    paddd                m4, m6
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
+    packuswb            xm4, xm4
+    movq             [dstq], xm4
+    add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, tmp_stridem
+%endif
+    dec                  hd
+    jz .dy2_hloop_prep
+    mova                 m0, m1
+    mova                 m1, m2
+    mova                 m2, m3
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
+    vinserti128          m3, [srcq+r10], 1
+    vinserti128          m4, [srcq+r11], 1
+    vpbroadcastq         m5, [srcq+r13]
+    vpbroadcastq         m6, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd             m3, m5, 0xc0
+    vpblendd             m4, m6, 0xc0
+    pmaddubsw            m3, m15
+    pmaddubsw            m4, m10
+    phaddw               m3, m4
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
+    vinserti128          m4, [srcq+r10], 1
+    vinserti128          m5, [srcq+r11], 1
+    vpbroadcastq         m6, [srcq+r13]
+    vpbroadcastq         m7, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd             m4, m6, 0xc0
+    vpblendd             m5, m7, 0xc0
+    pmaddubsw            m4, m15
+    pmaddubsw            m5, m10
+    phaddw               m4, m5
+    psrld                m5, m3, 16
+    pslld                m6, m4, 16
+    paddw                m3, m5
+    paddw                m4, m6
+    pblendw              m3, m4, 0xaa
+    pmulhrsw             m3, m12
+    jmp .dy2_vloop
+.ret:
+    MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1 ; put/prep
+cglobal %1_bilin_scaled
+    mov                 t0d, (5*15 << 16) | 5*15 ; t0: filter-set offset; the 8tap_scaled body shifts it >>16 and adds it to the horizontal subpel index
+    mov                 t1d, (5*15 << 16) | 5*15 ; t1: filter-set offset; the 8tap_scaled body adds it to my before indexing subpel_filters
+    jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX) ; no code of its own: continue in the shared <put/prep>_8tap_scaled function
+%endmacro
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
+%if WIN64
+DECLARE_REG_TMP 6, 5 ; presumably selects the registers behind t0/t1 for the put variant on WIN64 - confirm against DECLARE_REG_TMP
+%else
+DECLARE_REG_TMP 6, 8 ; same, for the other x86-64 ABIs
+%endif
+BILIN_SCALED_FN put ; put_bilin_scaled entry point (jumps into put_8tap_scaled)
+PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
+PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
+PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
+PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP
+PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
+MC_8TAP_SCALED put ; body shared by all put_*_scaled entry points above
+
+%if WIN64
+DECLARE_REG_TMP 5, 4 ; presumably selects the registers behind t0/t1 for the prep variant on WIN64 - confirm against DECLARE_REG_TMP
+%else
+DECLARE_REG_TMP 6, 7 ; same, for the other x86-64 ABIs
+%endif
+BILIN_SCALED_FN prep ; prep_bilin_scaled entry point (jumps into prep_8tap_scaled)
+PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP
+PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
+MC_8TAP_SCALED prep ; body shared by all prep_*_scaled entry points above
+
 %macro WARP_V 5 ; dst, 02, 46, 13, 57
     ; Can be done using gathers, but that's terribly slow on many CPU:s
     lea               tmp1d, [myq+deltaq*4]
@@ -4869,9 +6772,6 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix
 .v_loop_%3:
 %if %1
-    test           leftextq, leftextq
-    jz .body_%3
-
     ; left extension
     xor                  r3, r3
     vpbroadcastb         m0, [srcq]
@@ -4882,7 +6782,6 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
     jl .left_loop_%3
 
     ; body
-.body_%3:
     lea                 r12, [dstq+leftextq]
 %endif
     xor                  r3, r3
@@ -4899,8 +6798,6 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
 
 %if %2
     ; right extension
-    test          rightextq, rightextq
-    jz .body_loop_end_%3
 %if %1
     add                 r12, centerwq
 %else
@@ -4914,7 +6811,6 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
     cmp                  r3, rightextq
     jl .right_loop_%3
 
-.body_loop_end_%3:
 %endif
     add                dstq, dstrideq
     add                srcq, sstrideq
@@ -4985,6 +6881,147 @@ cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
 .end:
     RET
 
+cextern resize_filter
+
+INIT_YMM avx2
+cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
+                           dst_w, h, src_w, dx, mx0
+    sub          dword mx0m, 4<<14 ; mx0 -= 4 source pixels (positions carry 14 fractional bits, see the >>14 below)
+    sub        dword src_wm, 8 ; src_w -= 8: becomes the upper clip bound for mx
+    vpbroadcastd         m5, dxm ; m5 = dx in every dword
+    vpbroadcastd         m8, mx0m ; m8 = biased mx0 in every dword
+    vpbroadcastd         m6, src_wm ; m6 = src_w-8 in every dword
+
+    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr ; re-label registers: x is the output column counter used by .loop_x
+    LEA                  r7, $$ ; r7 = section start, the anchor for the base+symbol constant loads
+%define base r7-$$
+
+    vpbroadcastd         m3, [base+pw_m256]
+    vpbroadcastd         m7, [base+pd_63]
+    vbroadcasti128      m15, [base+pb_8x0_8x8]
+    pmaddwd              m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
+    pslld                m5, 3                      ; dx*8
+    pslld                m6, 14 ; (src_w-8) in the same 14-bit fixed point as mx
+    paddd                m8, m2                     ; mx+[0..7]*dx
+    pxor                 m2, m2
+
+    ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
+    ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = (src_w-8)<<14, m7 = 0x3f, m15=0,8
+
+.loop_y:
+    xor                  xd, xd ; x = 0: restart at the first output column of this row
+    mova                 m4, m8                     ; per-line working version of mx
+
+.loop_x:
+    pmaxsd               m0, m4, m2 ; max(mx, 0)
+    psrad                m9, m4, 8                  ; filter offset (unmasked)
+    pminsd               m0, m6                     ; iclip(mx, 0, src_w-8)
+    psubd                m1, m4, m0                 ; pshufb offset
+    psrad                m0, 14                     ; clipped src_x offset
+    psrad                m1, 14                     ; pshufb edge_emu offset
+    pand                 m9, m7                     ; filter offset (masked)
+
+    ; load source pixels - this ugly code is vpgatherdq emulation since
+    ; directly using vpgatherdq on Haswell is quite a bit slower :(
+    movd                r8d, xm0
+    pextrd              r9d, xm0, 1
+    pextrd             r10d, xm0, 2
+    pextrd             r11d, xm0, 3
+    vextracti128        xm0, m0, 1 ; bring the offsets of output pixels 4-7 into the low half
+    movq               xm12, [srcq+r8] ; m12 collects the 8-byte source windows of output pixels 0,1,4,5
+    movq               xm13, [srcq+r10] ; m13 collects the 8-byte source windows of output pixels 2,3,6,7
+    movhps             xm12, [srcq+r9]
+    movhps             xm13, [srcq+r11]
+    movd                r8d, xm0
+    pextrd              r9d, xm0, 1
+    pextrd             r10d, xm0, 2
+    pextrd             r11d, xm0, 3
+    vinserti128         m12, [srcq+r8], 1 ; pixel 4 window (16-byte load; its upper qword is replaced by the blend below)
+    vinserti128         m13, [srcq+r10], 1 ; pixel 6 window, same scheme
+    vpbroadcastq        m10, [srcq+r9] ; pixel 5 window
+    vpbroadcastq        m11, [srcq+r11] ; pixel 7 window
+    vpblendd            m12, m12, m10, 11000000b ; top qword <- pixel 5
+    vpblendd            m13, m13, m11, 11000000b ; top qword <- pixel 7
+
+    ; if no emulation is required, we don't need to shuffle or emulate edges
+    ; this also saves 2 quasi-vpgatherdqs
+    vptest               m1, m1 ; ZF=1 when every edge offset is zero
+    jz .filter
+
+    movd                r8d, xm1
+    pextrd              r9d, xm1, 1
+    pextrd             r10d, xm1, 2
+    pextrd             r11d, xm1, 3
+    movsxd               r8, r8d ; edge offsets are signed (negative when mx < 0): sign-extend for addressing
+    movsxd               r9, r9d
+    movsxd              r10, r10d
+    movsxd              r11, r11d
+    vextracti128        xm1, m1, 1 ; edge offsets of output pixels 4-7
+    movq               xm14, [base+resize_shuf+4+r8] ; 8-byte pshufb mask per pixel, selected by its edge offset
+    movq                xm0, [base+resize_shuf+4+r10]
+    movhps             xm14, [base+resize_shuf+4+r9]
+    movhps              xm0, [base+resize_shuf+4+r11]
+    movd                r8d, xm1
+    pextrd              r9d, xm1, 1
+    pextrd             r10d, xm1, 2
+    pextrd             r11d, xm1, 3
+    movsxd               r8, r8d
+    movsxd               r9, r9d
+    movsxd              r10, r10d
+    movsxd              r11, r11d
+    vinserti128         m14, [base+resize_shuf+4+r8], 1
+    vinserti128          m0, [base+resize_shuf+4+r10], 1
+    vpbroadcastq        m10, [base+resize_shuf+4+r9]
+    vpbroadcastq        m11, [base+resize_shuf+4+r11]
+    vpblendd            m14, m14, m10, 11000000b ; masks laid out like the pixels: 0,1,4,5
+    vpblendd             m0, m0, m11, 11000000b ; masks laid out like the pixels: 2,3,6,7
+
+    paddb               m14, m15 ; m15=0,8: bias the upper-qword masks so they index bytes 8-15 of their own lane
+    paddb                m0, m15
+    pshufb              m12, m14 ; rearrange each 8-byte window according to its mask
+    pshufb              m13, m0
+
+.filter:
+    movd                r8d, xm9 ; m9 = filter index per output pixel; each resize_filter row is 8 bytes (index*8)
+    pextrd              r9d, xm9, 1
+    pextrd             r10d, xm9, 2
+    pextrd             r11d, xm9, 3
+    vextracti128        xm9, m9, 1 ; filter indices of output pixels 4-7
+    movq               xm10, [base+resize_filter+r8*8] ; m10 collects the taps for pixels 0,1,4,5 (matches m12)
+    movq               xm11, [base+resize_filter+r10*8] ; m11 collects the taps for pixels 2,3,6,7 (matches m13)
+    movhps             xm10, [base+resize_filter+r9*8]
+    movhps             xm11, [base+resize_filter+r11*8]
+    movd                r8d, xm9
+    pextrd              r9d, xm9, 1
+    pextrd             r10d, xm9, 2
+    pextrd             r11d, xm9, 3
+    vinserti128         m10, [base+resize_filter+r8*8], 1
+    vinserti128         m11, [base+resize_filter+r10*8], 1
+    vpbroadcastq        m14, [base+resize_filter+r9*8]
+    vpbroadcastq         m1, [base+resize_filter+r11*8]
+    vpblendd            m10, m10, m14, 11000000b
+    vpblendd            m11, m11, m1, 11000000b
+
+    pmaddubsw           m12, m10 ; unsigned pixel bytes * signed tap bytes, adjacent pairs summed into words
+    pmaddubsw           m13, m11
+    phaddw              m12, m13 ; 4 partial sums per pixel -> 2
+    vextracti128       xm13, m12, 1
+    phaddsw            xm12, xm13 ; 2 -> 1: one saturated 16-bit sum per output pixel, in order 0..7
+    pmulhrsw           xm12, xm3                    ; x=(x+64)>>7
+    packuswb           xm12, xm12 ; saturate to unsigned bytes
+    movq          [dstq+xq], xm12 ; store 8 output pixels
+
+    paddd                m4, m5 ; mx += 8*dx for the next group of 8 columns
+    add                  xd, 8
+    cmp                  xd, dst_wd
+    jl .loop_x
+
+    add                dstq, dst_strideq ; advance to the next row
+    add                srcq, src_strideq
+    dec                  hd
+    jg .loop_y
+    RET
+
 INIT_YMM avx2
 PREP_BILIN
 PREP_8TAP
@@ -5501,6 +7538,7 @@ cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
     jg .w128_loop
     RET
 
+%if HAVE_AVX512ICL
 INIT_ZMM avx512icl
 PREP_BILIN
 PREP_8TAP
@@ -6023,4 +8061,6 @@ cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
     jg .w128_loop
     RET
 
+%endif ; HAVE_AVX512ICL
+
 %endif ; ARCH_X86_64
diff --git a/ffmpeg/JNI/dav1d/src/x86/mc_init_tmpl.c b/ffmpeg/JNI/dav1d/src/x86/mc_init_tmpl.c
index 8e2f6a0ba..a01ac14ab 100644
--- a/ffmpeg/JNI/dav1d/src/x86/mc_init_tmpl.c
+++ b/ffmpeg/JNI/dav1d/src/x86/mc_init_tmpl.c
@@ -52,33 +52,65 @@ decl_mc_fn(dav1d_put_bilin_ssse3);
 decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_sse2);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_sse2);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_sse2);
 decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_sse2);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_sse2);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_sse2);
 decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_sse2);
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_sse2);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_sse2);
 decl_mct_fn(dav1d_prep_bilin_avx512icl);
 decl_mct_fn(dav1d_prep_bilin_avx2);
 decl_mct_fn(dav1d_prep_bilin_ssse3);
+decl_mct_fn(dav1d_prep_bilin_sse2);
+
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_bilin_scaled_avx2);
+
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_bilin_scaled_avx2);
 
 decl_avg_fn(dav1d_avg_avx512icl);
 decl_avg_fn(dav1d_avg_avx2);
@@ -115,17 +147,36 @@ decl_warp8x8t_fn(dav1d_warp_affine_8x8t_sse2);
 decl_emu_edge_fn(dav1d_emu_edge_avx2);
 decl_emu_edge_fn(dav1d_emu_edge_ssse3);
 
+decl_resize_fn(dav1d_resize_avx2);
+decl_resize_fn(dav1d_resize_ssse3);
+
 COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
 #define init_mc_fn(type, name, suffix) \
     c->mc[type] = dav1d_put_##name##_##suffix
 #define init_mct_fn(type, name, suffix) \
     c->mct[type] = dav1d_prep_##name##_##suffix
+#define init_mc_scaled_fn(type, name, suffix) \
+    c->mc_scaled[type] = dav1d_put_##name##_##suffix
+#define init_mct_scaled_fn(type, name, suffix) \
+    c->mct_scaled[type] = dav1d_prep_##name##_##suffix
+
     const unsigned flags = dav1d_get_cpu_flags();
 
     if(!(flags & DAV1D_X86_CPU_FLAG_SSE2))
         return;
 
 #if BITDEPTH == 8
+    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               sse2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        sse2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, sse2);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  sse2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, sse2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         sse2);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   sse2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  sse2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   sse2);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          sse2);
+
     c->warp8x8  = dav1d_warp_affine_8x8_sse2;
     c->warp8x8t = dav1d_warp_affine_8x8t_sse2;
 #endif
@@ -134,16 +185,16 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
         return;
 
 #if BITDEPTH == 8
-    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               ssse3);
-    init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        ssse3);
-    init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
-    init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  ssse3);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         ssse3);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   ssse3);
-    init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  ssse3);
-    init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   ssse3);
-    init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
+    init_mc_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        ssse3);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   ssse3);
+    init_mc_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          ssse3);
 
     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);
     init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        ssse3);
@@ -168,6 +219,7 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
     c->warp8x8t = dav1d_warp_affine_8x8t_ssse3;
 
     c->emu_edge = dav1d_emu_edge_ssse3;
+    c->resize = dav1d_resize_ssse3;
 #endif
 
     if(!(flags & DAV1D_X86_CPU_FLAG_SSE41))
@@ -183,16 +235,16 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
         return;
 
 #if BITDEPTH == 8
-    init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
-    init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
-    init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx2);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx2);
-    init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx2);
-    init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx2);
-    init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
-    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               avx2);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
+    init_mc_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx2);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx2);
+    init_mc_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx2);
+    init_mc_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx2);
+    init_mc_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx2);
+    init_mc_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
+    init_mc_fn(FILTER_2D_BILINEAR,            bilin,               avx2);
 
     init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
     init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
@@ -205,6 +257,28 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
     init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               avx2);
 
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
+    init_mc_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               avx2);
+
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
+    init_mct_scaled_fn(FILTER_2D_BILINEAR,            bilin_scaled,               avx2);
+
     c->avg = dav1d_avg_avx2;
     c->w_avg = dav1d_w_avg_avx2;
     c->mask = dav1d_mask_avx2;
@@ -219,12 +293,13 @@ COLD void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
     c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
 
     c->emu_edge = dav1d_emu_edge_avx2;
+    c->resize = dav1d_resize_avx2;
 #endif
 
     if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL))
         return;
 
-#if BITDEPTH == 8
+#if HAVE_AVX512ICL && BITDEPTH == 8
     init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx512icl);
     init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
     init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx512icl);
diff --git a/ffmpeg/JNI/dav1d/src/x86/mc_ssse3.asm b/ffmpeg/JNI/dav1d/src/x86/mc_sse.asm
similarity index 80%
rename from ffmpeg/JNI/dav1d/src/x86/mc_ssse3.asm
rename to ffmpeg/JNI/dav1d/src/x86/mc_sse.asm
index d0f0dd30b..d98ac621e 100644
--- a/ffmpeg/JNI/dav1d/src/x86/mc_ssse3.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/mc_sse.asm
@@ -57,7 +57,17 @@ subpel_h_shufC: db 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 bilin_h_shuf4:  db 1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
 bilin_h_shuf8:  db 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
 
+pb_8x0_8x8: times 8 db 0
+            times 8 db 8
+resize_mul: dd 0, 1, 2, 3
+resize_shuf: times 5 db 0
+             db 1, 2, 3, 4, 5, 6
+             times 5+16 db 7
+
 pb_64:    times 16 db 64
+pw_m256:  times 8 dw -256
+pw_1:     times 8 dw 1
+pw_2:     times 8 dw 2
 pw_8:     times 8 dw 8
 pw_26:    times 8 dw 26
 pw_34:    times 8 dw 34
@@ -67,6 +77,7 @@ pw_2048:  times 8 dw 2048
 pw_6903:  times 8 dw 6903
 pw_8192:  times 8 dw 8192
 pd_32:    times 4 dd 32
+pd_63:    times 4 dd 63
 pd_512:   times 4 dd 512
 pd_16384: times 4 dd 16484
 pd_32768: times 4 dd 32768
@@ -108,6 +119,7 @@ BIDIR_JMP_TABLE blend_h_ssse3, 2, 4, 8, 16, 16, 16, 16
     %endrep
 %endmacro
 
+%xdefine prep_sse2 mangle(private_prefix %+ _prep_bilin_sse2.prep)
 %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
 %xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep)
 
@@ -146,6 +158,8 @@ BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128
     %endif
 %endmacro
 
+HV_JMP_TABLE prep,  8tap,  sse2, 1,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin,  sse2, 7,    4, 8, 16, 32, 64, 128
 HV_JMP_TABLE put,   8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep,  8tap, ssse3, 1,    4, 8, 16, 32, 64, 128
 HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
@@ -729,15 +743,79 @@ cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy, bak
     lea                 t0d, [hq+(7<<16)]
     jmp .hv_w16gt
 
+%macro PSHUFB_0X1X 1-2 ; dst[, src] - src (shuffle mask) is only read on the SSSE3 path
+ %if cpuflag(ssse3)
+    pshufb               %1, %2
+ %else
+    punpcklbw            %1, %1 ; SSE2 fallback: copy each low byte into both halves of a word
+    psraw                %1, 8 ; keep the upper copy, sign-extended to 16 bits
+    pshufd               %1, %1, q0000 ; broadcast the lowest dword (first two words) to all lanes
+ %endif
+%endmacro
+
+%macro PSHUFB_BILIN_H8 2 ; dst, src - on SSE2 src is overwritten and used as scratch
+ %if cpuflag(ssse3)
+    pshufb               %1, %2
+ %else
+    mova                 %2, %1 ; keep an unshifted copy of the input
+    psrldq               %1, 1 ; shift the input down by one byte
+    punpcklbw            %1, %2 ; interleave bytes as 1,0,2,1,3,2,... (same order as bilin_h_shuf8)
+ %endif
+%endmacro
+
+%macro PSHUFB_BILIN_H4 3 ; dst, src, tmp - on SSE2 src and tmp are both clobbered
+ %if cpuflag(ssse3)
+    pshufb               %1, %2
+ %else
+    mova                 %2, %1 ; keep an unshifted copy of the input
+    psrldq               %1, 1 ; shift the input down by one byte
+    punpckhbw            %3, %1, %2 ; byte pairs 9,8,10,9,... from the upper qword
+    punpcklbw            %1, %2 ; byte pairs 1,0,2,1,... from the lower qword
+    punpcklqdq           %1, %3 ; join the low halves, same order as bilin_h_shuf4
+ %endif
+%endmacro
+
+%macro PMADDUBSW 5 ; dst/src1, src2, zero, tmp, reset_zero - zero/tmp/reset_zero only matter on SSE2
+ %if cpuflag(ssse3)
+    pmaddubsw            %1, %2
+ %else
+  %if %5 == 1
+    pxor                 %3, %3 ; caller asked for the zero register to be re-cleared
+  %endif
+    punpckhbw            %4, %1, %3 ; zero-extend the upper 8 bytes to words
+    punpcklbw            %1, %1, %3 ; zero-extend the lower 8 bytes to words
+    pmaddwd              %4, %2 ; word multiply-add, so src2 is read as words on this path
+    pmaddwd              %1, %2
+    packssdw             %1, %4 ; pack the dword sums back to words with signed saturation
+ %endif
+%endmacro
+
+%macro PMULHRSW 5 ; dst, src, tmp, rndval, shift - tmp/rndval/shift are only used on SSE2
+ %if cpuflag(ssse3)
+    pmulhrsw             %1, %2
+ %else
+    punpckhwd            %3, %1, %4 ; pair each upper dst word with a rndval word
+    punpcklwd            %1, %4 ; pair each lower dst word with a rndval word
+    pmaddwd              %3, %2 ; per dword: dst word * even src word + rndval word * odd src word
+    pmaddwd              %1, %2
+    psrad                %3, %5 ; arithmetic shift right by the given count
+    psrad                %1, %5
+    packssdw             %1, %3 ; pack the dwords back to words with signed saturation
+ %endif
+%endmacro
+
+%macro PREP_BILIN 0
+
 DECLARE_REG_TMP 3, 5, 6
 %if ARCH_X86_32
- %define base        t2-prep_ssse3
+ %define base        t2-prep%+SUFFIX
 %else
  %define base        0
 %endif
+
 cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movifnidn          mxyd, r5m ; mx
-    LEA                  t2, prep_ssse3
+    LEA                  t2, prep%+SUFFIX
     tzcnt                wd, wm
     movifnidn            hd, hm
     test               mxyd, mxyd
@@ -746,6 +824,10 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     test               mxyd, mxyd
     jnz .v
 .prep:
+%if notcpuflag(ssse3)
+    add                  t2, prep_ssse3 - prep_sse2
+    jmp prep_ssse3
+%else
     movzx                wd, word [t2+wq*2+table_offset(prep,)]
     add                  wq, t2
     lea            stride3q, [strideq*3]
@@ -815,10 +897,18 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     sub                  hd, 2
     jg .prep_w16
     RET
-.prep_w16gt:
+.prep_w32:
+    mov                 t2d, 1
+    jmp .prep_w32_vloop
+.prep_w64:
+    mov                 t2d, 2
+    jmp .prep_w32_vloop
+.prep_w128:
+    mov                 t2d, 4
+.prep_w32_vloop:
     mov                 t1q, srcq
-    mov                 r3q, t2q
-.prep_w16gt_hloop:
+    mov                 r3d, t2d
+.prep_w32_hloop:
     movq                 m0, [t1q+8*0]
     movq                 m1, [t1q+8*1]
     movq                 m2, [t1q+8*2]
@@ -838,45 +928,49 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     mova        [tmpq+16*3], m3
     add                tmpq, 16*4
     add                 t1q, 32
-    sub                 r3q, 1
-    jg .prep_w16gt_hloop
+    dec                 r3d
+    jg .prep_w32_hloop
     lea                srcq, [srcq+strideq]
-    sub                  hd, 1
-    jg .prep_w16gt
+    dec                  hd
+    jg .prep_w32_vloop
     RET
-.prep_w32:
-    mov                 t2q, 1
-    jmp .prep_w16gt
-.prep_w64:
-    mov                 t2q, 2
-    jmp .prep_w16gt
-.prep_w128:
-    mov                 t2q, 4
-    jmp .prep_w16gt
+%endif
 .h:
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
     imul               mxyd, 0xff01
+%if cpuflag(ssse3)
     mova                 m4, [base+bilin_h_shuf8]
+%endif
     add                mxyd, 16 << 8
-    movd                xm5, mxyd
+    movd                 m5, mxyd
     mov                mxyd, r6m ; my
+%if cpuflag(ssse3)
     pshuflw              m5, m5, q0000
     punpcklqdq           m5, m5
+%else
+    PSHUFB_0X1X          m5
+%endif
     test               mxyd, mxyd
     jnz .hv
 %if ARCH_X86_32
     mov                  t1, t2 ; save base reg for w4
 %endif
     movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+%if notcpuflag(ssse3)
+    WIN64_SPILL_XMM 8
+    pxor                 m6, m6
+%endif
     add                  wq, t2
     lea            stride3q, [strideq*3]
     jmp                  wq
 .h_w4:
-%if ARCH_X86_32
+%if cpuflag(ssse3)
+ %if ARCH_X86_32
     mova                 m4, [t1-prep_ssse3+bilin_h_shuf4]
-%else
+ %else
     mova                 m4, [bilin_h_shuf4]
+ %endif
 %endif
 .h_w4_loop:
     movq                 m0, [srcq+strideq*0]
@@ -884,10 +978,10 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movq                 m1, [srcq+strideq*2]
     movhps               m1, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    pshufb               m0, m4
-    pmaddubsw            m0, m5
-    pshufb               m1, m4
-    pmaddubsw            m1, m5
+    PSHUFB_BILIN_H4      m0, m4, m2
+    PMADDUBSW            m0, m5, m6, m2, 0
+    PSHUFB_BILIN_H4      m1, m4, m2
+    PMADDUBSW            m1, m5, m6, m2, 0
     mova          [tmpq+0 ], m0
     mova          [tmpq+16], m1
     add                tmpq, 32
@@ -900,14 +994,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movu                 m2, [srcq+strideq*2]
     movu                 m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    pshufb               m0, m4
-    pshufb               m1, m4
-    pshufb               m2, m4
-    pshufb               m3, m4
-    pmaddubsw            m0, m5
-    pmaddubsw            m1, m5
-    pmaddubsw            m2, m5
-    pmaddubsw            m3, m5
+    PSHUFB_BILIN_H8      m0, m4
+    PSHUFB_BILIN_H8      m1, m4
+    PSHUFB_BILIN_H8      m2, m4
+    PSHUFB_BILIN_H8      m3, m4
+    PMADDUBSW            m0, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
+    PMADDUBSW            m2, m5, m6, m7, 0
+    PMADDUBSW            m3, m5, m6, m7, 0
     mova        [tmpq+16*0], m0
     mova        [tmpq+16*1], m1
     mova        [tmpq+16*2], m2
@@ -922,14 +1016,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movu                 m2, [srcq+strideq*1+8*0]
     movu                 m3, [srcq+strideq*1+8*1]
     lea                srcq, [srcq+strideq*2]
-    pshufb               m0, m4
-    pshufb               m1, m4
-    pshufb               m2, m4
-    pshufb               m3, m4
-    pmaddubsw            m0, m5
-    pmaddubsw            m1, m5
-    pmaddubsw            m2, m5
-    pmaddubsw            m3, m5
+    PSHUFB_BILIN_H8      m0, m4
+    PSHUFB_BILIN_H8      m1, m4
+    PSHUFB_BILIN_H8      m2, m4
+    PSHUFB_BILIN_H8      m3, m4
+    PMADDUBSW            m0, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
+    PMADDUBSW            m2, m5, m6, m7, 0
+    PMADDUBSW            m3, m5, m6, m7, 0
     mova        [tmpq+16*0], m0
     mova        [tmpq+16*1], m1
     mova        [tmpq+16*2], m2
@@ -938,52 +1032,60 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     sub                  hd, 2
     jg .h_w16
     RET
-.h_w16gt:
+.h_w32:
+    mov                 t2d, 1 << 0
+    jmp .h_w32_vloop
+.h_w64:
+    mov                 t2d, 1 << 1
+    jmp .h_w32_vloop
+.h_w128:
+    mov                 t2d, 1 << 3
+.h_w32_vloop:
     mov                 t1q, srcq
-    mov                 r3q, t2q
-.h_w16gt_hloop:
+    mov                 r3d, t2d
+.h_w32_hloop:
     movu                 m0, [t1q+8*0]
     movu                 m1, [t1q+8*1]
     movu                 m2, [t1q+8*2]
     movu                 m3, [t1q+8*3]
-    pshufb               m0, m4
-    pshufb               m1, m4
-    pshufb               m2, m4
-    pshufb               m3, m4
-    pmaddubsw            m0, m5
-    pmaddubsw            m1, m5
-    pmaddubsw            m2, m5
-    pmaddubsw            m3, m5
+    PSHUFB_BILIN_H8      m0, m4
+    PSHUFB_BILIN_H8      m1, m4
+    PSHUFB_BILIN_H8      m2, m4
+    PSHUFB_BILIN_H8      m3, m4
+    PMADDUBSW            m0, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
+    PMADDUBSW            m2, m5, m6, m7, 0
+    PMADDUBSW            m3, m5, m6, m7, 0
     mova        [tmpq+16*0], m0
     mova        [tmpq+16*1], m1
     mova        [tmpq+16*2], m2
     mova        [tmpq+16*3], m3
     add                tmpq, 16*4
     add                 t1q, 32
-    sub                 r3q, 1
-    jg .h_w16gt_hloop
+    shr                 r3d, 1
+    jnz .h_w32_hloop
     lea                srcq, [srcq+strideq]
     sub                  hd, 1
-    jg .h_w16gt
+    jg .h_w32_vloop
     RET
-.h_w32:
-    mov                 t2q, 1
-    jmp .h_w16gt
-.h_w64:
-    mov                 t2q, 2
-    jmp .h_w16gt
-.h_w128:
-    mov                 t2q, 4
-    jmp .h_w16gt
 .v:
+%if notcpuflag(ssse3)
+ %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM 8
+%endif
     movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
     imul               mxyd, 0xff01
     add                mxyd, 16 << 8
     add                  wq, t2
     lea            stride3q, [strideq*3]
     movd                 m5, mxyd
+%if cpuflag(ssse3)
     pshuflw              m5, m5, q0000
     punpcklqdq           m5, m5
+%else
+    PSHUFB_0X1X          m5
+    pxor                 m6, m6
+%endif
     jmp                  wq
 .v_w4:
     movd                 m0, [srcq+strideq*0]
@@ -995,14 +1097,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     punpcklwd            m0, m1  ; 0 1 _ _
     punpcklwd            m1, m2  ; 1 2 _ _
     punpcklbw            m1, m0
-    pmaddubsw            m1, m5
+    PMADDUBSW            m1, m5, m6, m7, 0
     pshufd               m1, m1, q3120
     mova        [tmpq+16*0], m1
     movd                 m0, [srcq+strideq*0]
     punpcklwd            m2, m3  ; 2 3 _ _
     punpcklwd            m3, m0  ; 3 4 _ _
     punpcklbw            m3, m2
-    pmaddubsw            m3, m5
+    PMADDUBSW            m3, m5, m6, m7, 0
     pshufd               m3, m3, q3120
     mova        [tmpq+16*1], m3
     add                tmpq, 32
@@ -1016,20 +1118,20 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movq                 m2, [srcq+strideq*1]
     movq                 m3, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    shufpd               m4, m0, m1, 0x0c ; 0 2
+    shufpd               m4, m0, m1, 0x0c       ; 0 2
     movq                 m0, [srcq+strideq*0]
-    shufpd               m2, m3, 0x0c ; 1 3
-    shufpd               m1, m0, 0x0c ; 2 4
+    shufpd               m2, m3, 0x0c           ; 1 3
+    shufpd               m1, m0, 0x0c           ; 2 4
     punpcklbw            m3, m2, m4
-    pmaddubsw            m3, m5
+    PMADDUBSW            m3, m5, m6, m7, 0
     mova        [tmpq+16*0], m3
     punpckhbw            m3, m2, m4
-    pmaddubsw            m3, m5
+    PMADDUBSW            m3, m5, m6, m7, 0
     mova        [tmpq+16*2], m3
     punpcklbw            m3, m1, m2
     punpckhbw            m1, m2
-    pmaddubsw            m3, m5
-    pmaddubsw            m1, m5
+    PMADDUBSW            m3, m5, m6, m7, 0
+    PMADDUBSW            m1, m5, m6, m7, 0
     mova        [tmpq+16*1], m3
     mova        [tmpq+16*3], m1
     add                tmpq, 16*4
@@ -1043,14 +1145,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     movu                 m2, [srcq+strideq*2]
     punpcklbw            m3, m1, m0
     punpckhbw            m4, m1, m0
-    pmaddubsw            m3, m5
-    pmaddubsw            m4, m5
+    PMADDUBSW            m3, m5, m6, m7, 0
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq+16*0], m3
     mova        [tmpq+16*1], m4
     punpcklbw            m3, m2, m1
     punpckhbw            m4, m2, m1
-    pmaddubsw            m3, m5
-    pmaddubsw            m4, m5
+    PMADDUBSW            m3, m5, m6, m7, 0
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq+16*2], m3
     mova        [tmpq+16*3], m4
     movu                 m3, [srcq+stride3q ]
@@ -1059,14 +1161,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     add                tmpq, 16*8
     punpcklbw            m1, m3, m2
     punpckhbw            m4, m3, m2
-    pmaddubsw            m1, m5
-    pmaddubsw            m4, m5
+    PMADDUBSW            m1, m5, m6, m7, 0
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq-16*4], m1
     mova        [tmpq-16*3], m4
     punpcklbw            m1, m0, m3
     punpckhbw            m2, m0, m3
-    pmaddubsw            m1, m5
-    pmaddubsw            m2, m5
+    PMADDUBSW            m1, m5, m6, m7, 0
+    PMADDUBSW            m2, m5, m6, m7, 0
     mova        [tmpq-16*2], m1
     mova        [tmpq-16*1], m2
     sub                  hd, 4
@@ -1075,6 +1177,14 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
 .v_w32:
     lea                 t2d, [hq+(0<<16)]
     mov                 t0d, 64
+    jmp .v_w32_start
+.v_w64:
+    lea                 t2d, [hq+(1<<16)]
+    mov                 t0d, 128
+    jmp .v_w32_start
+.v_w128:
+    lea                 t2d, [hq+(3<<16)]
+    mov                 t0d, 256
 .v_w32_start:
 %if ARCH_X86_64
  %if WIN64
@@ -1083,43 +1193,43 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     mov                  r7, tmpq
 %endif
     mov                  t1, srcq
-.v_w32_loop_h:
-    movu                 m0, [srcq+strideq*0+16*0] ; 0L
-    movu                 m1, [srcq+strideq*0+16*1] ; 0U
-.v_w32_loop_v:
-    movu                 m2, [srcq+strideq*1+16*0] ; 1L
-    movu                 m3, [srcq+strideq*1+16*1] ; 1U
+.v_w32_hloop:
+    movu                 m0, [srcq+strideq*0+16*0]
+    movu                 m1, [srcq+strideq*0+16*1]
+.v_w32_vloop:
+    movu                 m2, [srcq+strideq*1+16*0]
+    movu                 m3, [srcq+strideq*1+16*1]
     lea                srcq, [srcq+strideq*2]
     punpcklbw            m4, m2, m0
-    pmaddubsw            m4, m5
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq+16*0], m4
     punpckhbw            m4, m2, m0
-    pmaddubsw            m4, m5
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq+16*1], m4
     punpcklbw            m4, m3, m1
-    pmaddubsw            m4, m5
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq+16*2], m4
     punpckhbw            m4, m3, m1
-    pmaddubsw            m4, m5
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq+16*3], m4
     add                tmpq, t0q
-    movu                 m0, [srcq+strideq*0+16*0] ; 2L
-    movu                 m1, [srcq+strideq*0+16*1] ; 2U
+    movu                 m0, [srcq+strideq*0+16*0]
+    movu                 m1, [srcq+strideq*0+16*1]
     punpcklbw            m4, m0, m2
-    pmaddubsw            m4, m5
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq+16*0], m4
     punpckhbw            m4, m0, m2
-    pmaddubsw            m4, m5
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq+16*1], m4
     punpcklbw            m4, m1, m3
-    pmaddubsw            m4, m5
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq+16*2], m4
     punpckhbw            m4, m1, m3
-    pmaddubsw            m4, m5
+    PMADDUBSW            m4, m5, m6, m7, 0
     mova        [tmpq+16*3], m4
     add                tmpq, t0q
     sub                  hd, 2
-    jg .v_w32_loop_v
+    jg .v_w32_vloop
     movzx                hd, t2w
     add                  t1, 32
     mov                srcq, t1
@@ -1132,62 +1242,78 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     mov               tmpmp, tmpq
 %endif
     sub                 t2d, 1<<16
-    jg .v_w32_loop_h
+    jg .v_w32_hloop
 %if WIN64
     POP                  r7
 %endif
     RET
-.v_w64:
-    lea                 t2d, [hq+(1<<16)]
-    mov                 t0d, 128
-    jmp .v_w32_start
-.v_w128:
-    lea                 t2d, [hq+(3<<16)]
-    mov                 t0d, 256
-    jmp .v_w32_start
 .hv:
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
-    %assign stack_offset stack_offset - stack_size_padded
-    WIN64_SPILL_XMM       8
+%assign stack_offset stack_offset - stack_size_padded
+%if cpuflag(ssse3)
+    WIN64_SPILL_XMM 8
+%else
+    WIN64_SPILL_XMM 10
+%endif
     movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+%if cpuflag(ssse3)
     shl                mxyd, 11
-    movd                xm6, mxyd
+%else
+ %if ARCH_X86_64
+    mova                 m8, [pw_8]
+ %else
+  %define m8 [pw_8]
+ %endif
+    pxor                 m7, m7
+%endif
+    movd                 m6, mxyd
     add                  wq, t2
     pshuflw              m6, m6, q0000
+%if cpuflag(ssse3)
     punpcklqdq           m6, m6
+%else
+ %if ARCH_X86_64
+    psrlw                m0, m8, 3
+    punpcklwd            m6, m0
+ %else
+    punpcklwd            m6, [base+pw_1]
+ %endif
+%endif
 %if ARCH_X86_32
     mov                  t1, t2 ; save base reg for w4
 %endif
     lea            stride3q, [strideq*3]
     jmp                  wq
 .hv_w4:
-%if ARCH_X86_32
+%if cpuflag(ssse3)
+ %if ARCH_X86_32
     mova                 m4, [t1-prep_ssse3+bilin_h_shuf4]
-%else
+ %else
     mova                 m4, [bilin_h_shuf4]
+ %endif
 %endif
-    movq                 m0, [srcq+strideq*0] ; 0 _
-    punpcklqdq           m0, m0
-    pshufb               m0, m4
-    pmaddubsw            m0, m5
+    movhps               m0, [srcq+strideq*0]
+    PSHUFB_BILIN_H4      m0, m4, m3
+    PMADDUBSW            m0, m5, m7, m4, 0 ; _ 0
 .hv_w4_loop:
     movq                 m1, [srcq+strideq*1]
-    movhps               m1, [srcq+strideq*2] ; 1 _ 2 _
+    movhps               m1, [srcq+strideq*2]
     movq                 m2, [srcq+stride3q ]
     lea                srcq, [srcq+strideq*4]
-    movhps               m2, [srcq+strideq*0] ; 3 _ 4 _
-    pshufb               m1, m4
-    pshufb               m2, m4
-    pmaddubsw            m1, m5           ; 1 + 2 +
-    shufpd               m3, m0, m1, 0x01 ; 0 + 1 +
-    pmaddubsw            m0, m2, m5       ; 3 + 4 +
-    shufpd               m2, m1, m0, 0x01 ; 2 + 3 +
+    movhps               m2, [srcq+strideq*0]
+    PSHUFB_BILIN_H4      m1, m4, m3
+    PSHUFB_BILIN_H4      m2, m4, m3
+    PMADDUBSW            m1, m5, m7, m4, 0 ; 1 2
+    shufpd               m3, m0, m1, 0x01  ; 0 1
+    mova                 m0, m2
+    PMADDUBSW            m0, m5, m7, m4, 0 ; 3 4
+    shufpd               m2, m1, m0, 0x01  ; 2 3
     psubw                m1, m3
-    pmulhrsw             m1, m6
+    PMULHRSW             m1, m6, m4, m8, 4
     paddw                m1, m3
     psubw                m3, m0, m2
-    pmulhrsw             m3, m6
+    PMULHRSW             m3, m6, m4, m8, 4
     paddw                m3, m2
     mova        [tmpq+16*0], m1
     mova        [tmpq+16*1], m3
@@ -1196,46 +1322,74 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     jg .hv_w4_loop
     RET
 .hv_w8:
-    movu                 m0,     [srcq+strideq*0]
-    pshufb               m0, m4
-    pmaddubsw            m0, m5                   ; 0 +
+    movu                 m0, [srcq+strideq*0]
+    PSHUFB_BILIN_H8      m0, m4
+    PMADDUBSW            m0, m5, m7, m4, 0 ; 0
 .hv_w8_loop:
-    movu                 m1,     [srcq+strideq*1] ; 1
-    movu                 m2,     [srcq+strideq*2] ; 2
-    pshufb               m1, m4
-    pshufb               m2, m4
-    pmaddubsw            m1, m5 ; 1 +
-    pmaddubsw            m2, m5 ; 2 +
-    psubw                m3, m1, m0  ; 1-0
-    pmulhrsw             m3, m6
+    movu                 m1, [srcq+strideq*1]
+    movu                 m2, [srcq+strideq*2]
+    PSHUFB_BILIN_H8      m1, m4
+    PSHUFB_BILIN_H8      m2, m4
+    PMADDUBSW            m1, m5, m7, m4, 0 ; 1
+    PMADDUBSW            m2, m5, m7, m4, 0 ; 2
+    psubw                m3, m1, m0
+    PMULHRSW             m3, m6, m4, m8, 4
     paddw                m3, m0
-    psubw                m7, m2, m1  ; 2-1
-    pmulhrsw             m7, m6
+%if notcpuflag(ssse3) && ARCH_X86_64
+    SWAP                 m9, m7
+%endif
+    psubw                m7, m2, m1
+    PMULHRSW             m7, m6, m4, m8, 4
     paddw                m7, m1
     mova        [tmpq+16*0], m3
     mova        [tmpq+16*1], m7
-    movu                 m1,     [srcq+stride3q ] ; 3
-    lea                srcq,     [srcq+strideq*4]
-    movu                 m0,     [srcq+strideq*0] ; 4
-    pshufb               m1, m4
-    pshufb               m0, m4
-    pmaddubsw            m1, m5 ; 3 +
-    pmaddubsw            m0, m5 ; 4 +
-    psubw                m3, m1, m2  ; 3-2
-    pmulhrsw             m3, m6
+%if notcpuflag(ssse3) && ARCH_X86_64
+    SWAP                 m7, m9
+%endif
+    movu                 m1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    movu                 m0, [srcq+strideq*0]
+    PSHUFB_BILIN_H8      m1, m4
+    PSHUFB_BILIN_H8      m0, m4
+    PMADDUBSW            m1, m5, m7, m4, ARCH_X86_32 ; 3
+    PMADDUBSW            m0, m5, m7, m4, 0           ; 4
+    psubw                m3, m1, m2
+    PMULHRSW             m3, m6, m4, m8, 4
     paddw                m3, m2
-    psubw                m7, m0, m1  ; 4-3
-    pmulhrsw             m7, m6
+%if notcpuflag(ssse3) && ARCH_X86_64
+    SWAP                 m9, m7
+%endif
+    psubw                m7, m0, m1
+    PMULHRSW             m7, m6, m4, m8, 4
     paddw                m7, m1
     mova        [tmpq+16*2], m3
     mova        [tmpq+16*3], m7
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m7, m9
+ %else
+    pxor                 m7, m7
+ %endif
+%endif
     add                tmpq, 16*4
     sub                  hd, 4
     jg .hv_w8_loop
     RET
 .hv_w16:
-    lea                 t2d, [hq+(0<<16)]
+    mov                 t2d, hd
     mov                 t0d, 32
+    jmp .hv_w16_start
+.hv_w32:
+    lea                 t2d, [hq+(1<<16)]
+    mov                 t0d, 64
+    jmp .hv_w16_start
+.hv_w64:
+    lea                 t2d, [hq+(3<<16)]
+    mov                 t0d, 128
+    jmp .hv_w16_start
+.hv_w128:
+    lea                 t2d, [hq+(7<<16)]
+    mov                 t0d, 256
 .hv_w16_start:
 %if ARCH_X86_64
  %if WIN64
@@ -1244,47 +1398,47 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     mov                  r7, tmpq
 %endif
     mov                  t1, srcq
-.hv_w16_loop_h:
-    movu                 m0,     [srcq+strideq*0+8*0] ; 0L
-    movu                 m1,     [srcq+strideq*0+8*1] ; 0U
-    pshufb               m0, m4
-    pshufb               m1, m4
-    pmaddubsw            m0, m5      ; 0L +
-    pmaddubsw            m1, m5      ; 0U +
-.hv_w16_loop_v:
-    movu                 m2,     [srcq+strideq*1+8*0] ; 1L
-    pshufb               m2, m4
-    pmaddubsw            m2, m5      ; 1L +
-    psubw                m3, m2, m0  ; 1L-0L
-    pmulhrsw             m3, m6
+.hv_w16_hloop:
+    movu                 m0, [srcq+strideq*0+8*0]
+    movu                 m1, [srcq+strideq*0+8*1]
+    PSHUFB_BILIN_H8      m0, m4
+    PSHUFB_BILIN_H8      m1, m4
+    PMADDUBSW            m0, m5, m7, m4, 0 ; 0a
+    PMADDUBSW            m1, m5, m7, m4, 0 ; 0b
+.hv_w16_vloop:
+    movu                 m2, [srcq+strideq*1+8*0]
+    PSHUFB_BILIN_H8      m2, m4
+    PMADDUBSW            m2, m5, m7, m4, 0 ; 1a
+    psubw                m3, m2, m0
+    PMULHRSW             m3, m6, m4, m8, 4
     paddw                m3, m0
     mova        [tmpq+16*0], m3
-    movu                 m3,     [srcq+strideq*1+8*1] ; 1U
-    lea                srcq,     [srcq+strideq*2]
-    pshufb               m3, m4
-    pmaddubsw            m3, m5      ; 1U +
-    psubw                m0, m3, m1  ; 1U-0U
-    pmulhrsw             m0, m6
+    movu                 m3, [srcq+strideq*1+8*1]
+    lea                srcq, [srcq+strideq*2]
+    PSHUFB_BILIN_H8      m3, m4
+    PMADDUBSW            m3, m5, m7, m4, 0 ; 1b
+    psubw                m0, m3, m1
+    PMULHRSW             m0, m6, m4, m8, 4
     paddw                m0, m1
     mova        [tmpq+16*1], m0
     add                tmpq, t0q
-    movu                 m0,     [srcq+strideq*0+8*0] ; 2L
-    pshufb               m0, m4
-    pmaddubsw            m0, m5      ; 2L +
-    psubw                m1, m0, m2  ; 2L-1L
-    pmulhrsw             m1, m6
+    movu                 m0, [srcq+strideq*0+8*0]
+    PSHUFB_BILIN_H8      m0, m4
+    PMADDUBSW            m0, m5, m7, m4, 0 ; 2a
+    psubw                m1, m0, m2
+    PMULHRSW             m1, m6, m4, m8, 4
     paddw                m1, m2
     mova        [tmpq+16*0], m1
-    movu                 m1,     [srcq+strideq*0+8*1] ; 2U
-    pshufb               m1, m4
-    pmaddubsw            m1, m5      ; 2U +
-    psubw                m2, m1, m3  ; 2U-1U
-    pmulhrsw             m2, m6
+    movu                 m1, [srcq+strideq*0+8*1]
+    PSHUFB_BILIN_H8      m1, m4
+    PMADDUBSW            m1, m5, m7, m4, 0 ; 2b
+    psubw                m2, m1, m3
+    PMULHRSW             m2, m6, m4, m8, 4
     paddw                m2, m3
     mova        [tmpq+16*1], m2
     add                tmpq, t0q
     sub                  hd, 2
-    jg .hv_w16_loop_v
+    jg .hv_w16_vloop
     movzx                hd, t2w
     add                  t1, 16
     mov                srcq, t1
@@ -1297,23 +1451,12 @@ cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
     mov               tmpmp, tmpq
 %endif
     sub                 t2d, 1<<16
-    jg .hv_w16_loop_h
+    jg .hv_w16_hloop
 %if WIN64
     POP                  r7
 %endif
     RET
-.hv_w32:
-    lea                 t2d, [hq+(1<<16)]
-    mov                 t0d, 64
-    jmp .hv_w16_start
-.hv_w64:
-    lea                 t2d, [hq+(3<<16)]
-    mov                 t0d, 128
-    jmp .hv_w16_start
-.hv_w128:
-    lea                 t2d, [hq+(7<<16)]
-    mov                 t0d, 256
-    jmp .hv_w16_start
+%endmacro
 
 ; int8_t subpel_filters[5][15][8]
 %assign FILTER_REGULAR (0*15 << 16) | 3*15
@@ -2430,58 +2573,250 @@ cglobal put_8tap, 1, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
     jg .hv_w8_loop0
     RET
 
-%if ARCH_X86_32
-DECLARE_REG_TMP 1, 2
-%elif WIN64
-DECLARE_REG_TMP 6, 4
-%else
-DECLARE_REG_TMP 6, 7
-%endif
-%macro PREP_8TAP_FN 3 ; type, type_h, type_v
-cglobal prep_8tap_%1
-    mov                 t0d, FILTER_%2
-    mov                 t1d, FILTER_%3
-%ifnidn %1, sharp_smooth ; skip the jump in the last filter
-    jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
-%endif
+%macro PSHUFB_SUBPEL_H_4 5 ; dst/src1, src2/mask, tmp1, tmp2, reset_mask (1 = rebuild the mask in %2)
+ %if cpuflag(ssse3) ; SSSE3: %2 is used directly as the pshufb control
+    pshufb               %1, %2
+ %else ; no SSSE3: %2 is a dword select mask, %3/%4 are scratch
+  %if %5 == 1
+    pcmpeqd              %2, %2 ; %2 = all ones
+    psrlq                %2, 32 ; %2 = only the low dword of each qword set
+  %endif
+    psrldq               %3, %1, 1 ; tmp1 = src shifted right by one byte
+    pshufd               %3, %3, q2301 ; swap the two dwords inside each qword of tmp1
+    pand                 %1, %2 ; dst = src where the mask is set (low dwords with the mask built above)
+    pandn                %4, %2, %3 ; tmp2 = tmp1 where the mask is clear
+    por                  %1, %4 ; per qword: low dword = src bytes 0-3, high dword = src bytes 1-4
+ %endif
 %endmacro
 
-PREP_8TAP_FN regular,        REGULAR, REGULAR
-PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
-PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
-PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
-PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
-PREP_8TAP_FN sharp,          SHARP,   SHARP
-PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+%macro PSHUFB_SUBPEL_H_4a 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask; non-destructive form of PSHUFB_SUBPEL_H_4
+ %ifnidn %1, %2 ; copy only when dst and src1 are different operands
+    mova                 %1, %2
+ %endif
+    PSHUFB_SUBPEL_H_4    %1, %3, %4, %5, %6 ; shuffle dst in place
+%endmacro
 
-%if ARCH_X86_32
- %define base_reg r2
- %define base base_reg-prep_ssse3
- %define W32_RESTORE_SSQ mov strideq, stridem
-%else
- %define base_reg r7
- %define base 0
- %define W32_RESTORE_SSQ
-%endif
+%macro PSHUFB_SUBPEL_H_4b 6 ; dst, src1, src2/mask, tmp1, tmp2, reset_mask; like the 4a form, but pre-shifts the source without SSSE3
+ %if notcpuflag(ssse3) ; no SSSE3: the mask cannot encode an offset, so shift the data instead
+    psrlq                %1, %2, 16 ; dst = src1 with each qword shifted right by two bytes
+ %elifnidn %1, %2 ; SSSE3: plain copy when dst and src1 differ (offset comes from the control in %3)
+    mova                 %1, %2
+ %endif
+    PSHUFB_SUBPEL_H_4    %1, %3, %4, %5, %6 ; shuffle dst in place
+%endmacro
 
-cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
-%assign org_stack_offset stack_offset
-    imul                mxd, mxm, 0x010101
+%macro PALIGNR 4-5 ; dst, src1, src2, shift[, tmp]; palignr with a shift/or fallback when SSSE3 is unavailable
+ %if cpuflag(ssse3)
+    palignr              %1, %2, %3, %4
+ %else ; no SSSE3: needs one scratch register
+  %if %0 == 4 ; no tmp passed: scratch is the register numbered one above dst (it gets clobbered)
+   %assign %%i regnumof%+%1 + 1
+   %define %%tmp m %+ %%i
+  %else ; caller supplied the scratch register as %5
+   %define %%tmp %5
+  %endif
+    psrldq               %1, %3, %4 ; dst = src2 shifted right by 'shift' bytes
+    pslldq            %%tmp, %2, 16-%4 ; tmp = src1 shifted left by the remaining 16-shift bytes
+    por                  %1, %%tmp ; dst = bytes shift..shift+15 of the src1:src2 pair
+ %endif
+%endmacro
+
+%macro PHADDW 4 ; dst, src, pw_1/tmp, load_pw_1 (1 = fetch pw_1 from memory instead of trusting %3)
+ %if cpuflag(ssse3)
+    phaddw               %1, %2
+ %else ; no SSSE3: pair sums via pmaddwd by the pw_1 constant, then repack to words
+  %ifnidn %1, %2 ; dst and src are different registers
+   %if %4 == 1
+    mova                 %3, [pw_1] ; NOTE(review): absolute load, other loads nearby use [base+...] - confirm for 32-bit PIC
+   %endif
+    pmaddwd              %1, %3 ; dst: adjacent word pairs multiplied by %3 and summed into dwords
+    pmaddwd              %2, %3 ; same for src (src is overwritten, unlike phaddw)
+    packssdw             %1, %2 ; words: dst sums in the low half, src sums in the high half (signed saturation)
+  %else ; dst and src are the same register
+   %if %4 == 1
+    pmaddwd              %1, [pw_1] ; memory operand, %3 is left untouched
+   %else
+    pmaddwd              %1, %3
+   %endif
+    packssdw             %1, %1 ; the pair sums are duplicated in both halves
+  %endif
+ %endif
+%endmacro
+
+%macro PMULHRSW_POW2 4 ; dst, src1, src2, shift; src2 means different things on the two paths (see below)
+ %if cpuflag(ssse3) ; SSSE3: src2 is the pmulhrsw multiplier, shift is unused
+    pmulhrsw             %1, %2, %3
+ %else ; no SSSE3: src2 is the rounding term added before the arithmetic shift
+    paddw                %1, %2, %3 ; dst = src1 + src2
+    psraw                %1, %4 ; dst >>= shift (arithmetic)
+ %endif
+%endmacro
+
+%macro PMULHRSW_8192 3 ; dst, src1, src2; callers in this file pass pw_8192 with SSSE3 and pw_2 otherwise
+    PMULHRSW_POW2        %1, %2, %3, 2 ; pmulhrsw by 8192 is (x + 2) >> 2, hence shift 2 on the fallback path
+%endmacro
+
+%macro PREP_8TAP_H_LOAD4 5 ; dst, src_memloc, tmp[1-3]; gathers four overlapping dwords into dst
+   movd                  %1, [%2+0] ; dst  = 4 bytes at src+0
+   movd                  %3, [%2+1] ; tmp1 = 4 bytes at src+1
+   movd                  %4, [%2+2] ; tmp2 = 4 bytes at src+2
+   movd                  %5, [%2+3] ; tmp3 = 4 bytes at src+3
+   punpckldq             %1, %3 ; dst  = dwords {src+0, src+1}
+   punpckldq             %4, %5 ; tmp2 = dwords {src+2, src+3}
+   punpcklqdq            %1, %4 ; dst  = dwords {src+0, src+1, src+2, src+3}
+%endmacro
+
+%macro PREP_8TAP_H_LOAD 2 ; dst0 (register number, used as m%1), src_memloc; outputs in m%1, m2 and m3
+ %if cpuflag(ssse3) ; SSSE3: one unaligned load, then three shuffles (controls expected in m9-m11)
+    movu                m%1, [%2]
+    pshufb               m2, m%1, m11 ; subpel_h_shufB
+    pshufb               m3, m%1, m9  ; subpel_h_shufC
+    pshufb              m%1, m10      ; subpel_h_shufA
+ %else ; no SSSE3: build the same three vectors from overlapping dword loads
+  %if ARCH_X86_64 ; 16 registers: gather into m0-m11 in one go
+    SWAP                m12, m5 ; x86inc SWAP renames registers: apparently parks the caller's m5-m7
+    SWAP                m13, m6 ; under the names m12-m14 so the loads below
+    SWAP                m14, m7 ; into "m5".."m7" do not overwrite them
+   %define %%mx0 m%+%%i
+   %define %%mx1 m%+%%j
+   %assign %%i 0
+   %rep 12
+    movd              %%mx0, [%2+%%i] ; m<i> = 4 bytes at src+i, for i = 0..11
+    %assign %%i %%i+1
+   %endrep
+   %assign %%i 0
+   %rep 6
+    %assign %%j %%i+1
+    punpckldq         %%mx0, %%mx1 ; m<i> = dwords {src+i, src+i+1}, for i = 0,2,..,10
+    %assign %%i %%i+2
+   %endrep
+   %assign %%i 0
+   %rep 3
+    %assign %%j %%i+2
+    punpcklqdq        %%mx0, %%mx1 ; m<i> = dwords {src+i .. src+i+3}, for i = 0,4,8
+    %assign %%i %%i+4
+   %endrep
+    SWAP                m%1, m0 ; result for offsets 0-3 becomes m%1
+    SWAP                 m2, m4 ; result for offsets 4-7 becomes m2
+    SWAP                 m3, m8 ; result for offsets 8-11 becomes m3
+    SWAP                 m5, m12 ; undo the renaming done at the top
+    SWAP                 m6, m13
+    SWAP                 m7, m14
+  %else ; x86-32: only 8 registers, gather one vector at a time with m1/m4/m7 as scratch
+    PREP_8TAP_H_LOAD4    m0, %2+0, m1, m4, m7
+    PREP_8TAP_H_LOAD4    m2, %2+4, m1, m4, m7
+    PREP_8TAP_H_LOAD4    m3, %2+8, m1, m4, m7
+    SWAP                m%1, m0 ; first vector was built in m0; expose it as m%1
+  %endif
+ %endif
+%endmacro
+
+%macro PREP_8TAP_H 2 ; dst (register number, used as m%1), src_memloc; expects filter halves in m5/m6
+    PREP_8TAP_H_LOAD     %1, %2 ; m%1/m2/m3 = source windows A/B/C
+ %if ARCH_X86_64 && notcpuflag(ssse3) ; presumably keeps the caller's m1/m7 intact while PMADDUBSW uses those names as scratch
+    SWAP                 m8, m1
+    SWAP                 m9, m7
+ %endif
+ %xdefine mX m%+%1
+ %assign %%i regnumof%+mX
+ %define mX m%+%%i
+    mova                 m4, m2 ; second copy of B, since it is multiplied by both filter halves
+    PMADDUBSW            m4, m5, m1, m7, 1  ; subpel +0 B0
+    PMADDUBSW            m2, m6, m1, m7, 0  ; subpel +4 B4
+    PMADDUBSW            m3, m6, m1, m7, 0  ; subpel +4 C4
+    PMADDUBSW            mX, m5, m1, m7, 0  ; subpel +0 A0
+ %undef mX
+ %if ARCH_X86_64 && notcpuflag(ssse3) ; undo the renaming done before the multiplies
+    SWAP                 m1, m8
+    SWAP                 m7, m9
+ %endif
+    paddw                m3, m4 ; C4 + B0
+    paddw               m%1, m2 ; A0 + B4
+    PHADDW              m%1, m3, m15, ARCH_X86_32 ; pairwise add; on x86-32 the macro reloads pw_1 into m15 itself
+ %if ARCH_X86_64 || cpuflag(ssse3) ; m7 holds the rounding constant set up by the caller (pw_8192 or pw_2)
+    PMULHRSW_8192       m%1, m%1, m7
+ %else ; x86-32 without SSSE3: m7 is used as scratch above, so take pw_2 from memory
+    PMULHRSW_8192       m%1, m%1, [base+pw_2]
+ %endif
+%endmacro
+
+%macro PREP_8TAP_HV_LOAD 4 ; dst0, src_memloc, tmp[1-2]; outputs in %1, m2 and m3
+ %if cpuflag(ssse3) ; SSSE3: one load plus three shuffles (shufA/B/C are defined outside this macro)
+    movu                 %1, [%2]
+    pshufb               m2, %1, shufB
+    pshufb               m3, %1, shufC
+    pshufb               %1, shufA
+ %else ; no SSSE3: gather overlapping dwords, m1 plus the two tmps are scratch
+    PREP_8TAP_H_LOAD4    %1, %2+0, m1, %3, %4 ; %1 = dwords at src+0..3
+    PREP_8TAP_H_LOAD4    m2, %2+4, m1, %3, %4 ; m2 = dwords at src+4..7
+    PREP_8TAP_H_LOAD4    m3, %2+8, m1, %3, %4 ; m3 = dwords at src+8..11
+ %endif
+%endmacro
+
+%macro PREP_8TAP_HV 4 ; dst, src_memloc, tmp[1-2]; clobbers m1-m3, no final rounding here
+    PREP_8TAP_HV_LOAD %{1:4}
+    mova                 m1, m2 ; second copy of B (the +4 window), multiplied by both filter halves
+    PMADDUBSW            m1, subpelh0, %3, %4, 1 ; subpel +0 B0
+    PMADDUBSW            m3, subpelh1, %3, %4, 0 ; subpel +4 C4
+    PMADDUBSW            m2, subpelh1, %3, %4, 0 ; B4
+    PMADDUBSW            %1, subpelh0, %3, %4, 0 ; A0
+    paddw                m1, m3           ; B0+C4
+    paddw                %1, m2           ; A0+B4
+    PHADDW               %1, m1, %3, 1 ; pairwise add; without SSSE3 pw_1 is reloaded into %3 on every use
+%endmacro
+
+%macro PREP_8TAP_FN 3 ; type, type_h, type_v; emits the prep_8tap_<type> entry point
+cglobal prep_8tap_%1
+    mov                 t0d, FILTER_%2 ; horizontal filter constant, added to mx by the shared body
+    mov                 t1d, FILTER_%3 ; vertical filter constant, added to my by the shared body
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter (it is emitted right before prep_8tap and falls through)
+    jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
+%endif
+%endmacro
+
+%macro PREP_8TAP 0
+%if ARCH_X86_32
+ DECLARE_REG_TMP 1, 2
+%elif WIN64
+ DECLARE_REG_TMP 6, 4
+%else
+ DECLARE_REG_TMP 6, 7
+%endif
+PREP_8TAP_FN regular,        REGULAR, REGULAR
+PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_FN sharp,          SHARP,   SHARP
+PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
+
+%if ARCH_X86_32
+ %define base_reg r2
+ %define base base_reg-prep%+SUFFIX
+ %define W32_RESTORE_SSQ mov strideq, stridem
+%else
+ %define base_reg r7
+ %define base 0
+ %define W32_RESTORE_SSQ
+%endif
+cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+%assign org_stack_offset stack_offset
+    imul                mxd, mxm, 0x010101
     add                 mxd, t0d ; 8tap_h, mx, 4tap_h
     imul                myd, mym, 0x010101
     add                 myd, t1d ; 8tap_v, my, 4tap_v
     movsxd               wq, wm
     movifnidn          srcd, srcm
     movifnidn            hd, hm
-    LEA            base_reg, prep_ssse3
     test                mxd, 0xf00
     jnz .h
     test                myd, 0xf00
     jnz .v
+    LEA            base_reg, prep_ssse3
     tzcnt                wd, wd
-    movzx                wd, word [base_reg+wq*2+table_offset(prep,)]
+    movzx                wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
     add                  wq, base_reg
     movifnidn       strided, stridem
     lea                  r6, [strideq*3]
@@ -2492,25 +2827,49 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %endif
     jmp                  wq
 .h:
+    LEA            base_reg, prep%+SUFFIX
     test                myd, 0xf00
     jnz .hv
+%if cpuflag(ssse3)
     WIN64_SPILL_XMM      12
+%else
+    WIN64_SPILL_XMM      16
+%endif
     cmp                  wd, 4
     je .h_w4
     tzcnt                wd, wd
-%if ARCH_X86_64
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
     mova                m10, [base+subpel_h_shufA]
     mova                m11, [base+subpel_h_shufB]
     mova                 m9, [base+subpel_h_shufC]
+ %else
+  %define m10 [base+subpel_h_shufA]
+  %define m11 [base+subpel_h_shufB]
+  %define m9  [base+subpel_h_shufC]
+ %endif
 %endif
     shr                 mxd, 16
     sub                srcq, 3
     movzx                wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
-    movd                 m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0]
+    movd                 m5, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+0]
     pshufd               m5, m5, q0000
-    movd                 m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4]
+    movd                 m6, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+4]
     pshufd               m6, m6, q0000
+%if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
+%else
+    punpcklbw            m5, m5
+    punpcklbw            m6, m6
+    psraw                m5, 8
+    psraw                m6, 8
+ %if ARCH_X86_64
+    mova                 m7, [pw_2]
+    mova                m15, [pw_1]
+ %else
+  %define m15 m4
+ %endif
+%endif
     add                  wq, base_reg
     jmp                  wq
 .h_w4:
@@ -2520,39 +2879,115 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     movzx               mxd, mxb
 %endif
     dec                srcq
-    movd                 m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+    movd                 m4, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
     pshufd               m4, m4, q0000
+%if cpuflag(ssse3)
     mova                 m6, [base+pw_8192]
     mova                 m5, [base+subpel_h_shufA]
+%else
+    mova                 m6, [base+pw_2]
+ %if ARCH_X86_64
+    mova                m14, [pw_1]
+ %else
+  %define m14 m7
+ %endif
+    punpcklbw            m4, m4
+    psraw                m4, 8
+%endif
     W32_RESTORE_SSQ
 %if ARCH_X86_64
     lea            stride3q, [strideq*3]
 %endif
 .h_w4_loop:
+%if cpuflag(ssse3)
     movq                 m0, [srcq+strideq*0] ; 0
     movq                 m1, [srcq+strideq*1] ; 1
-%if ARCH_X86_32
+ %if ARCH_X86_32
     lea                srcq, [srcq+strideq*2]
     movq                 m2, [srcq+strideq*0] ; 2
     movq                 m3, [srcq+strideq*1] ; 3
     lea                srcq, [srcq+strideq*2]
-%else
+ %else
     movq                 m2, [srcq+strideq*2] ; 2
     movq                 m3, [srcq+stride3q ] ; 3
     lea                srcq, [srcq+strideq*4]
-%endif
-    pshufb               m0, m5 ; subpel_h_shufA
+ %endif
+    pshufb               m0, m5
     pshufb               m1, m5
     pshufb               m2, m5
     pshufb               m3, m5
-    pmaddubsw            m0, m4 ; subpel_filters + 2
-    pmaddubsw            m1, m4
-    pmaddubsw            m2, m4
-    pmaddubsw            m3, m4
-    phaddw               m0, m1
-    phaddw               m2, m3
-    pmulhrsw             m0, m6 ; pw_8192
-    pmulhrsw             m2, m6 ; pw_8192
+%else
+ %if ARCH_X86_64
+    movd                 m0, [srcq+strideq*0+0]
+    movd                m12, [srcq+strideq*0+1]
+    movd                 m1, [srcq+strideq*1+0]
+    movd                 m5, [srcq+strideq*1+1]
+    movd                 m2, [srcq+strideq*2+0]
+    movd                m13, [srcq+strideq*2+1]
+    movd                 m3, [srcq+stride3q +0]
+    movd                 m7, [srcq+stride3q +1]
+    punpckldq            m0, m12
+    punpckldq            m1, m5
+    punpckldq            m2, m13
+    punpckldq            m3, m7
+    movd                m12, [srcq+strideq*0+2]
+    movd                 m8, [srcq+strideq*0+3]
+    movd                 m5, [srcq+strideq*1+2]
+    movd                 m9, [srcq+strideq*1+3]
+    movd                m13, [srcq+strideq*2+2]
+    movd                m10, [srcq+strideq*2+3]
+    movd                 m7, [srcq+stride3q +2]
+    movd                m11, [srcq+stride3q +3]
+    lea                srcq, [srcq+strideq*4]
+    punpckldq           m12, m8
+    punpckldq            m5, m9
+    punpckldq           m13, m10
+    punpckldq            m7, m11
+    punpcklqdq           m0, m12 ; 0
+    punpcklqdq           m1, m5  ; 1
+    punpcklqdq           m2, m13 ; 2
+    punpcklqdq           m3, m7  ; 3
+ %else
+    movd                 m0, [srcq+strideq*0+0]
+    movd                 m1, [srcq+strideq*0+1]
+    movd                 m2, [srcq+strideq*0+2]
+    movd                 m3, [srcq+strideq*0+3]
+    punpckldq            m0, m1
+    punpckldq            m2, m3
+    punpcklqdq           m0, m2 ; 0
+    movd                 m1, [srcq+strideq*1+0]
+    movd                 m2, [srcq+strideq*1+1]
+    movd                 m3, [srcq+strideq*1+2]
+    movd                 m7, [srcq+strideq*1+3]
+    lea                srcq, [srcq+strideq*2]
+    punpckldq            m1, m2
+    punpckldq            m3, m7
+    punpcklqdq           m1, m3 ; 1
+    movd                 m2, [srcq+strideq*0+0]
+    movd                 m3, [srcq+strideq*0+1]
+    movd                 m7, [srcq+strideq*0+2]
+    movd                 m5, [srcq+strideq*0+3]
+    punpckldq            m2, m3
+    punpckldq            m7, m5
+    punpcklqdq           m2, m7 ; 2
+    movd                 m3, [srcq+strideq*1+0]
+    movd                 m7, [srcq+strideq*1+1]
+    punpckldq            m3, m7
+    movd                 m7, [srcq+strideq*1+2]
+    movd                 m5, [srcq+strideq*1+3]
+    lea                srcq, [srcq+strideq*2]
+    punpckldq            m7, m5
+    punpcklqdq           m3, m7 ; 3
+ %endif
+%endif
+    PMADDUBSW            m0, m4, m5, m7, 1 ; subpel_filters + 2
+    PMADDUBSW            m1, m4, m5, m7, 0
+    PMADDUBSW            m2, m4, m5, m7, 0
+    PMADDUBSW            m3, m4, m5, m7, 0
+    PHADDW               m0, m1, m14, ARCH_X86_32
+    PHADDW               m2, m3, m14, 0
+    PMULHRSW_8192        m0, m0, m6
+    PMULHRSW_8192        m2, m2, m6
     mova        [tmpq+16*0], m0
     mova        [tmpq+16*1], m2
     add                tmpq, 32
@@ -2560,55 +2995,41 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     jg .h_w4_loop
     RET
     ;
-%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3]
-%if ARCH_X86_32
-    pshufb               %2, %1, [base+subpel_h_shufB]
-    pshufb               %3, %1, [base+subpel_h_shufC]
-    pshufb               %1,     [base+subpel_h_shufA]
-%else
-    pshufb               %2, %1, m11; subpel_h_shufB
-    pshufb               %3, %1, m9 ; subpel_h_shufC
-    pshufb               %1, m10    ; subpel_h_shufA
-%endif
-    pmaddubsw            %4, %2, m5  ; subpel +0 B0
-    pmaddubsw            %2, m6      ; subpel +4 B4
-    pmaddubsw            %3, m6      ; subpel +4 C4
-    pmaddubsw            %1, m5      ; subpel +0 A0
-    paddw                %3, %4
-    paddw                %1, %2
-    phaddw               %1, %3
-    pmulhrsw             %1, m7      ; 8192
-%endmacro
-    ;
 .h_w8:
 %if ARCH_X86_32
     mov                  r3, r2
-    %define        base_reg  r3
+ %define           base_reg  r3
     W32_RESTORE_SSQ
 %endif
 .h_w8_loop:
-    movu                 m0,     [srcq+strideq*0]
-    movu                 m1,     [srcq+strideq*1]
-    lea                srcq,     [srcq+strideq*2]
-    PREP_8TAP_H          m0, m2, m3, m4
-    PREP_8TAP_H          m1, m2, m3, m4
+%if cpuflag(ssse3)
+    PREP_8TAP_H           0, srcq+strideq*0
+    PREP_8TAP_H           1, srcq+strideq*1
     mova        [tmpq+16*0], m0
     mova        [tmpq+16*1], m1
+    lea                srcq, [srcq+strideq*2]
     add                tmpq, 32
     sub                  hd, 2
+%else
+    PREP_8TAP_H           0, srcq
+    mova             [tmpq], m0
+    add                srcq, strideq
+    add                tmpq, 16
+    dec                  hd
+%endif
     jg .h_w8_loop
     RET
 .h_w16:
-    xor                 r6d, r6d
+    mov                  r6, -16*1
     jmp .h_start
 .h_w32:
-    mov                  r6, -16*1
+    mov                  r6, -16*2
     jmp .h_start
 .h_w64:
-    mov                  r6, -16*3
+    mov                  r6, -16*4
     jmp .h_start
 .h_w128:
-    mov                  r6, -16*7
+    mov                  r6, -16*8
 .h_start:
 %if ARCH_X86_32
     mov                  r3, r2
@@ -2618,15 +3039,20 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     mov                  r5, r6
     W32_RESTORE_SSQ
 .h_loop:
-    movu                 m0,     [srcq+r6+8*0]
-    movu                 m1,     [srcq+r6+8*1]
-    PREP_8TAP_H          m0, m2, m3, m4
-    PREP_8TAP_H          m1, m2, m3, m4
+%if cpuflag(ssse3)
+    PREP_8TAP_H           0, srcq+r6+8*0
+    PREP_8TAP_H           1, srcq+r6+8*1
     mova        [tmpq+16*0], m0
     mova        [tmpq+16*1], m1
     add                tmpq, 32
     add                  r6, 16
-    jle .h_loop
+%else
+    PREP_8TAP_H           0, srcq+r6
+    mova             [tmpq], m0
+    add                tmpq, 16
+    add                  r6, 8
+%endif
+    jl .h_loop
     add                srcq, strideq
     mov                  r6, r5
     dec                  hd
@@ -2635,8 +3061,9 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %if ARCH_X86_32
  %define            base_reg r2
 %endif
-
+    ;
 .v:
+    LEA            base_reg, prep%+SUFFIX
 %if ARCH_X86_32
     mov                 mxd, myd
     and                 mxd, 0x7f
@@ -2648,30 +3075,40 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     shr                 myd, 16
     cmp                  hd, 6
     cmovs               myd, mxd
-    lea                 myq, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    lea                 myq, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+%if cpuflag(ssse3)
     mova                 m2, [base+pw_512]
     psrlw                m2, m2, 1 ; 0x0100
     mova                 m7, [base+pw_8192]
+%endif
 %if ARCH_X86_32
  %define            subpel0  [rsp+mmsize*0]
  %define            subpel1  [rsp+mmsize*1]
  %define            subpel2  [rsp+mmsize*2]
  %define            subpel3  [rsp+mmsize*3]
 %assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+ %if cpuflag(ssse3)
     ALLOC_STACK   -mmsize*4
+ %else
+    ALLOC_STACK   -mmsize*5
+ %endif
 %assign regs_used 7
     movd                 m0, [myq+0]
-    pshufb               m0, m2
+    PSHUFB_0X1X          m0, m2
     mova            subpel0, m0
     movd                 m0, [myq+2]
-    pshufb               m0, m2
+    PSHUFB_0X1X          m0, m2
     mova            subpel1, m0
     movd                 m0, [myq+4]
-    pshufb               m0, m2
+    PSHUFB_0X1X          m0, m2
     mova            subpel2, m0
     movd                 m0, [myq+6]
-    pshufb               m0, m2
+    PSHUFB_0X1X          m0, m2
     mova            subpel3, m0
+ %if notcpuflag(ssse3)
+    mov                  r6, base_reg
+  %define base_reg r6
+ %endif
     mov             strideq, [rstk+stack_offset+gprsize*3]
     lea             strideq, [strideq*3]
     sub [rstk+stack_offset+gprsize*2], strideq
@@ -2683,25 +3120,30 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
  %define            subpel2  m10
  %define            subpel3  m11
     movd            subpel0, [myq+0]
-    pshufb          subpel0, m2
+    PSHUFB_0X1X     subpel0, m2
     movd            subpel1, [myq+2]
-    pshufb          subpel1, m2
+    PSHUFB_0X1X     subpel1, m2
     movd            subpel2, [myq+4]
-    pshufb          subpel2, m2
+    PSHUFB_0X1X     subpel2, m2
     movd            subpel3, [myq+6]
-    pshufb          subpel3, m2
+    PSHUFB_0X1X     subpel3, m2
     lea            stride3q, [strideq*3]
     sub                srcq, stride3q
     cmp                  wd, 8
-    jg .v_w16
-    je .v_w8
+    jns .v_w8
 %endif
 .v_w4:
-%if ARCH_X86_32
-%if STACK_ALIGNMENT < mmsize
- %define               srcm [rsp+mmsize*4+gprsize*1]
- %define               tmpm [rsp+mmsize*4+gprsize*2]
+%if notcpuflag(ssse3)
+    pxor                 m6, m6
+ %if ARCH_X86_64
+    mova                 m7, [base+pw_2]
+ %endif
 %endif
+%if ARCH_X86_32
+ %if STACK_ALIGNMENT < mmsize
+  %define srcm [esp+stack_size+gprsize*1]
+  %define tmpm [esp+stack_size+gprsize*2]
+ %endif
     mov                tmpm, tmpq
     mov                srcm, srcq
     lea                 r5d, [wq - 4] ; horizontal loop
@@ -2734,17 +3176,30 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %endif
     punpckldq            m3, m1           ; 4 5 _ _
     punpckldq            m1, m0           ; 5 6 _ _
-    palignr              m4, m3, m2, 4    ; 1 2 3 4
+    PALIGNR              m4, m3, m2, 4    ; 1 2 3 4
     punpcklbw            m3, m1           ; 45 56
     punpcklbw            m1, m2, m4       ; 01 12
     punpckhbw            m2, m4           ; 23 34
 .v_w4_loop:
-    pmaddubsw            m5, m1, subpel0  ; a0 b0
+%if ARCH_X86_32 && notcpuflag(ssse3)
+    mova                 m7, subpel0
+ %define subpel0 m7
+%endif
+    mova                 m5, m1
+    PMADDUBSW            m5, subpel0, m6, m4, 0  ; a0 b0
+%if ARCH_X86_32 && notcpuflag(ssse3)
+    mova                 m7, subpel1
+ %define subpel1 m7
+%endif
     mova                 m1, m2
-    pmaddubsw            m2, subpel1      ; a1 b1
+    PMADDUBSW            m2, subpel1, m6, m4, 0  ; a1 b1
     paddw                m5, m2
+%if ARCH_X86_32 && notcpuflag(ssse3)
+    mova                 m7, subpel2
+ %define subpel2 m7
+%endif
     mova                 m2, m3
-    pmaddubsw            m3, subpel2      ; a2 b2
+    PMADDUBSW            m3, subpel2, m6, m4, 0  ; a2 b2
     paddw                m5, m3
     movd                 m4, [srcq+strideq*0]
     punpckldq            m3, m0, m4       ; 6 7 _ _
@@ -2752,9 +3207,27 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     lea                srcq, [srcq+strideq*2]
     punpckldq            m4, m0           ; 7 8 _ _
     punpcklbw            m3, m4           ; 67 78
-    pmaddubsw            m4, m3, subpel3  ; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m12, m0
+ %else
+    mova     [esp+mmsize*4], m0
+    mova                 m7, subpel3
+  %define subpel3 m7
+ %endif
+%endif
+    mova                 m4, m3
+    PMADDUBSW            m4, subpel3, m6, m0, 0  ; a3 b3
     paddw                m5, m4
-    pmulhrsw             m5, m7
+%if ARCH_X86_64 || cpuflag(ssse3)
+ %if notcpuflag(ssse3)
+    SWAP                 m0, m12
+ %endif
+    PMULHRSW_8192        m5, m5, m7
+%else
+    mova                 m0, [esp+mmsize*4]
+    PMULHRSW_8192        m5, m5, [base+pw_2]
+%endif
     movq        [tmpq+wq*0], m5
     movhps      [tmpq+wq*2], m5
     lea                tmpq, [tmpq+wq*4]
@@ -2772,26 +3245,28 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     jg .v_w4_loop0
 %endif
     RET
-
+%if ARCH_X86_32 && notcpuflag(ssse3)
+ %define base_reg r2
+%endif
+    ;
 %if ARCH_X86_64
 .v_w8:
-.v_w16:
     lea                 r5d, [wq - 8] ; horizontal loop
     mov                  r8, tmpq
     mov                  r6, srcq
     shl                 r5d, 8 - 3; (wq / 8) << 8
     mov                 r5b, hb
 .v_w8_loop0:
-    movq                 m4, [srcq+strideq*0]   ; 0
-    movq                 m5, [srcq+strideq*1]   ; 1
+    movq                 m4, [srcq+strideq*0]
+    movq                 m5, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    movq                 m6, [srcq+strideq*0]   ; 2
-    movq                 m0, [srcq+strideq*1]   ; 3
+    movq                 m6, [srcq+strideq*0]
+    movq                 m0, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movq                 m1, [srcq+strideq*0]
+    movq                 m2, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    movq                 m1, [srcq+strideq*0]   ; 4
-    movq                 m2, [srcq+strideq*1]   ; 5
-    lea                srcq, [srcq+strideq*2]   ;
-    movq                 m3, [srcq+strideq*0]   ; 6
+    movq                 m3, [srcq+strideq*0]
     shufpd               m4, m0, 0x0c
     shufpd               m5, m1, 0x0c
     punpcklbw            m1, m4, m5 ; 01
@@ -2803,9 +3278,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     punpcklbw            m3, m6, m0 ; 23
     punpckhbw            m6, m0     ; 56
 .v_w8_loop:
-    movq                m12, [srcq+strideq*1]   ; 8
+%if cpuflag(ssse3)
+    movq                m12, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
-    movq                m13, [srcq+strideq*0]   ; 9
+    movq                m13, [srcq+strideq*0]
     pmaddubsw           m14, m1, subpel0 ; a0
     pmaddubsw           m15, m2, subpel0 ; b0
     mova                 m1, m3
@@ -2830,8 +3306,43 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     paddw               m15, m13
     pmulhrsw            m14, m7
     pmulhrsw            m15, m7
-    movu        [tmpq+wq*0], xm14
-    movu        [tmpq+wq*2], xm15
+    movu        [tmpq+wq*0], m14
+    movu        [tmpq+wq*2], m15
+%else
+    mova                m14, m1
+    PMADDUBSW           m14, subpel0, m7, m12, 1 ; a0
+    mova                 m1, m3
+    PMADDUBSW            m3, subpel1, m7, m12, 0 ; a1
+    paddw               m14, m3
+    mova                 m3, m5
+    PMADDUBSW            m5, subpel2, m7, m12, 0 ; a2
+    paddw               m14, m5
+    movq                m12, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movq                m13, [srcq+strideq*0]
+    shufpd              m15, m0, m12, 0x0d
+    shufpd               m0, m12, m13, 0x0c
+    punpcklbw            m5, m15, m0  ; 67
+    punpckhbw           m15, m0       ; 78
+    mova                m13, m5
+    PMADDUBSW           m13, subpel3, m7, m12, 0 ; a3
+    paddw               m14, m13
+    PMULHRSW_8192       m14, m14, [base+pw_2]
+    movu        [tmpq+wq*0], m14
+    mova                m14, m2
+    PMADDUBSW           m14, subpel0, m7, m12, 0 ; b0
+    mova                 m2, m4
+    PMADDUBSW            m4, subpel1, m7, m12, 0 ; b1
+    paddw               m14, m4
+    mova                 m4, m6
+    PMADDUBSW            m6, subpel2, m7, m12, 0 ; b2
+    paddw               m14, m6
+    mova                 m6, m15
+    PMADDUBSW           m15, subpel3, m7, m12, 0 ; b3
+    paddw               m14, m15
+    PMULHRSW_8192       m14, m14, [base+pw_2]
+    movu        [tmpq+wq*2], m14
+%endif
     lea                tmpq, [tmpq+wq*4]
     sub                  hd, 2
     jg .v_w8_loop
@@ -2848,20 +3359,20 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %undef subpel1
 %undef subpel2
 %undef subpel3
-
+    ;
 .hv:
     %assign stack_offset org_stack_offset
     cmp                  wd, 4
     jg .hv_w8
     and                 mxd, 0x7f
-    movd                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2]
+    movd                 m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX+2]
 %if ARCH_X86_32
     mov                 mxd, myd
     shr                 myd, 16
     and                 mxd, 0x7f
     cmp                  hd, 6
     cmovs               myd, mxd
-    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
     mov                  r5, r2; use as new base
  %define           base_reg  r5
  %assign regs_used 2
@@ -2877,7 +3388,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
  %define           subpelv2  [rsp+mmsize*2]
  %define           subpelv3  [rsp+mmsize*3]
     punpcklbw            m0, m0
-    psraw                m0, 8 ; sign-extend
+    psraw                m0, 8
     pshufd               m6, m0, q0000
     mova           subpelv0, m6
     pshufd               m6, m0, q1111
@@ -2891,8 +3402,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     shr                 myd, 16
     cmp                  hd, 6
     cmovs               myd, mxd
-    movq                 m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    movq                 m0, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
+ %if cpuflag(ssse3)
     ALLOC_STACK   mmsize*14, 14
+ %else
+    ALLOC_STACK   mmsize*14, 16
+ %endif
     lea            stride3q, [strideq*3]
     sub                srcq, stride3q
     dec                srcq
@@ -2901,8 +3416,12 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
  %define           subpelv2  m12
  %define           subpelv3  m13
     punpcklbw            m0, m0
-    psraw                m0, 8 ; sign-extend
+    psraw                m0, 8
+ %if cpuflag(ssse3)
     mova                 m8, [base+pw_8192]
+ %else
+    mova                 m8, [base+pw_2]
+ %endif
     mova                 m9, [base+pd_32]
     pshufd              m10, m0, q0000
     pshufd              m11, m0, q1111
@@ -2910,7 +3429,10 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     pshufd              m13, m0, q3333
 %endif
     pshufd               m7, m1, q0000
-.hv_w4:
+%if notcpuflag(ssse3)
+    punpcklbw            m7, m7
+    psraw                m7, 8
+%endif
 %define hv4_line_0_0 4
 %define hv4_line_0_1 5
 %define hv4_line_0_2 6
@@ -2921,17 +3443,27 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %define hv4_line_1_1 11
 %define hv4_line_1_2 12
 %define hv4_line_1_3 13
-    ;
-    ;
 %if ARCH_X86_32
- %define           w8192reg  [base+pw_8192]
+ %if cpuflag(ssse3)
+  %define           w8192reg  [base+pw_8192]
+ %else
+  %define           w8192reg  [base+pw_2]
+ %endif
  %define             d32reg  [base+pd_32]
 %else
  %define           w8192reg  m8
  %define             d32reg  m9
 %endif
     ; lower shuffle 0 1 2 3 4
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4]
+%else
+ %if ARCH_X86_64
+    mova                m15, [pw_1]
+ %else
+  %define               m15 m1
+ %endif
+%endif
     movq                 m5, [srcq+strideq*0]   ; 0 _ _ _
     movhps               m5, [srcq+strideq*1]   ; 0 _ 1 _
     movq                 m4, [srcq+strideq*2]   ; 2 _ _ _
@@ -2944,43 +3476,61 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     movhps               m4, [srcq+stride3q ]   ; 2 _ 3 _
     lea                srcq, [srcq+strideq*4]
 %endif
-    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
-    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
-    pmaddubsw            m2, m7 ;H subpel_filters
-    pmaddubsw            m0, m7 ;H subpel_filters
-    phaddw               m2, m0 ;H 0 1 2 3
-    pmulhrsw             m2, w8192reg ;H pw_8192
+    PSHUFB_SUBPEL_H_4a   m2, m5, m6, m1, m3, 1    ;H subpel_h_shuf4 0~1~
+    PSHUFB_SUBPEL_H_4a   m0, m4, m6, m1, m3, 0    ;H subpel_h_shuf4 2~3~
+    PMADDUBSW            m2, m7, m1, m3, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m3, 0        ;H subpel_filters
+    PHADDW               m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+    PMULHRSW_8192        m2, m2, w8192reg
     SAVELINE_W4          m2, 2, 0
     ; upper shuffle 2 3 4 5 6
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4+16]
-    pshufb               m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
-    pshufb               m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
-    pmaddubsw            m2, m7 ;H subpel_filters
-    pmaddubsw            m0, m7 ;H subpel_filters
-    phaddw               m2, m0 ;H 0 1 2 3
-    pmulhrsw             m2, w8192reg ;H pw_8192
-    ;
+%endif
+    PSHUFB_SUBPEL_H_4b   m2, m5, m6, m1, m3, 0    ;H subpel_h_shuf4 0~1~
+    PSHUFB_SUBPEL_H_4b   m0, m4, m6, m1, m3, 0    ;H subpel_h_shuf4 2~3~
+    PMADDUBSW            m2, m7, m1, m3, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m3, 0        ;H subpel_filters
+    PHADDW               m2, m0, m15, ARCH_X86_32 ;H 0 1 2 3
+    PMULHRSW_8192        m2, m2, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m14, m2
+ %else
+    mova     [esp+mmsize*4], m2
+ %endif
+%endif
     ; lower shuffle
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4]
+%endif
     movq                 m5, [srcq+strideq*0]   ; 4 _ _ _
     movhps               m5, [srcq+strideq*1]   ; 4 _ 5 _
     movq                 m4, [srcq+strideq*2]   ; 6 _ _ _
-    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
-    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
-    pmaddubsw            m3, m7 ;H subpel_filters
-    pmaddubsw            m0, m7 ;H subpel_filters
-    phaddw               m3, m0 ;H 4 5 6 7
-    pmulhrsw             m3, w8192reg ;H pw_8192
+    PSHUFB_SUBPEL_H_4a   m3, m5, m6, m1, m2, 0    ;H subpel_h_shuf4 4~5~
+    PSHUFB_SUBPEL_H_4a   m0, m4, m6, m1, m2, 0    ;H subpel_h_shuf4 6~6~
+    PMADDUBSW            m3, m7, m1, m2, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m2, 0        ;H subpel_filters
+    PHADDW               m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+    PMULHRSW_8192        m3, m3, w8192reg
     SAVELINE_W4          m3, 3, 0
     ; upper shuffle
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4+16]
-    pshufb               m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
-    pshufb               m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
-    pmaddubsw            m3, m7 ;H subpel_filters
-    pmaddubsw            m0, m7 ;H subpel_filters
-    phaddw               m3, m0 ;H 4 5 6 7
-    pmulhrsw             m3, w8192reg ;H pw_8192
-    ;
+%endif
+    PSHUFB_SUBPEL_H_4b   m3, m5, m6, m1, m2, 0    ;H subpel_h_shuf4 4~5~
+    PSHUFB_SUBPEL_H_4b   m0, m4, m6, m1, m2, 0    ;H subpel_h_shuf4 6~6~
+    PMADDUBSW            m3, m7, m1, m2, 1        ;H subpel_filters
+    PMADDUBSW            m0, m7, m1, m2, 0        ;H subpel_filters
+    PHADDW               m3, m0, m15, ARCH_X86_32 ;H 4 5 6 7
+    PMULHRSW_8192        m3, m3, w8192reg
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m2, m14
+ %else
+    mova                 m2, [esp+mmsize*4]
+ %endif
+%endif
 %if ARCH_X86_32
     lea                srcq, [srcq+strideq*2]
     add                srcq, strideq
@@ -2988,7 +3538,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     add                srcq, stride3q
 %endif
     ;process high
-    palignr              m4, m3, m2, 4;V 1 2 3 4
+    PALIGNR              m4, m3, m2, 4;V 1 2 3 4
     punpcklwd            m1, m2, m4  ; V 01 12
     punpckhwd            m2, m4      ; V 23 34
     pshufd               m0, m3, q2121;V 5 6 5 6
@@ -3000,7 +3550,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     ;process low
     RESTORELINE_W4       m2, 2, 0
     RESTORELINE_W4       m3, 3, 0
-    palignr              m4, m3, m2, 4;V 1 2 3 4
+    PALIGNR              m4, m3, m2, 4;V 1 2 3 4
     punpcklwd            m1, m2, m4  ; V 01 12
     punpckhwd            m2, m4      ; V 23 34
     pshufd               m0, m3, q2121;V 5 6 5 6
@@ -3014,18 +3564,35 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     mova                 m2, m3
     pmaddwd              m3, subpelv2; V a2 b2
     paddd                m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m14, m5
+ %else
+    mova     [esp+mmsize*4], m5
+  %define m15 m3
+ %endif
+%endif
     ;
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4]
+%endif
     movq                 m4, [srcq+strideq*0] ; 7
     movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
-    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
-    pmaddubsw            m4, m7 ;H subpel_filters
-    phaddw               m4, m4 ;H                7 8 7 8
-    pmulhrsw             m4, w8192reg ;H pw_8192
-    palignr              m3, m4, m0, 12         ; 6 7 8 7
+    PSHUFB_SUBPEL_H_4a   m4, m4, m6, m3, m5, 0    ; H subpel_h_shuf4 7~8~
+    PMADDUBSW            m4, m7, m3, m5, 1        ; H subpel_filters
+    PHADDW               m4, m4, m15, ARCH_X86_32 ; H                7878
+    PMULHRSW_8192        m4, m4, w8192reg
+    PALIGNR              m3, m4, m0, 12, m5       ;                  6787
     mova                 m0, m4
     punpcklwd            m3, m4      ; 67 78
     pmaddwd              m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m5, m14
+ %else
+    mova                 m5, [esp+mmsize*4]
+ %endif
+%endif
     paddd                m5, d32reg ; pd_32
     paddd                m5, m4
     psrad                m5, 6
@@ -3046,18 +3613,34 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     mova                 m2, m3
     pmaddwd              m3, subpelv2; V a2 b2
     paddd                m5, m3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                m14, m5
+ %else
+    mova         [esp+0xA0], m5
+ %endif
+%endif
     ;
+%if cpuflag(ssse3)
     mova                 m6, [base+subpel_h_shuf4+16]
+%endif
     movq                 m4, [srcq+strideq*0] ; 7
     movhps               m4, [srcq+strideq*1] ; 7 _ 8 _
-    pshufb               m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
-    pmaddubsw            m4, m7 ;H subpel_filters
-    phaddw               m4, m4 ;H                7 8 7 8
-    pmulhrsw             m4, w8192reg ;H pw_8192
-    palignr              m3, m4, m0, 12         ; 6 7 8 7
+    PSHUFB_SUBPEL_H_4b   m4, m4, m6, m3, m5, 0    ; H subpel_h_shuf4 7~8~
+    PMADDUBSW            m4, m7, m3, m5, 1        ; H subpel_filters
+    PHADDW               m4, m4, m15, ARCH_X86_32 ; H                7878
+    PMULHRSW_8192        m4, m4, w8192reg
+    PALIGNR              m3, m4, m0, 12, m5       ;                  6787
     mova                 m0, m4
     punpcklwd            m3, m4      ; 67 78
     pmaddwd              m4, m3, subpelv3; a3 b3
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m5, m14
+ %else
+    mova                 m5, [esp+0xA0]
+ %endif
+%endif
     paddd                m5, d32reg ; pd_32
     paddd                m5, m4
     psrad                m4, m5, 6
@@ -3084,8 +3667,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %undef subpelv2
 %undef subpelv3
     ;
-
-
 .hv_w8:
     %assign stack_offset org_stack_offset
 %define hv8_line_1 0
@@ -3104,27 +3685,35 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
  %define           subpelv3  [rsp+mmsize*10]
  %define             accuv0  [rsp+mmsize*11]
  %define             accuv1  [rsp+mmsize*12]
-    movq                 m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+    movq                 m1, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
     mov                 mxd, myd
     shr                 myd, 16
     and                 mxd, 0x7f
     cmp                  hd, 6
     cmovs               myd, mxd
-    movq                 m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    movq                 m5, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
     ALLOC_STACK  -mmsize*13
-%if STACK_ALIGNMENT < mmsize
+ %if STACK_ALIGNMENT < mmsize
     mov                rstk, r2m
- %define               tmpm  [rsp+mmsize*13+gprsize*1]
- %define               srcm  [rsp+mmsize*13+gprsize*2]
- %define            stridem  [rsp+mmsize*13+gprsize*3]
+  %define               tmpm  [rsp+mmsize*13+gprsize*1]
+  %define               srcm  [rsp+mmsize*13+gprsize*2]
+  %define            stridem  [rsp+mmsize*13+gprsize*3]
     mov             stridem, rstk
-%endif
+ %endif
     mov                  r6, r2
-%define base_reg r6
+ %define base_reg r6
     pshufd               m0, m1, q0000
     pshufd               m1, m1, q1111
     punpcklbw            m5, m5
-    psraw                m5, 8 ; sign-extend
+ %if notcpuflag(ssse3)
+    punpcklbw            m0, m0
+    punpcklbw            m1, m1
+ %endif
+    psraw                m5, 8
+ %if notcpuflag(ssse3)
+    psraw                m0, 8
+    psraw                m1, 8
+ %endif
     pshufd               m2, m5, q0000
     pshufd               m3, m5, q1111
     pshufd               m4, m5, q2222
@@ -3151,20 +3740,31 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
  %define           subpelv3  m15
  %define             accuv0  m8
  %define             accuv1  m9
-    movq                 m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+    movq                 m0, [base_reg+mxq*8+subpel_filters-prep%+SUFFIX]
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 6
     cmovs               myd, mxd
-    movq                 m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
+    movq                 m1, [base_reg+myq*8+subpel_filters-prep%+SUFFIX]
     pshufd         subpelh0, m0, q0000
     pshufd         subpelh1, m0, q1111
     punpcklbw            m1, m1
-    psraw                m1, 8 ; sign-extend
+ %if notcpuflag(ssse3)
+    punpcklbw      subpelh0, subpelh0
+    punpcklbw      subpelh1, subpelh1
+ %endif
+    psraw                m1, 8
+ %if notcpuflag(ssse3)
+    psraw          subpelh0, 8
+    psraw          subpelh1, 8
+ %endif
     pshufd         subpelv0, m1, q0000
     pshufd         subpelv1, m1, q1111
     pshufd         subpelv2, m1, q2222
     pshufd         subpelv3, m1, q3333
+ %if notcpuflag(ssse3)
+    mova                 m7, [base+pw_2]
+ %endif
     lea                stride3q, [strideq*3]
     sub                srcq, 3
     sub                srcq, stride3q
@@ -3179,57 +3779,89 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     shl                 r5d, (16 - 2)
     mov                 r5w, hw
 .hv_w8_loop0:
-    movu                 m4, [srcq+strideq*0] ; 0 = _ _
-    movu                 m5, [srcq+strideq*1] ; 1 = _ _
-    lea                srcq, [srcq+strideq*2]
-%if ARCH_X86_64
+%if cpuflag(ssse3)
+ %if ARCH_X86_64
     mova                 m7, [base+subpel_h_shufA]
     mova                 m8, [base+subpel_h_shufB]
     mova                 m9, [base+subpel_h_shufC]
+  %define shufA m7
+  %define shufB m8
+  %define shufC m9
+ %else
+  %define shufA [base+subpel_h_shufA]
+  %define shufB [base+subpel_h_shufB]
+  %define shufC [base+subpel_h_shufC]
+ %endif
 %endif
-    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
-    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
-    movu                 m6, [srcq+strideq*0] ; 2 = _ _
-    movu                 m0, [srcq+strideq*1] ; 3 = _ _
+    PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
+    PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
     lea                srcq, [srcq+strideq*2]
-    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
-    HV_H_W8              m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
-    ;
+%if notcpuflag(ssse3)
+ %if ARCH_X86_64
+    SWAP                 m9, m4
+ %else
+    mova              [esp], m4
+ %endif
+%endif
+    PREP_8TAP_HV         m6, srcq+strideq*0, m7, m4
+    PREP_8TAP_HV         m0, srcq+strideq*1, m7, m4
+    lea                srcq, [srcq+strideq*2]
+%if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
-    pmulhrsw             m4, m7 ; H pw_8192
-    pmulhrsw             m5, m7 ; H pw_8192
-    pmulhrsw             m6, m7 ; H pw_8192
-    pmulhrsw             m0, m7 ; H pw_8192
-    punpcklwd            m1, m4, m5  ; 0 1 ~
-    punpcklwd            m2, m5, m6  ; 1 2 ~
-    punpcklwd            m3, m6, m0  ; 2 3 ~
+%else
+    mova                 m7, [base+pw_2]
+ %if ARCH_X86_64
+    SWAP                 m4, m9
+ %else
+    mova                 m4, [esp]
+ %endif
+%endif
+    PMULHRSW_8192        m4, m4, m7
+    PMULHRSW_8192        m5, m5, m7
+    PMULHRSW_8192        m6, m6, m7
+    PMULHRSW_8192        m0, m0, m7
+    punpcklwd            m1, m4, m5 ; 01
+    punpcklwd            m2, m5, m6 ; 12
+    punpcklwd            m3, m6, m0 ; 23
     SAVELINE_W8           1, m1
     SAVELINE_W8           2, m2
     SAVELINE_W8           3, m3
-    ;
+%if cpuflag(ssse3)
     mova                 m7, [base+subpel_h_shufA]
-    movu                 m4, [srcq+strideq*0]       ; 4 = _ _
-    movu                 m5, [srcq+strideq*1]       ; 5 = _ _
+%else
+ %if ARCH_X86_64
+    SWAP                 m8, m7
+    SWAP                 m9, m0
+ %else
+    mova         [esp+0x30], m0
+ %endif
+%endif
+    PREP_8TAP_HV         m4, srcq+strideq*0, m7, m0
+    PREP_8TAP_HV         m5, srcq+strideq*1, m7, m0
+    PREP_8TAP_HV         m6, srcq+strideq*2, m7, m0
     lea                srcq, [srcq+strideq*2]
-    movu                 m6, [srcq+strideq*0]       ; 6 = _ _
-    HV_H_W8              m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
-    HV_H_W8              m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
-    HV_H_W8              m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+%if cpuflag(ssse3)
     mova                 m7, [base+pw_8192]
-    pmulhrsw             m1, m4, m7 ; H pw_8192 4 ~
-    pmulhrsw             m2, m5, m7 ; H pw_8192 5 ~
-    pmulhrsw             m3, m6, m7 ; H pw_8192 6 ~
-    punpcklwd            m4, m0, m1  ; 3 4 ~
-    punpcklwd            m5, m1, m2  ; 4 5 ~
-    punpcklwd            m6, m2, m3  ; 5 6 ~
-    ;
+%else
+ %if ARCH_X86_64
+    SWAP                 m0, m9
+    SWAP                 m7, m8
+ %else
+    mova                 m0, [esp+0x30]
+    mova                 m7, [base+pw_2]
+ %endif
+%endif
+    PMULHRSW_8192        m1, m4, m7
+    PMULHRSW_8192        m2, m5, m7
+    PMULHRSW_8192        m3, m6, m7
+    punpcklwd            m4, m0, m1 ; 34
+    punpcklwd            m5, m1, m2 ; 45
+    punpcklwd            m6, m2, m3 ; 56
     SAVELINE_W8           6, m3
     RESTORELINE_W8        1, m1
     RESTORELINE_W8        2, m2
     RESTORELINE_W8        3, m3
 .hv_w8_loop:
-    ; m8 accu for V a
-    ; m9 accu for V b
     SAVELINE_W8           1, m3
     SAVELINE_W8           2, m4
     SAVELINE_W8           3, m5
@@ -3246,46 +3878,53 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     paddd                m0, m5
     paddd                m7, m6
     mova                 m5, [base+pd_32]
-    paddd                m0, m5 ;   pd_512
-    paddd                m7, m5 ;   pd_512
+    paddd                m0, m5
+    paddd                m7, m5
     mova             accuv0, m0
     mova             accuv1, m7
 %else
-    pmaddwd              m8, m1, subpelv0 ; a0
-    pmaddwd              m9, m2, subpelv0 ; b0
+    pmaddwd          accuv0, m1, subpelv0 ; a0
+    pmaddwd          accuv1, m2, subpelv0 ; b0
     pmaddwd              m3, subpelv1     ; a1
     pmaddwd              m4, subpelv1     ; b1
-    paddd                m8, m3
-    paddd                m9, m4
+    paddd            accuv0, m3
+    paddd            accuv1, m4
     pmaddwd              m5, subpelv2     ; a2
     pmaddwd              m6, subpelv2     ; b2
-    paddd                m8, m5
-    paddd                m9, m6
+    paddd            accuv0, m5
+    paddd            accuv1, m6
     mova                 m7, [base+pd_32]
-    paddd                m8, m7 ;   pd_512
-    paddd                m9, m7 ;   pd_512
+    paddd            accuv0, m7
+    paddd            accuv1, m7
+ %if cpuflag(ssse3)
     mova                 m7, [base+subpel_h_shufB]
     mova                 m6, [base+subpel_h_shufC]
     mova                 m5, [base+subpel_h_shufA]
+  %define shufA m5
+  %define shufB m7
+  %define shufC m6
+ %endif
 %endif
-    movu                 m0, [srcq+strideq*1] ; 7
-    movu                 m4, [srcq+strideq*2] ; 8
+    PREP_8TAP_HV         m0, srcq+strideq*1, m5, m6
+    PREP_8TAP_HV         m4, srcq+strideq*2, m5, m6
     lea                srcq, [srcq+strideq*2]
-    HV_H_W8              m0, m1, m2, m3, m5, m7, m6
-    HV_H_W8              m4, m1, m2, m3, m5, m7, m6
+%if cpuflag(ssse3)
     mova                 m5, [base+pw_8192]
-    pmulhrsw             m0, m5 ; H pw_8192
-    pmulhrsw             m4, m5 ; H pw_8192
+%else
+    mova                 m5, [base+pw_2]
+%endif
+    PMULHRSW_8192        m0, m0, m5
+    PMULHRSW_8192        m4, m4, m5
     RESTORELINE_W8        6, m6
-    punpcklwd            m5, m6, m0  ; 6 7  ~
-    punpcklwd            m6, m0, m4  ; 7 8 ~
+    punpcklwd            m5, m6, m0 ; 67
+    punpcklwd            m6, m0, m4 ; 78
     pmaddwd              m1, m5, subpelv3 ; a3
     paddd                m2, m1, accuv0
     pmaddwd              m1, m6, subpelv3 ; b3
-    paddd                m1, m1, accuv1 ; H + V
+    paddd                m1, m1, accuv1
     psrad                m2, 6
     psrad                m1, 6
-    packssdw             m2, m1      ; d -> w
+    packssdw             m2, m1
     movq        [tmpq+wq*0], m2
     movhps      [tmpq+wq*2], m2
     lea                tmpq, [tmpq+wq*4]
@@ -3314,6 +3953,7 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
     sub                 r5d, 1<<16
     jg .hv_w8_loop0
     RET
+%endmacro
 
 %if ARCH_X86_32
  %macro SAVE_ALPHA_BETA 0
@@ -3384,7 +4024,6 @@ cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
 %endmacro
 
 %macro WARP_V 10 ; dst0, dst1, 0, 2, 4, 6, 1, 3, 5, 7
-    ; Can be done using gathers, but that's terribly slow on many CPU:s
  %if ARCH_X86_32
   %define m8  m4
   %define m9  m5
@@ -4022,20 +4661,6 @@ ALIGN function_align
     ret
 %endmacro
 
-INIT_XMM sse4
-WARP_AFFINE_8X8
-WARP_AFFINE_8X8T
-
-INIT_XMM ssse3
-WARP_AFFINE_8X8
-WARP_AFFINE_8X8T
-
-INIT_XMM sse2
-WARP_AFFINE_8X8
-WARP_AFFINE_8X8T
-
-INIT_XMM ssse3
-
 %if WIN64
 DECLARE_REG_TMP 6, 4
 %else
@@ -4872,8 +5497,6 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
     mov                  r1, r1m
   %endif
 %if %1
-    test           leftextq, leftextq
-    jz .body_%3
     ; left extension
   %if ARCH_X86_64
     movd                 m0, [srcq]
@@ -4889,7 +5512,6 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
     cmp                  r3, leftextq
     jl .left_loop_%3
     ; body
-.body_%3:
     lea             reg_tmp, [dstq+leftextq]
 %endif
     xor                  r3, r3
@@ -4910,13 +5532,6 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
     jl .body_loop_%3
 %if %2
     ; right extension
-  %if ARCH_X86_64
-    test          rightextq, rightextq
-  %else
-    mov                  r1, r3m
-    test                 r1, r1
-  %endif
-    jz .body_loop_end_%3
 %if %1
     add             reg_tmp, centerwq
 %else
@@ -4939,7 +5554,6 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
     cmp                  r3, r3m
   %endif
     jl .right_loop_%3
-.body_loop_end_%3:
 %endif
   %if ARCH_X86_64
     add                dstq, dstrideq
@@ -5081,3 +5695,241 @@ cglobal emu_edge, 10, 13, 2, bw, bh, iw, ih, x, \
 %undef reg_dstride
 %undef reg_blkm
 %undef reg_tmp
+
+cextern resize_filter
+
+%macro SCRATCH 3 ; src reg index, dst reg index, stack slot index (slot used on x86-32 only)
+%if ARCH_X86_32
+    mova [rsp+%3*mmsize], m%1 ; x86-32: spill src to its stack slot, then alias the dst name to that slot
+%define m%2 [rsp+%3*mmsize]
+%else
+    SWAP             %1, %2
+%endif
+%endmacro
+
+%if ARCH_X86_64
+cglobal resize, 0, 14, 16, dst, dst_stride, src, src_stride, \
+                           dst_w, h, src_w, dx, mx0
+%elif STACK_ALIGNMENT >= 16
+cglobal resize, 0, 7, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+                                 dst_w, h, src_w, dx, mx0
+%else
+cglobal resize, 0, 6, 8, 3 * 16, dst, dst_stride, src, src_stride, \
+                                 dst_w, h, src_w, dx, mx0
+%endif
+    movifnidn          dstq, dstmp
+    movifnidn          srcq, srcmp
+%if STACK_ALIGNMENT >= 16
+    movifnidn        dst_wd, dst_wm
+%endif
+%if ARCH_X86_64
+    movifnidn            hd, hm
+%endif
+    sub          dword mx0m, 4<<14
+    sub        dword src_wm, 8
+    movd                 m7, dxm
+    movd                 m6, mx0m
+    movd                 m5, src_wm
+    pshufd               m7, m7, q0000
+    pshufd               m6, m6, q0000
+    pshufd               m5, m5, q0000
+
+%if ARCH_X86_64
+    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+    LEA                  r7, $$
+%define base r7-$$
+%else
+    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
+%if STACK_ALIGNMENT >= 16
+    LEA                  r6, $$
+%define base r6-$$
+%else
+    LEA                  r4, $$
+%define base r4-$$
+%endif
+%endif
+
+%if ARCH_X86_64
+    mova                m12, [base+pw_m256]
+    mova                m11, [base+pd_63]
+    mova                m10, [base+pb_8x0_8x8]
+%else
+%define m12 [base+pw_m256]
+%define m11 [base+pd_63]
+%define m10 [base+pb_8x0_8x8]
+%endif
+    pmaddwd              m4, m7, [base+resize_mul]  ; dx*[0,1,2,3]
+    pslld                m7, 2                      ; dx*4
+    pslld                m5, 14
+    paddd                m6, m4                     ; mx+[0..3]*dx
+    SCRATCH               7, 15, 0
+    SCRATCH               6, 14, 1
+    SCRATCH               5, 13, 2
+
+    ; m15 = dx*4, m14 = mx+[0..3]*dx, m13 = (src_w-8)<<14 (stack slots on x86-32)
+    ; m12 = pw_m256 (pmulhrsw constant for x=(x+64)>>7), m11 = pd_63, m10 = pb_8x0_8x8
+
+.loop_y:
+    xor                  xd, xd
+    mova                 m0, m14                    ; per-line working version of mx
+
+.loop_x:
+    pxor                 m1, m1
+    pcmpgtd              m1, m0
+    pandn                m1, m0
+    psrad                m2, m0, 8                  ; filter offset (unmasked)
+    pcmpgtd              m3, m13, m1
+    pand                 m1, m3
+    pandn                m3, m13
+    por                  m1, m3
+    psubd                m3, m0, m1                 ; pshufb offset
+    psrad                m1, 14                     ; clipped src_x offset
+    psrad                m3, 14                     ; pshufb edge_emu offset
+    pand                 m2, m11                    ; filter offset (masked)
+
+    ; load source pixels
+%if ARCH_X86_64
+    movd                r8d, xm1
+    pshuflw             xm1, xm1, q3232
+    movd                r9d, xm1
+    punpckhqdq          xm1, xm1
+    movd               r10d, xm1
+    psrlq               xm1, 32
+    movd               r11d, xm1
+    movq                xm4, [srcq+r8]
+    movq                xm5, [srcq+r10]
+    movhps              xm4, [srcq+r9]
+    movhps              xm5, [srcq+r11]
+%else
+    movd                r3d, xm1
+    pshufd              xm1, xm1, q3312
+    movd                r1d, xm1
+    pshuflw             xm1, xm1, q3232
+    movq                xm4, [srcq+r3]
+    movq                xm5, [srcq+r1]
+    movd                r3d, xm1
+    punpckhqdq          xm1, xm1
+    movd                r1d, xm1
+    movhps              xm4, [srcq+r3]
+    movhps              xm5, [srcq+r1]
+%endif
+
+    ; if no emulation is required, we don't need to shuffle or emulate edges
+    ; this also saves 2 quasi-vpgatherdqs
+    pxor                 m6, m6
+    pcmpeqb              m6, m3
+%if ARCH_X86_64
+    pmovmskb            r8d, m6
+    cmp                 r8d, 0xffff
+%else
+    pmovmskb            r3d, m6
+    cmp                 r3d, 0xffff
+%endif
+    je .filter
+
+%if ARCH_X86_64
+    movd                r8d, xm3
+    pshuflw             xm3, xm3, q3232
+    movd                r9d, xm3
+    punpckhqdq          xm3, xm3
+    movd               r10d, xm3
+    psrlq               xm3, 32
+    movd               r11d, xm3
+    movsxd               r8, r8d
+    movsxd               r9, r9d
+    movsxd              r10, r10d
+    movsxd              r11, r11d
+    movq                xm6, [base+resize_shuf+4+r8]
+    movq                xm7, [base+resize_shuf+4+r10]
+    movhps              xm6, [base+resize_shuf+4+r9]
+    movhps              xm7, [base+resize_shuf+4+r11]
+%else
+    movd                r3d, xm3
+    pshufd              xm3, xm3, q3312
+    movd                r1d, xm3
+    pshuflw             xm3, xm3, q3232
+    movq                xm6, [base+resize_shuf+4+r3]
+    movq                xm7, [base+resize_shuf+4+r1]
+    movd                r3d, xm3
+    punpckhqdq          xm3, xm3
+    movd                r1d, xm3
+    movhps              xm6, [base+resize_shuf+4+r3]
+    movhps              xm7, [base+resize_shuf+4+r1]
+%endif
+
+    paddb                m6, m10
+    paddb                m7, m10
+    pshufb               m4, m6
+    pshufb               m5, m7
+
+.filter:
+%if ARCH_X86_64
+    movd                r8d, xm2
+    pshuflw             xm2, xm2, q3232
+    movd                r9d, xm2
+    punpckhqdq          xm2, xm2
+    movd               r10d, xm2
+    psrlq               xm2, 32
+    movd               r11d, xm2
+    movq                xm6, [base+resize_filter+r8*8]
+    movq                xm7, [base+resize_filter+r10*8]
+    movhps              xm6, [base+resize_filter+r9*8]
+    movhps              xm7, [base+resize_filter+r11*8]
+%else
+    movd                r3d, xm2
+    pshufd              xm2, xm2, q3312
+    movd                r1d, xm2
+    pshuflw             xm2, xm2, q3232
+    movq                xm6, [base+resize_filter+r3*8]
+    movq                xm7, [base+resize_filter+r1*8]
+    movd                r3d, xm2
+    punpckhqdq          xm2, xm2
+    movd                r1d, xm2
+    movhps              xm6, [base+resize_filter+r3*8]
+    movhps              xm7, [base+resize_filter+r1*8]
+%endif
+
+    pmaddubsw            m4, m6
+    pmaddubsw            m5, m7
+    phaddw               m4, m5
+    phaddsw              m4, m4
+    pmulhrsw             m4, m12                    ; x=(x+64)>>7
+    packuswb             m4, m4
+    movd          [dstq+xq], m4
+
+    paddd                m0, m15
+    add                  xd, 4
+%if STACK_ALIGNMENT >= 16
+    cmp                  xd, dst_wd
+%else
+    cmp                  xd, dst_wm
+%endif
+    jl .loop_x
+
+%if ARCH_X86_64
+    add                dstq, dst_strideq
+    add                srcq, src_strideq
+    dec                  hd
+%else
+    add                dstq, dst_stridem
+    add                srcq, src_stridem
+    dec           dword r5m
+%endif
+    jg .loop_y
+    RET
+
+INIT_XMM ssse3
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse4
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
+
+INIT_XMM sse2
+PREP_BILIN
+PREP_8TAP
+WARP_AFFINE_8X8
+WARP_AFFINE_8X8T
diff --git a/ffmpeg/JNI/dav1d/src/x86/msac.asm b/ffmpeg/JNI/dav1d/src/x86/msac.asm
index f67871483..756e19b4b 100644
--- a/ffmpeg/JNI/dav1d/src/x86/msac.asm
+++ b/ffmpeg/JNI/dav1d/src/x86/msac.asm
@@ -157,7 +157,7 @@ cglobal msac_decode_symbol_adapt4, 0, 6, 6
     mov [t7+msac.rng], t2d
     not            t4
     sub           t1d, ecx
-    jge .end ; no refill required
+    jae .end ; no refill required
 
 ; refill:
     mov            t2, [t7+msac.buf]
@@ -504,7 +504,7 @@ cglobal msac_decode_bool, 0, 6, 0
     mov [t7+msac.rng], t2d
     not            t4
     sub           t5d, ecx
-    jge %%end
+    jae %%end
     mov            t2, [t7+msac.buf]
     mov           rcx, [t7+msac.end]
 %if UNIX64 == 0
diff --git a/ffmpeg/JNI/dav1d/src/x86/msac_init.c b/ffmpeg/JNI/dav1d/src/x86/msac_init.c
index a9dafc757..a634da27c 100644
--- a/ffmpeg/JNI/dav1d/src/x86/msac_init.c
+++ b/ffmpeg/JNI/dav1d/src/x86/msac_init.c
@@ -28,6 +28,7 @@
 #include "src/msac.h"
 #include "src/x86/msac.h"
 
+#if ARCH_X86_64
 void dav1d_msac_init_x86(MsacContext *const s) {
     const unsigned flags = dav1d_get_cpu_flags();
 
@@ -39,4 +40,4 @@ void dav1d_msac_init_x86(MsacContext *const s) {
         s->symbol_adapt16 = dav1d_msac_decode_symbol_adapt16_avx2;
     }
 }
-
+#endif
diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_32.S b/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_32.S
index 0c4e31f40..a186ef8fc 100644
--- a/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_32.S
+++ b/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_32.S
@@ -48,6 +48,8 @@ error_message_gpr:
         .asciz "failed to preserve register r%d"
 error_message_vfp:
         .asciz "failed to preserve register d%d"
+error_message_stack:
+        .asciz "failed to preserve stack"
 endconst
 
 @ max number of args used by any asm function.
@@ -55,8 +57,9 @@ endconst
 
 #define ARG_STACK 4*(MAX_ARGS - 4)
 
-@ align the used stack space to 8 to preserve the stack alignment
-#define ARG_STACK_A (((ARG_STACK + pushed + 7) & ~7) - pushed)
+@ Align the used stack space to 8 to preserve the stack alignment.
+@ +8 for stack canary reference.
+#define ARG_STACK_A (((ARG_STACK + pushed + 7) & ~7) - pushed + 8)
 
 .macro clobbercheck variant
 .equ pushed, 4*9
@@ -83,14 +86,37 @@ function checked_call_\variant, export=1
 .equ pos, pos + 4
 .endr
 
+        @ For stack overflows, the callee is free to overwrite the parameters
+        @ that were passed on the stack (if any), so we can only check after
+        @ that point. First figure out how many parameters the function
+        @ really took on the stack:
+        ldr             r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)]
+        @ Load the first non-parameter value from the stack, that should be
+        @ left untouched by the function. Store a copy of it inverted, so that
+        @ e.g. overwriting everything with zero would be noticed.
+        ldr             r12, [sp, r12, lsl #2]
+        mvn             r12, r12
+        str             r12, [sp, #ARG_STACK_A - 4]
+
         mov             r12, r0
         mov             r0,  r2
         mov             r1,  r3
         ldrd            r2,  r3,  [sp, #ARG_STACK_A + pushed]
+        @ Call the target function
         blx             r12
-        add             sp,  sp,  #ARG_STACK_A
 
+        @ Load the number of stack parameters, stack canary and its reference
+        ldr             r12, [sp, #ARG_STACK_A + pushed + 8 + 4*(MAX_ARGS-4)]
+        ldr             r2,  [sp, r12, lsl #2]
+        ldr             r3,  [sp, #ARG_STACK_A - 4]
+
+        add             sp,  sp,  #ARG_STACK_A
         push            {r0, r1}
+
+        mvn             r3,  r3
+        cmp             r2,  r3
+        bne             5f
+
         movrel          r12, register_init
 .ifc \variant, vfp
 .macro check_reg_vfp, dreg, offset
@@ -144,6 +170,9 @@ function checked_call_\variant, export=1
 .purgem check_reg
 
         b               0f
+5:
+        movrel          r0, error_message_stack
+        b               1f
 4:
         movrel          r0, error_message_vfp
         b               1f
@@ -154,9 +183,9 @@ function checked_call_\variant, export=1
         movrel          r0, error_message_gpr
 1:
 #ifdef PREFIX
-        blx             _checkasm_fail_func
+        bl              _checkasm_fail_func
 #else
-        blx             checkasm_fail_func
+        bl              checkasm_fail_func
 #endif
 0:
         pop             {r0, r1}
diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_64.S b/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_64.S
index 11a342389..25749145a 100644
--- a/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_64.S
+++ b/ffmpeg/JNI/dav1d/tests/checkasm/arm/checkasm_64.S
@@ -53,8 +53,10 @@ const register_init, align=4
 endconst
 
 
-const error_message
+const error_message_register
         .asciz "failed to preserve register"
+error_message_stack:
+        .asciz "stack clobbered"
 endconst
 
 
@@ -74,7 +76,8 @@ function stack_clobber, export=1
         ret
 endfunc
 
-#define ARG_STACK ((8*(MAX_ARGS - 8) + 15) & ~15)
+// (MAX_ARGS - 8) 8-byte slots rounded up to a multiple of 16, + 16 for stack canary reference; grouped explicitly as '&' vs '+' precedence differs between GNU as and C
+#define ARG_STACK (((8*(MAX_ARGS - 8) + 15) & ~15) + 16)
 
 function checked_call, export=1
         stp             x29, x30, [sp, #-16]!
@@ -109,22 +112,56 @@ function checked_call, export=1
 .equ pos, pos + 8
 .endr
 
+        // Fill x8-x17 with garbage. This doesn't have to be preserved,
+        // but avoids relying on them having any particular value.
+        movrel          x9, register_init
+        ldp             x10, x11, [x9], #32
+        ldp             x12, x13, [x9], #32
+        ldp             x14, x15, [x9], #32
+        ldp             x16, x17, [x9], #32
+        ldp             x8,  x9,  [x9]
+
+        // For stack overflows, the callee is free to overwrite the parameters
+        // that were passed on the stack (if any), so we can only check after
+        // that point. First figure out how many parameters the function
+        // really took on the stack:
+        ldr             w2,  [x29, #16 + 8*8 + (MAX_ARGS-8)*8]
+        // Load the first non-parameter value from the stack, that should be
+        // left untouched by the function. Store a copy of it inverted, so that
+        // e.g. overwriting everything with zero would be noticed.
+        ldr             x2,  [sp, x2, lsl #3]
+        mvn             x2,  x2
+        str             x2,  [sp, #ARG_STACK-8]
+
+        // Load the in-register arguments
         mov             x12, x0
         ldp             x0,  x1,  [x29, #16]
         ldp             x2,  x3,  [x29, #32]
         ldp             x4,  x5,  [x29, #48]
         ldp             x6,  x7,  [x29, #64]
+        // Call the target function
         blr             x12
+
+        // Load the number of stack parameters, stack canary and its reference
+        ldr             w2,  [x29, #16 + 8*8 + (MAX_ARGS-8)*8]
+        ldr             x2,  [sp, x2, lsl #3]
+        ldr             x3,  [sp, #ARG_STACK-8]
+
         add             sp,  sp,  #ARG_STACK
         stp             x0,  x1,  [sp, #-16]!
+
+        mvn             x3,  x3
+        cmp             x2,  x3
+        b.ne            2f
+
         movrel          x9, register_init
         movi            v3.8h,  #0
 
 .macro check_reg_neon reg1, reg2
-        ldr             q0,  [x9], #16
-        uzp1            v1.2d,  v\reg1\().2d, v\reg2\().2d
-        eor             v0.16b, v0.16b, v1.16b
-        orr             v3.16b, v3.16b, v0.16b
+        ldr             q1,  [x9], #16
+        uzp1            v2.2d,  v\reg1\().2d, v\reg2\().2d
+        eor             v1.16b, v1.16b, v2.16b
+        orr             v3.16b, v3.16b, v1.16b
 .endm
         check_reg_neon  8,  9
         check_reg_neon  10, 11
@@ -148,7 +185,11 @@ function checked_call, export=1
 
         cbz             x3,  0f
 
-        movrel          x0, error_message
+        movrel          x0, error_message_register
+        b               1f
+2:
+        movrel          x0, error_message_stack
+1:
 #ifdef PREFIX
         bl              _checkasm_fail_func
 #else
diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.c b/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.c
index de9d733e0..ee52c8969 100644
--- a/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.c
+++ b/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.c
@@ -125,7 +125,7 @@ typedef struct CheckasmFunc {
     struct CheckasmFunc *child[2];
     CheckasmFuncVersion versions;
     uint8_t color; /* 0 = red, 1 = black */
-    char name[1];
+    char name[];
 } CheckasmFunc;
 
 /* Internal state */
@@ -142,7 +142,7 @@ static struct {
     unsigned cpu_flag;
     const char *cpu_flag_name;
     const char *test_name;
-    unsigned int seed;
+    unsigned seed;
     int bench_c;
     int verbose;
     int function_listing;
@@ -159,7 +159,7 @@ typedef union {
 
 static uint32_t xs_state[4];
 
-static void xor128_srand(unsigned int seed) {
+static void xor128_srand(unsigned seed) {
     xs_state[0] = seed;
     xs_state[1] = ( seed & 0xffff0000) | (~seed & 0x0000ffff);
     xs_state[2] = (~seed & 0xffff0000) | ( seed & 0x0000ffff);
@@ -335,15 +335,15 @@ static int cmp_nop(const void *a, const void *b) {
 /* Measure the overhead of the timing code (in decicycles) */
 static int measure_nop_time(void) {
     uint16_t nops[10000];
-    int i, nop_sum = 0;
+    int nop_sum = 0;
 
-    for (i = 0; i < 10000; i++) {
+    for (int i = 0; i < 10000; i++) {
         uint64_t t = readtime();
         nops[i] = (uint16_t) (readtime() - t);
     }
 
     qsort(nops, 10000, sizeof(uint16_t), cmp_nop);
-    for (i = 2500; i < 7500; i++)
+    for (int i = 2500; i < 7500; i++)
         nop_sum += nops[i];
 
     return nop_sum / 500;
@@ -359,8 +359,8 @@ static void print_benchs(const CheckasmFunc *const f) {
             const CheckasmFuncVersion *v = &f->versions;
             do {
                 if (v->iterations) {
-                    int decicycles = (int) (10*v->cycles/v->iterations -
-                                            state.nop_time) / 4;
+                    const int decicycles = (int) (10*v->cycles/v->iterations -
+                                                  state.nop_time) / 4;
                     printf("%s_%s: %d.%d\n", f->name, cpu_suffix(v->cpu),
                            decicycles/10, decicycles%10);
                 }
@@ -413,7 +413,7 @@ static CheckasmFunc *rotate_tree(CheckasmFunc *const f, const int dir) {
 #define is_red(f) ((f) && !(f)->color)
 
 /* Balance a left-leaning red-black tree at the specified node */
-static void balance_tree(CheckasmFunc **root) {
+static void balance_tree(CheckasmFunc **const root) {
     CheckasmFunc *const f = *root;
 
     if (is_red(f->child[0]) && is_red(f->child[1])) {
@@ -427,12 +427,12 @@ static void balance_tree(CheckasmFunc **root) {
 }
 
 /* Get a node with the specified name, creating it if it doesn't exist */
-static CheckasmFunc *get_func(CheckasmFunc **root, const char *const name) {
+static CheckasmFunc *get_func(CheckasmFunc **const root, const char *const name) {
     CheckasmFunc *f = *root;
 
     if (f) {
         /* Search the tree for a matching node */
-        int cmp = cmp_func_names(name, f->name);
+        const int cmp = cmp_func_names(name, f->name);
         if (cmp) {
             f = get_func(&f->child[cmp > 0], name);
 
@@ -442,9 +442,9 @@ static CheckasmFunc *get_func(CheckasmFunc **root, const char *const name) {
         }
     } else {
         /* Allocate and insert a new node into the tree */
-        const size_t name_length = strlen(name);
-        f = *root = checkasm_malloc(sizeof(CheckasmFunc) + name_length);
-        memcpy(f->name, name, name_length + 1);
+        const size_t name_length = strlen(name) + 1;
+        f = *root = checkasm_malloc(offsetof(CheckasmFunc, name) + name_length);
+        memcpy(f->name, name, name_length);
     }
 
     return f;
@@ -559,28 +559,29 @@ int main(int argc, char *argv[]) {
         } else if (!strcmp(argv[1], "--verbose") || !strcmp(argv[1], "-v")) {
             state.verbose = 1;
         } else {
-            state.seed = (unsigned int) strtoul(argv[1], NULL, 10);
+            state.seed = (unsigned) strtoul(argv[1], NULL, 10);
         }
 
         argc--;
         argv++;
     }
 
-    fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
-
     dav1d_init_cpu();
+
+    if (!state.function_listing) {
+        fprintf(stderr, "checkasm: using random seed %u\n", state.seed);
 #if ARCH_X86_64
-    void checkasm_warmup_avx2(void);
-    void checkasm_warmup_avx512(void);
-    unsigned cpu_flags = dav1d_get_cpu_flags();
-    if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
-        state.simd_warmup = checkasm_warmup_avx512;
-    else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
-        state.simd_warmup = checkasm_warmup_avx2;
-    else
-        state.simd_warmup = NULL;
-    checkasm_simd_warmup();
+        void checkasm_warmup_avx2(void);
+        void checkasm_warmup_avx512(void);
+        const unsigned cpu_flags = dav1d_get_cpu_flags();
+        if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX512ICL)
+            state.simd_warmup = checkasm_warmup_avx512;
+        else if (cpu_flags & DAV1D_X86_CPU_FLAG_AVX2)
+            state.simd_warmup = checkasm_warmup_avx2;
+        checkasm_simd_warmup();
 #endif
+    }
+
     check_cpu_flag(NULL, 0);
 
     if (state.function_listing) {
diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.h b/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.h
index c5191e242..27c28d7d2 100644
--- a/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.h
+++ b/ffmpeg/JNI/dav1d/tests/checkasm/checkasm.h
@@ -201,23 +201,33 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
  * those registers to keep them powered on. */
 void checkasm_simd_warmup(void);
 #define declare_new(ret, ...)\
-    ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__) =\
+    ret (*checked_call)(void *, int, int, int, int, int, __VA_ARGS__,\
+                        int, int, int, int, int, int, int, int,\
+                        int, int, int, int, int, int, int) =\
     (void *)checkasm_checked_call;
 #define CLOB (UINT64_C(0xdeadbeefdeadbeef))
+#ifdef _WIN32
+#define STACKARGS 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0
+#else
+#define STACKARGS 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0
+#endif
 #define call_new(...)\
     (checkasm_set_signal_handler_state(1),\
      checkasm_simd_warmup(),\
      checkasm_stack_clobber(CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                             CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                             CLOB, CLOB, CLOB, CLOB, CLOB, CLOB, CLOB),\
-     checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__));\
+     checked_call(func_new, 0, 0, 0, 0, 0, __VA_ARGS__, STACKARGS));\
     checkasm_set_signal_handler_state(0)
 #elif ARCH_X86_32
 #define declare_new(ret, ...)\
-    ret (*checked_call)(void *, __VA_ARGS__) = (void *)checkasm_checked_call;
+    ret (*checked_call)(void *, __VA_ARGS__, int, int, int, int, int, int,\
+                        int, int, int, int, int, int, int, int, int) =\
+        (void *)checkasm_checked_call;
 #define call_new(...)\
     (checkasm_set_signal_handler_state(1),\
-     checked_call(func_new, __VA_ARGS__));\
+     checked_call(func_new, __VA_ARGS__, 15, 14, 13, 12,\
+                  11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1));\
     checkasm_set_signal_handler_state(0)
 #elif ARCH_ARM
 /* Use a dummy argument, to offset the real parameters by 2, not only 1.
@@ -225,17 +235,20 @@ void checkasm_simd_warmup(void);
  * the same even when the extra parameters have been removed. */
 void checkasm_checked_call_vfp(void *func, int dummy, ...);
 #define declare_new(ret, ...)\
-    ret (*checked_call)(void *, int dummy, __VA_ARGS__) =\
+    ret (*checked_call)(void *, int dummy, __VA_ARGS__,\
+                        int, int, int, int, int, int, int, int,\
+                        int, int, int, int, int, int, int) =\
     (void *)checkasm_checked_call_vfp;
 #define call_new(...)\
     (checkasm_set_signal_handler_state(1),\
-     checked_call(func_new, 0, __VA_ARGS__));\
+     checked_call(func_new, 0, __VA_ARGS__, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0));\
     checkasm_set_signal_handler_state(0)
 #elif ARCH_AARCH64 && !defined(__APPLE__)
 void checkasm_stack_clobber(uint64_t clobber, ...);
 #define declare_new(ret, ...)\
     ret (*checked_call)(void *, int, int, int, int, int, int, int,\
-                        __VA_ARGS__) =\
+                        __VA_ARGS__, int, int, int, int, int, int, int, int,\
+                        int, int, int, int, int, int, int) =\
     (void *)checkasm_checked_call;
 #define CLOB (UINT64_C(0xdeadbeefdeadbeef))
 #define call_new(...)\
@@ -244,7 +257,8 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
                             CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                             CLOB, CLOB, CLOB, CLOB, CLOB, CLOB,\
                             CLOB, CLOB, CLOB, CLOB, CLOB),\
-     checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__));\
+     checked_call(func_new, 0, 0, 0, 0, 0, 0, 0, __VA_ARGS__,\
+                  7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0));\
     checkasm_set_signal_handler_state(0)
 #else
 #define declare_new(ret, ...)
@@ -270,8 +284,8 @@ void checkasm_stack_clobber(uint64_t clobber, ...);
             checkasm_set_signal_handler_state(1);\
             func_type *tfunc = func_new;\
             uint64_t tsum = 0;\
-            int ti, tcount = 0;\
-            for (ti = 0; ti < BENCH_RUNS; ti++) {\
+            int tcount = 0;\
+            for (int ti = 0; ti < BENCH_RUNS; ti++) {\
                 uint64_t t = readtime();\
                 tfunc(__VA_ARGS__);\
                 tfunc(__VA_ARGS__);\
diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/filmgrain.c b/ffmpeg/JNI/dav1d/tests/checkasm/filmgrain.c
index 5c9e0bfc2..1219ee7c8 100644
--- a/ffmpeg/JNI/dav1d/tests/checkasm/filmgrain.c
+++ b/ffmpeg/JNI/dav1d/tests/checkasm/filmgrain.c
@@ -181,8 +181,8 @@ static void check_fgy_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
         const int w = 1 + (rnd() & 127);
         const int h = 1 + (rnd() & 31);
 
-        for (int y = 0; y < h; y++)
-            for (int x = 0; x < w; x++)
+        for (int y = 0; y < 32; y++)
+            for (int x = 0; x < 128; x++)
                 src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
         const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
 
@@ -260,13 +260,12 @@ static void check_fguv_sbrow(const Dav1dFilmGrainDSPContext *const dsp) {
 
                 const int w = 1 + (rnd() & (127 >> ss_x));
                 const int h = 1 + (rnd() & (31 >> ss_y));
-                const int lw = w << ss_x, lh = h << ss_y;
 
-                for (int y = 0; y < h; y++)
-                    for (int x = 0; x < w; x++)
+                for (int y = 0; y < 32; y++)
+                    for (int x = 0; x < 128; x++)
                         src[y * PXSTRIDE(stride) + x] = rnd() & bitdepth_max;
-                for (int y = 0; y < lh; y++)
-                    for (int x = 0; x < lw; x++)
+                for (int y = 0; y < 32; y++)
+                    for (int x = 0; x < 128; x++)
                         luma_src[y * PXSTRIDE(lstride) + x] = rnd() & bitdepth_max;
                 const int row_num = rnd() & 1 ? rnd() & 0x7ff : 0;
 
diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/ipred.c b/ffmpeg/JNI/dav1d/tests/checkasm/ipred.c
index d5955d20e..6b054a700 100644
--- a/ffmpeg/JNI/dav1d/tests/checkasm/ipred.c
+++ b/ffmpeg/JNI/dav1d/tests/checkasm/ipred.c
@@ -75,59 +75,66 @@ static void check_intra_pred(Dav1dIntraPredDSPContext *const c) {
                  int width, int height, int angle, int max_width, int max_height
                  HIGHBD_DECL_SUFFIX);
 
-    for (int mode = 0; mode < N_IMPL_INTRA_PRED_MODES; mode++)
-        for (int w = 4; w <= (mode == FILTER_PRED ? 32 : 64); w <<= 1)
-            if (check_func(c->intra_pred[mode], "intra_pred_%s_w%d_%dbpc",
-                intra_pred_mode_names[mode], w, BITDEPTH))
-            {
-                for (int h = imax(w / 4, 4); h <= imin(w * 4,
-                    (mode == FILTER_PRED ? 32 : 64)); h <<= 1)
+    for (int mode = 0; mode < N_IMPL_INTRA_PRED_MODES; mode++) {
+        int bpc_min = BITDEPTH, bpc_max = BITDEPTH;
+        if (mode == FILTER_PRED && BITDEPTH == 16) {
+            bpc_min = 10;
+            bpc_max = 12;
+        }
+        for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2)
+            for (int w = 4; w <= (mode == FILTER_PRED ? 32 : 64); w <<= 1)
+                if (check_func(c->intra_pred[mode], "intra_pred_%s_w%d_%dbpc",
+                    intra_pred_mode_names[mode], w, bpc))
                 {
-                    const ptrdiff_t stride = w * sizeof(pixel);
-
-                    int a = 0, maxw = 0, maxh = 0;
-                    if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */
-                        a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
-                            (rnd() & 0x600);
-                        if (mode == Z2_PRED) {
-                            maxw = rnd(), maxh = rnd();
-                            maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
-                            maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
+                    for (int h = imax(w / 4, 4); h <= imin(w * 4,
+                        (mode == FILTER_PRED ? 32 : 64)); h <<= 1)
+                    {
+                        const ptrdiff_t stride = w * sizeof(pixel);
+
+                        int a = 0, maxw = 0, maxh = 0;
+                        if (mode >= Z1_PRED && mode <= Z3_PRED) { /* angle */
+                            a = (90 * (mode - Z1_PRED) + z_angles[rnd() % 27]) |
+                                (rnd() & 0x600);
+                            if (mode == Z2_PRED) {
+                                maxw = rnd(), maxh = rnd();
+                                maxw = 1 + (maxw & (maxw & 4096 ? 4095 : w - 1));
+                                maxh = 1 + (maxh & (maxh & 4096 ? 4095 : h - 1));
+                            }
+                        } else if (mode == FILTER_PRED) /* filter_idx */
+                            a = (rnd() % 5) | (rnd() & ~511);
+
+                        int bitdepth_max;
+                        if (bpc == 16)
+                            bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+                        else
+                            bitdepth_max = (1 << bpc) - 1;
+
+                        for (int i = -h * 2; i <= w * 2; i++)
+                            topleft[i] = rnd() & bitdepth_max;
+
+                        call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh
+                                 HIGHBD_TAIL_SUFFIX);
+                        call_new(a_dst, stride, topleft, w, h, a, maxw, maxh
+                                 HIGHBD_TAIL_SUFFIX);
+                        if (checkasm_check_pixel(c_dst, stride, a_dst, stride,
+                                                 w, h, "dst"))
+                        {
+                            if (mode == Z1_PRED || mode == Z3_PRED)
+                                fprintf(stderr, "angle = %d (0x%03x)\n",
+                                        a & 0x1ff, a & 0x600);
+                            else if (mode == Z2_PRED)
+                                fprintf(stderr, "angle = %d (0x%03x), "
+                                        "max_width = %d, max_height = %d\n",
+                                        a & 0x1ff, a & 0x600, maxw, maxh);
+                            else if (mode == FILTER_PRED)
+                                fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
                         }
-                    } else if (mode == FILTER_PRED) /* filter_idx */
-                        a = (rnd() % 5) | (rnd() & ~511);
-
-#if BITDEPTH == 16
-                    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
-#else
-                    const int bitdepth_max = 0xff;
-#endif
-
-                    for (int i = -h * 2; i <= w * 2; i++)
-                        topleft[i] = rnd() & bitdepth_max;
 
-                    call_ref(c_dst, stride, topleft, w, h, a, maxw, maxh
-                             HIGHBD_TAIL_SUFFIX);
-                    call_new(a_dst, stride, topleft, w, h, a, maxw, maxh
-                             HIGHBD_TAIL_SUFFIX);
-                    if (checkasm_check_pixel(c_dst, stride, a_dst, stride,
-                                             w, h, "dst"))
-                    {
-                        if (mode == Z1_PRED || mode == Z3_PRED)
-                            fprintf(stderr, "angle = %d (0x%03x)\n",
-                                    a & 0x1ff, a & 0x600);
-                        else if (mode == Z2_PRED)
-                            fprintf(stderr, "angle = %d (0x%03x), "
-                                    "max_width = %d, max_height = %d\n",
-                                    a & 0x1ff, a & 0x600, maxw, maxh);
-                        else if (mode == FILTER_PRED)
-                            fprintf(stderr, "filter_idx = %d\n", a & 0x1ff);
+                        bench_new(a_dst, stride, topleft, w, h, a, 128, 128
+                                  HIGHBD_TAIL_SUFFIX);
                     }
-
-                    bench_new(a_dst, stride, topleft, w, h, a, 128, 128
-                              HIGHBD_TAIL_SUFFIX);
                 }
-            }
+    }
     report("intra_pred");
 }
 
@@ -142,14 +149,21 @@ static void check_cfl_ac(Dav1dIntraPredDSPContext *const c) {
     for (int layout = 1; layout <= DAV1D_PIXEL_LAYOUT_I444; layout++) {
         const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
         const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
+        const int h_step = 2 >> ss_hor, v_step = 2 >> ss_ver;
         for (int w = 4; w <= (32 >> ss_hor); w <<= 1)
             if (check_func(c->cfl_ac[layout - 1], "cfl_ac_%s_w%d_%dbpc",
                 cfl_ac_names[layout - 1], w, BITDEPTH))
             {
-                for (int h = imax(w / 4, 4); h <= imin(w * 4, (32 >> ss_ver)); h <<= 1) {
+                for (int h = imax(w / 4, 4);
+                     h <= imin(w * 4, (32 >> ss_ver)); h <<= 1)
+                {
                     const ptrdiff_t stride = 32 * sizeof(pixel);
-                    for (int w_pad = (w >> 2) - 1; w_pad >= 0; w_pad--) {
-                        for (int h_pad = (h >> 2) - 1; h_pad >= 0; h_pad--) {
+                    for (int w_pad = imax((w >> 2) - h_step, 0);
+                         w_pad >= 0; w_pad -= h_step)
+                    {
+                        for (int h_pad = imax((h >> 2) - v_step, 0);
+                             h_pad >= 0; h_pad -= v_step)
+                        {
 #if BITDEPTH == 16
                             const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
 #else
diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/itx.c b/ffmpeg/JNI/dav1d/tests/checkasm/itx.c
index 9d715c8f8..01f5e0533 100644
--- a/ffmpeg/JNI/dav1d/tests/checkasm/itx.c
+++ b/ffmpeg/JNI/dav1d/tests/checkasm/itx.c
@@ -223,12 +223,16 @@ static int ftx(coef *const buf, const enum RectTxfmSize tx,
 }
 
 void bitfn(checkasm_check_itx)(void) {
-    Dav1dInvTxfmDSPContext c;
-    bitfn(dav1d_itx_dsp_init)(&c);
+#if BITDEPTH == 16
+    const int bpc_min = 10, bpc_max = 12;
+#else
+    const int bpc_min = 8, bpc_max = 8;
+#endif
 
     ALIGN_STK_64(coef, coeff, 2, [32 * 32]);
     ALIGN_STK_64(pixel, c_dst, 64 * 64,);
     ALIGN_STK_64(pixel, a_dst, 64 * 64,);
+    Dav1dInvTxfmDSPContext c = { { { 0 } } }; /* Zero unused function pointer elements. */
 
     static const uint8_t txfm_size_order[N_RECT_TX_SIZES] = {
         TX_4X4,   RTX_4X8,  RTX_4X16,
@@ -250,39 +254,38 @@ void bitfn(checkasm_check_itx)(void) {
         const int subsh_max = subsh_iters[imax(dav1d_txfm_dimensions[tx].lw,
                                                dav1d_txfm_dimensions[tx].lh)];
 
-        for (enum TxfmType txtp = 0; txtp < N_TX_TYPES_PLUS_LL; txtp++)
-            for (int subsh = 0; subsh < subsh_max; subsh++)
-                if (check_func(c.itxfm_add[tx][txtp],
-                               "inv_txfm_add_%dx%d_%s_%s_%d_%dbpc",
-                               w, h, itx_1d_names[itx_1d_types[txtp][0]],
-                               itx_1d_names[itx_1d_types[txtp][1]], subsh,
-                               BITDEPTH))
-                {
-#if BITDEPTH == 16
-                    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
-#else
-                    const int bitdepth_max = 0xff;
-#endif
-                    const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
-                    memcpy(coeff[1], coeff[0], sizeof(*coeff));
-
-                    for (int j = 0; j < w * h; j++)
-                        c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
-
-                    call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
-                             HIGHBD_TAIL_SUFFIX);
-                    call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
-                             HIGHBD_TAIL_SUFFIX);
-
-                    checkasm_check_pixel(c_dst, w * sizeof(*c_dst),
-                                         a_dst, w * sizeof(*a_dst),
-                                         w, h, "dst");
-                    if (memcmp(coeff[0], coeff[1], sizeof(*coeff)))
-                        fail();
-
-                    bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
-                              HIGHBD_TAIL_SUFFIX);
-                }
+        for (int bpc = bpc_min; bpc <= bpc_max; bpc += 2) {
+            bitfn(dav1d_itx_dsp_init)(&c, bpc);
+            for (enum TxfmType txtp = 0; txtp < N_TX_TYPES_PLUS_LL; txtp++)
+                for (int subsh = 0; subsh < subsh_max; subsh++)
+                    if (check_func(c.itxfm_add[tx][txtp],
+                                   "inv_txfm_add_%dx%d_%s_%s_%d_%dbpc",
+                                   w, h, itx_1d_names[itx_1d_types[txtp][0]],
+                                   itx_1d_names[itx_1d_types[txtp][1]], subsh,
+                                   bpc))
+                    {
+                        const int bitdepth_max = (1 << bpc) - 1;
+                        const int eob = ftx(coeff[0], tx, txtp, w, h, subsh, bitdepth_max);
+                        memcpy(coeff[1], coeff[0], sizeof(*coeff));
+
+                        for (int j = 0; j < w * h; j++)
+                            c_dst[j] = a_dst[j] = rnd() & bitdepth_max;
+
+                        call_ref(c_dst, w * sizeof(*c_dst), coeff[0], eob
+                                 HIGHBD_TAIL_SUFFIX);
+                        call_new(a_dst, w * sizeof(*c_dst), coeff[1], eob
+                                 HIGHBD_TAIL_SUFFIX);
+
+                        checkasm_check_pixel(c_dst, w * sizeof(*c_dst),
+                                             a_dst, w * sizeof(*a_dst),
+                                             w, h, "dst");
+                        if (memcmp(coeff[0], coeff[1], sizeof(*coeff)))
+                            fail();
+
+                        bench_new(a_dst, w * sizeof(*c_dst), coeff[0], eob
+                                  HIGHBD_TAIL_SUFFIX);
+                    }
+        }
         report("add_%dx%d", w, h);
     }
 }
diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/mc.c b/ffmpeg/JNI/dav1d/tests/checkasm/mc.c
index e820bda88..ff8680d10 100644
--- a/ffmpeg/JNI/dav1d/tests/checkasm/mc.c
+++ b/ffmpeg/JNI/dav1d/tests/checkasm/mc.c
@@ -38,6 +38,7 @@ static const char *const filter_names[] = {
 };
 
 static const char *const mxy_names[] = { "0", "h", "v", "hv" };
+static const char *const scaled_paths[] = { "", "_dy1", "_dy2" };
 
 static int mc_h_next(const int h) {
     switch (h) {
@@ -161,6 +162,112 @@ static void check_mct(Dav1dMCDSPContext *const c) {
     report("mct");
 }
 
+static void check_mc_scaled(Dav1dMCDSPContext *const c) { /* For each c->mc_scaled[] entry accepted by check_func(), runs call_ref() and call_new() on the same random source and compares c_dst with a_dst. */
+    ALIGN_STK_64(pixel, src_buf, 263 * 263,);
+    ALIGN_STK_64(pixel, c_dst,   128 * 128,);
+    ALIGN_STK_64(pixel, a_dst,   128 * 128,);
+    const pixel *src = src_buf + 263 * 3 + 3; /* starts 3 rows and 3 columns into src_buf (rows are 263 pixels wide) */
+    const ptrdiff_t src_stride = 263 * sizeof(pixel);
+#if BITDEPTH == 16
+    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+    const int bitdepth_max = 0xff;
+#endif
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride, const pixel *src,
+                 ptrdiff_t src_stride, int w, int h,
+                 int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX);
+
+    for (int filter = 0; filter < N_2D_FILTERS; filter++)
+        for (int w = 2; w <= 128; w <<= 1) {
+            const ptrdiff_t dst_stride = w * sizeof(pixel); /* destination rows are packed: stride equals the block width */
+            for (int p = 0; p < 3; ++p) { /* p indexes scaled_paths[] and selects how dy is chosen below */
+                if (check_func(c->mc_scaled[filter], "mc_scaled_%s_w%d%s_%dbpc",
+                               filter_names[filter], w, scaled_paths[p], BITDEPTH))
+                {
+                    const int h_min = w <= 32 ? 2 : w / 4;
+                    const int h_max = imax(imin(w * 4, 128), 32);
+                    for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
+                        const int mx = rnd() % 1024;
+                        const int my = rnd() % 1024;
+                        const int dx = rnd() % 2048 + 1;
+                        const int dy = !p
+                            ? rnd() % 2048 + 1
+                            : p << 10; // ystep=1.0 and ystep=2.0 paths
+
+                        for (int k = 0; k < 263 * 263; k++) /* the whole source buffer is re-randomized for every height */
+                            src_buf[k] = rnd() & bitdepth_max;
+
+                        call_ref(c_dst, dst_stride, src, src_stride,
+                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                        call_new(a_dst, dst_stride, src, src_stride,
+                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                        checkasm_check_pixel(c_dst, dst_stride,
+                                             a_dst, dst_stride, w, h, "dst");
+
+                        if (filter == FILTER_2D_8TAP_REGULAR || /* only these two filters are benchmarked */
+                            filter == FILTER_2D_BILINEAR)
+                            bench_new(a_dst, dst_stride, src, src_stride,
+                                      w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                    }
+                }
+            }
+        }
+    report("mc_scaled");
+}
+
+static void check_mct_scaled(Dav1dMCDSPContext *const c) { /* For each c->mct_scaled[] entry accepted by check_func(), runs call_ref() and call_new() on the same random source and compares the int16_t outputs c_tmp and a_tmp. */
+    ALIGN_STK_64(pixel, src_buf, 263 * 263,);
+    ALIGN_STK_64(int16_t, c_tmp,   128 * 128,);
+    ALIGN_STK_64(int16_t, a_tmp,   128 * 128,);
+    const pixel *src = src_buf + 263 * 3 + 3; /* starts 3 rows and 3 columns into src_buf (rows are 263 pixels wide) */
+    const ptrdiff_t src_stride = 263 * sizeof(pixel);
+#if BITDEPTH == 16
+    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+    const int bitdepth_max = 0xff;
+#endif
+
+    declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+                 int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX);
+
+    for (int filter = 0; filter < N_2D_FILTERS; filter++)
+        for (int w = 4; w <= 128; w <<= 1)
+            for (int p = 0; p < 3; ++p) { /* p indexes scaled_paths[] and selects how dy is chosen below */
+                if (check_func(c->mct_scaled[filter], "mct_scaled_%s_w%d%s_%dbpc",
+                               filter_names[filter], w, scaled_paths[p], BITDEPTH))
+                {
+                    const int h_min = imax(w / 4, 4);
+                    const int h_max = imin(w * 4, 128);
+                    for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
+                        const int mx = rnd() % 1024;
+                        const int my = rnd() % 1024;
+                        const int dx = rnd() % 2048 + 1;
+                        const int dy = !p
+                            ? rnd() % 2048 + 1
+                            : p << 10; // ystep=1.0 and ystep=2.0 paths
+
+                        for (int k = 0; k < 263 * 263; k++) /* the whole source buffer is re-randomized for every height */
+                            src_buf[k] = rnd() & bitdepth_max;
+
+                        call_ref(c_tmp, src, src_stride,
+                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                        call_new(a_tmp, src, src_stride,
+                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                        checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp), /* both buffers are compared as packed rows of w int16_t values */
+                                                a_tmp, w * sizeof(*a_tmp),
+                                                w, h, "tmp");
+
+                        if (filter == FILTER_2D_8TAP_REGULAR || /* only these two filters are benchmarked */
+                            filter == FILTER_2D_BILINEAR)
+                            bench_new(a_tmp, src, src_stride,
+                                      w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                    }
+                }
+            }
+    report("mct_scaled");
+}
+
 static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
                      int16_t (*const tmp)[128 * 128], const int bitdepth_max)
 {
@@ -573,12 +680,68 @@ static void check_emuedge(Dav1dMCDSPContext *const c) {
     report("emu_edge");
 }
 
+static int get_upscale_x0(const int in_w, const int out_w, const int step) { // yields the mx0 argument that check_resize() hands to c->resize
+    const int err = out_w * step - (in_w << 14); // out_w steps minus the input width scaled by 1 << 14
+    const int x0 = (-((out_w - in_w) << 13) + (out_w >> 1)) / out_w + 128 - (err >> 1); // half the divisor is added before dividing; half of err is subtracted
+    return x0 & 0x3fff; // only the low 14 bits are returned
+}
+
+static void check_resize(Dav1dMCDSPContext *const c) { // checks c->resize against the reference on one randomly sized upscale
+    ALIGN_STK_64(pixel, c_dst, 1024 * 64,); // reference output, 64 rows of 1024 pixels (see dst_stride)
+    ALIGN_STK_64(pixel, a_dst, 1024 * 64,); // output of the implementation under test, same layout
+    ALIGN_STK_64(pixel, src,   512 * 64,); // shared input, 64 rows of 512 pixels (see src_stride)
+
+    const int height = 64;
+    const int max_src_width = 512;
+    const ptrdiff_t dst_stride = 1024 * sizeof(pixel);
+    const ptrdiff_t src_stride = 512 * sizeof(pixel);
+
+    declare_func(void, pixel *dst, ptrdiff_t dst_stride,
+                 const pixel *src, ptrdiff_t src_stride,
+                 int dst_w, int h, int src_w, int dx, int mx0 // names follow the argument order used by every call below
+                 HIGHBD_DECL_SUFFIX);
+
+    if (check_func(c->resize, "resize_%dbpc", BITDEPTH)) {
+#if BITDEPTH == 16
+        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff; // randomly pick the 10-bit or the 12-bit pixel mask
+#else
+        const int bitdepth_max = 0xff;
+#endif
+
+        for (int i = 0; i < max_src_width * height; i++)
+            src[i] = rnd() & bitdepth_max; // random pixels limited to the chosen bit depth
+
+        const int w_den = 9 + (rnd() & 7); // 9..16, i.e. an upscale factor of w_den / 8
+        const int src_w = 16 + (rnd() % (max_src_width - 16 + 1)); // 16..512
+        const int dst_w = w_den * src_w >> 3; // at most 16 * 512 / 8 = 1024, the width of the dst buffers
+#define scale_fac(ref_sz, this_sz) \
+    ((((ref_sz) << 14) + ((this_sz) >> 1)) / (this_sz))
+        const int dx = scale_fac(src_w, dst_w); // src_w / dst_w scaled by 1 << 14, with half the divisor added first
+#undef scale_fac
+        const int mx0 = get_upscale_x0(src_w, dst_w, dx);
+
+        call_ref(c_dst, dst_stride, src, src_stride,
+                 dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
+        call_new(a_dst, dst_stride, src, src_stride,
+                 dst_w, height, src_w, dx, mx0 HIGHBD_TAIL_SUFFIX);
+        checkasm_check_pixel(c_dst, dst_stride, a_dst, dst_stride,
+                             dst_w, height, "dst"); // only the dst_w x height region is compared
+
+        bench_new(a_dst, dst_stride, src, src_stride,
+                  512, height, 512 * 8 / w_den, dx, mx0 HIGHBD_TAIL_SUFFIX); // fixed 512-pixel output width; source width keeps the w_den / 8 ratio
+    }
+
+    report("resize");
+}
+
 void bitfn(checkasm_check_mc)(void) {
     Dav1dMCDSPContext c;
     bitfn(dav1d_mc_dsp_init)(&c);
 
     check_mc(&c);
     check_mct(&c);
+    check_mc_scaled(&c);
+    check_mct_scaled(&c);
     check_avg(&c);
     check_w_avg(&c);
     check_mask(&c);
@@ -589,4 +752,5 @@ void bitfn(checkasm_check_mc)(void) {
     check_warp8x8(&c);
     check_warp8x8t(&c);
     check_emuedge(&c);
+    check_resize(&c);
 }
diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/msac.c b/ffmpeg/JNI/dav1d/tests/checkasm/msac.c
index 482d3af90..cdaf0de81 100644
--- a/ffmpeg/JNI/dav1d/tests/checkasm/msac.c
+++ b/ffmpeg/JNI/dav1d/tests/checkasm/msac.c
@@ -239,7 +239,7 @@ void checkasm_check_msac(void) {
     c.bool           = dav1d_msac_decode_bool_c;
     c.hi_tok         = dav1d_msac_decode_hi_tok_c;
 
-#if ARCH_AARCH64 && HAVE_ASM
+#if (ARCH_AARCH64 || ARCH_ARM) && HAVE_ASM
     if (dav1d_get_cpu_flags() & DAV1D_ARM_CPU_FLAG_NEON) {
         c.symbol_adapt4  = dav1d_msac_decode_symbol_adapt4_neon;
         c.symbol_adapt8  = dav1d_msac_decode_symbol_adapt8_neon;
@@ -247,6 +247,7 @@ void checkasm_check_msac(void) {
         c.bool_adapt     = dav1d_msac_decode_bool_adapt_neon;
         c.bool_equi      = dav1d_msac_decode_bool_equi_neon;
         c.bool           = dav1d_msac_decode_bool_neon;
+        c.hi_tok         = dav1d_msac_decode_hi_tok_neon;
     }
 #elif ARCH_X86 && HAVE_ASM
     if (dav1d_get_cpu_flags() & DAV1D_X86_CPU_FLAG_SSE2) {
diff --git a/ffmpeg/JNI/dav1d/tests/checkasm/x86/checkasm.asm b/ffmpeg/JNI/dav1d/tests/checkasm/x86/checkasm.asm
index e00e5bd0d..bc7ec2201 100644
--- a/ffmpeg/JNI/dav1d/tests/checkasm/x86/checkasm.asm
+++ b/ffmpeg/JNI/dav1d/tests/checkasm/x86/checkasm.asm
@@ -27,13 +27,11 @@
 %include "config.asm"
 %include "ext/x86/x86inc.asm"
 
-SECTION_RODATA
-
-error_message: db "failed to preserve register", 0
+SECTION_RODATA 16
 
 %if ARCH_X86_64
 ; just random numbers to reduce the chance of incidental match
-ALIGN 16
+%if WIN64
 x6:  dq 0x1a1b2550a612b48c,0x79445c159ce79064
 x7:  dq 0x2eed899d5a28ddcd,0x86b2536fcd8cf636
 x8:  dq 0xb0856806085e7943,0x3f2bf84fc0fcca4e
@@ -46,6 +44,7 @@ x14: dq 0x135ce6888fa02cbf,0x11e53e2b2ac655ef
 x15: dq 0x011ff554472a7a10,0x6de8f4c914c334d5
 n7:  dq 0x21f86d66c8ca00ce
 n8:  dq 0x75b6ba21077c48ad
+%endif
 n9:  dq 0xed56bb2dcb3c7736
 n10: dq 0x8bda43d3fd1a7e06
 n11: dq 0xb64a9c9e5d318408
@@ -54,6 +53,9 @@ n13: dq 0x4a75479abd64e097
 n14: dq 0x249214109d5d1c88
 %endif
 
+errmsg_reg:   db "failed to preserve register", 0
+errmsg_stack: db "stack corruption", 0
+
 SECTION .text
 
 cextern fail_func
@@ -67,7 +69,7 @@ cextern fail_func
 ;-----------------------------------------------------------------------------
 ; int checkasm_stack_clobber(uint64_t clobber, ...)
 ;-----------------------------------------------------------------------------
-cglobal stack_clobber, 1,2
+cglobal stack_clobber, 1, 2
     ; Clobber the stack with junk below the stack pointer
     %define argsize (max_args+6)*8
     SUB  rsp, argsize
@@ -81,9 +83,13 @@ cglobal stack_clobber, 1,2
 
 %if WIN64
     %assign free_regs 7
+    %define stack_param rsp+32 ; shadow space
+    %define num_stack_params rsp+stack_offset+22*8
     DECLARE_REG_TMP 4
 %else
     %assign free_regs 9
+    %define stack_param rsp
+    %define num_stack_params rsp+stack_offset+16*8
     DECLARE_REG_TMP 7
 %endif
 
@@ -91,7 +97,7 @@ cglobal stack_clobber, 1,2
 ; void checkasm_checked_call(void *func, ...)
 ;-----------------------------------------------------------------------------
 INIT_XMM
-cglobal checked_call, 2,15,16,max_args*8+8
+cglobal checked_call, 2, 15, 16, max_args*8+64+8
     mov  t0, r0
 
     ; All arguments have been pushed on the stack instead of registers in
@@ -104,20 +110,7 @@ cglobal checked_call, 2,15,16,max_args*8+8
 %if UNIX64
     mov  r4, r10mp
     mov  r5, r11mp
-    %assign i 6
-    %rep max_args-6
-        mov  r9, [rsp+stack_offset+(i+1)*8]
-        mov  [rsp+(i-6)*8], r9
-        %assign i i+1
-    %endrep
 %else ; WIN64
-    %assign i 4
-    %rep max_args-4
-        mov  r9, [rsp+stack_offset+(i+7)*8]
-        mov  [rsp+i*8], r9
-        %assign i i+1
-    %endrep
-
     ; Move possible floating-point arguments to the correct registers
     movq m0, r0
     movq m1, r1
@@ -131,22 +124,44 @@ cglobal checked_call, 2,15,16,max_args*8+8
     %endrep
 %endif
 
+    ; write stack canaries to the area above parameters passed on the stack
+    mov r9d, [num_stack_params]
+    mov  r8, [rsp+stack_offset] ; return address
+    not  r8
+%assign i 0
+%rep 8 ; 64 bytes
+    mov [stack_param+(r9+i)*8], r8
+    %assign i i+1
+%endrep
+    dec r9d
+    jl .stack_setup_done ; no stack parameters
+.copy_stack_parameter:
+    mov  r8, [stack_param+stack_offset+7*8+r9*8]
+    mov [stack_param+r9*8], r8
+    dec r9d
+    jge .copy_stack_parameter
+.stack_setup_done:
+
 %assign i 14
 %rep 15-free_regs
     mov r %+ i, [n %+ i]
     %assign i i-1
 %endrep
     call t0
-%assign i 14
-%rep 15-free_regs
+
+    ; check for failure to preserve registers
+    xor r14, [n14]
+    lea  r0, [errmsg_reg]
+%assign i 13
+%rep 14-free_regs
     xor r %+ i, [n %+ i]
     or  r14, r %+ i
     %assign i i-1
 %endrep
-
 %if WIN64
-    %assign i 6
-    %rep 16-6
+    pxor m6, [x6]
+    %assign i 7
+    %rep 16-7
         pxor m %+ i, [x %+ i]
         por  m6, m %+ i
         %assign i i+1
@@ -155,14 +170,30 @@ cglobal checked_call, 2,15,16,max_args*8+8
     movq r5, m6
     or  r14, r5
 %endif
+    jnz .fail
 
-    ; Call fail_func() with a descriptive message to mark it as a failure
-    ; if the called function didn't preserve all callee-saved registers.
-    ; Save the return value located in rdx:rax first to prevent clobbering.
+    ; check for stack corruption
+    mov r9d, [num_stack_params]
+    mov  r8, [rsp+stack_offset]
+    mov  r4, [stack_param+r9*8]
+    not  r8
+    xor  r4, r8
+%assign i 1
+%rep 6
+    mov  r5, [stack_param+(r9+i)*8]
+    xor  r5, r8
+    or   r4, r5
+    %assign i i+1
+%endrep
+    xor  r8, [stack_param+(r9+7)*8]
+    or   r4, r8
     jz .ok
+    add  r0, errmsg_stack-errmsg_reg
+.fail:
+    ; Call fail_func() with a descriptive message to mark it as a failure.
+    ; Save the return value located in rdx:rax first to prevent clobbering.
     mov  r9, rax
     mov r10, rdx
-    lea  r0, [error_message]
     xor eax, eax
     call fail_func
     mov rdx, r10
@@ -186,40 +217,70 @@ WARMUP
 %else
 
 ; just random numbers to reduce the chance of incidental match
-%define n3 dword 0x6549315c
-%define n4 dword 0xe02f3e23
-%define n5 dword 0xb78d0d1d
-%define n6 dword 0x33627ba7
+%assign n3 0x6549315c
+%assign n4 0xe02f3e23
+%assign n5 0xb78d0d1d
+%assign n6 0x33627ba7
 
 ;-----------------------------------------------------------------------------
 ; void checkasm_checked_call(void *func, ...)
 ;-----------------------------------------------------------------------------
-cglobal checked_call, 1,7
+cglobal checked_call, 1, 7
+    mov  r3, [esp+stack_offset]      ; return address
+    mov  r1, [esp+stack_offset+17*4] ; num_stack_params
+    mov  r2, 27 ; dwords to push in total (canaries + parameters); NOTE(review): both loops push before testing, so this assumes 1 <= num_stack_params <= 26
+    not  r3 ; canary value = inverted return address
+    sub  r2, r1 ; r2 = number of canary dwords
+.push_canary:
+    push r3
+    dec  r2
+    jg .push_canary
+.push_parameter:
+    push dword [esp+32*4] ; esp drops by 4 per push, so this fixed offset reads a different source dword each iteration
+    dec  r1
+    jg .push_parameter
     mov  r3, n3
     mov  r4, n4
     mov  r5, n5
     mov  r6, n6
-%rep max_args
-    PUSH dword [esp+20+max_args*4]
-%endrep
     call r0
+
+    ; check for failure to preserve registers
     xor  r3, n3
     xor  r4, n4
     xor  r5, n5
     xor  r6, n6
     or   r3, r4
     or   r5, r6
+    LEA  r1, errmsg_reg ; message used if .fail is reached from the jnz below; placed before the final or whose flags jnz tests
     or   r3, r5
+    jnz .fail ; any of r3-r6 differs from its n3-n6 constant
+
+    ; check for stack corruption
+    mov  r3, [esp+48*4] ; num_stack_params
+    mov  r6, [esp+31*4] ; return address
+    mov  r4, [esp+r3*4] ; canary slot at index num_stack_params
+    sub  r3, 26 ; r3 = num_stack_params - 26; the loop below counts it up to 0
+    not  r6 ; rebuild the expected canary value
+    xor  r4, r6 ; zero only if that slot is intact
+.check_canary:
+    mov  r5, [esp+(r3+27)*4] ; visits slots num_stack_params+1 .. 26
+    xor  r5, r6
+    or   r4, r5 ; accumulate any mismatch
+    inc  r3
+    jl .check_canary
+    test r4, r4
     jz .ok
+    add  r1, errmsg_stack-errmsg_reg ; r1 holds errmsg_reg's address; retarget it to errmsg_stack
+.fail:
     mov  r3, eax
     mov  r4, edx
-    LEA  r0, error_message
-    mov [esp], r0
+    mov [esp], r1 ; selected message pointer goes to the top of the stack for fail_func
     call fail_func
-    mov  edx, r4
-    mov  eax, r3
+    mov edx, r4
+    mov eax, r3
 .ok:
-    add  esp, max_args*4
+    add esp, 27*4 ; drop the 27 dwords pushed at entry
     RET
 
 %endif ; ARCH_X86_64
diff --git a/ffmpeg/JNI/dav1d/tests/libfuzzer/dav1d_fuzzer.c b/ffmpeg/JNI/dav1d/tests/libfuzzer/dav1d_fuzzer.c
index 9d8b3852a..4506d2f9f 100644
--- a/ffmpeg/JNI/dav1d/tests/libfuzzer/dav1d_fuzzer.c
+++ b/ffmpeg/JNI/dav1d/tests/libfuzzer/dav1d_fuzzer.c
@@ -69,13 +69,6 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
 
     dav1d_version();
 
-    // memory sanitizer is inherently incompatible with asm
-#if defined(__has_feature)
-  #if __has_feature(memory_sanitizer)
-    dav1d_set_cpu_flags_mask(0);
-  #endif
-#endif
-
     if (size < 32) goto end;
 #ifdef DAV1D_ALLOC_FAIL
     unsigned h = djb_xor(ptr, 32);
diff --git a/ffmpeg/JNI/dav1d/tools/dav1d.c b/ffmpeg/JNI/dav1d/tools/dav1d.c
index 97c780146..4b97a9f20 100644
--- a/ffmpeg/JNI/dav1d/tools/dav1d.c
+++ b/ffmpeg/JNI/dav1d/tools/dav1d.c
@@ -63,7 +63,9 @@ static uint64_t get_time_nanos(void) {
     QueryPerformanceFrequency(&frequency);
     LARGE_INTEGER t;
     QueryPerformanceCounter(&t);
-    return 1000000000 * t.QuadPart / frequency.QuadPart;
+    uint64_t seconds = t.QuadPart / frequency.QuadPart;
+    uint64_t fractions = t.QuadPart % frequency.QuadPart;
+    return 1000000000 * seconds + 1000000000 * fractions / frequency.QuadPart;
 #elif defined(HAVE_CLOCK_GETTIME)
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
@@ -245,7 +247,7 @@ int main(const int argc, char *const *const argv) {
             if ((res = output_write(out, &p)) < 0)
                 break;
             n_out++;
-            if (nspf) {
+            if (nspf || !cli_settings.quiet) {
                 synchronize(cli_settings.realtime, cli_settings.realtime_cache,
                             n_out, nspf, tfirst, &elapsed, frametimes);
             }
@@ -282,7 +284,7 @@ int main(const int argc, char *const *const argv) {
             if ((res = output_write(out, &p)) < 0)
                 break;
             n_out++;
-            if (nspf) {
+            if (nspf || !cli_settings.quiet) {
                 synchronize(cli_settings.realtime, cli_settings.realtime_cache,
                             n_out, nspf, tfirst, &elapsed, frametimes);
             }
diff --git a/ffmpeg/JNI/dav1d/tools/dav1d_cli_parse.c b/ffmpeg/JNI/dav1d/tools/dav1d_cli_parse.c
index 98b425317..f363033ed 100644
--- a/ffmpeg/JNI/dav1d/tools/dav1d_cli_parse.c
+++ b/ffmpeg/JNI/dav1d/tools/dav1d_cli_parse.c
@@ -84,6 +84,8 @@ static const struct option long_opts[] = {
 
 #if ARCH_AARCH64 || ARCH_ARM
 #define ALLOWED_CPU_MASKS " or 'neon'"
+#elif ARCH_PPC64LE
+#define ALLOWED_CPU_MASKS " or 'vsx'"
 #elif ARCH_X86
 #define ALLOWED_CPU_MASKS \
     ", 'sse2', 'ssse3', 'sse41', 'avx2' or 'avx512icl'"
@@ -116,7 +118,7 @@ static void usage(const char *const app, const char *const reason, ...) {
             " --framethreads $num:  number of frame threads (default: 1)\n"
             " --tilethreads $num:   number of tile threads (default: 1)\n"
             " --filmgrain $num:     enable film grain application (default: 1, except if muxer is md5)\n"
-            " --oppoint $num:       select an operating point of a scalable AV1 bitstream (0 - 32)\n"
+            " --oppoint $num:       select an operating point of a scalable AV1 bitstream (0 - 31)\n"
             " --alllayers $num:     output all spatial layers of a scalable AV1 bitstream (default: 1)\n"
             " --sizelimit $num:     stop decoding if the frame size exceeds the specified limit\n"
             " --verify $md5:        verify decoded md5. implies --muxer md5, no output\n"
@@ -187,6 +189,8 @@ enum CpuMask {
 static const EnumParseTable cpu_mask_tbl[] = {
 #if ARCH_AARCH64 || ARCH_ARM
     { "neon", DAV1D_ARM_CPU_FLAG_NEON },
+#elif ARCH_PPC64LE
+    { "vsx", DAV1D_PPC_CPU_FLAG_VSX },
 #elif ARCH_X86
     { "sse2",      X86_CPU_MASK_SSE2 },
     { "ssse3",     X86_CPU_MASK_SSSE3 },
diff --git a/ffmpeg/JNI/dav1d/tools/input/input.c b/ffmpeg/JNI/dav1d/tools/input/input.c
index d8a56c182..3ed6983ac 100644
--- a/ffmpeg/JNI/dav1d/tools/input/input.c
+++ b/ffmpeg/JNI/dav1d/tools/input/input.c
@@ -82,6 +82,10 @@ int input_open(DemuxerContext **const c_out,
             return DAV1D_ERR(ENOMEM);
         }
         FILE *f = fopen(filename, "rb");
+        if (!f) {
+            fprintf(stderr, "Failed to open input file %s: %s\n", filename, strerror(errno));
+            return errno ? DAV1D_ERR(errno) : DAV1D_ERR(EIO);
+        }
         res = !!fread(probe_data, 1, probe_sz, f);
         fclose(f);
         if (!res) {
diff --git a/ffmpeg/JNI/dav1d/tools/input/ivf.c b/ffmpeg/JNI/dav1d/tools/input/ivf.c
index 746391d4c..7b572ee73 100644
--- a/ffmpeg/JNI/dav1d/tools/input/ivf.c
+++ b/ffmpeg/JNI/dav1d/tools/input/ivf.c
@@ -28,6 +28,7 @@
 #include "config.h"
 
 #include <errno.h>
+#include <limits.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -92,8 +93,27 @@ static int ivf_open(IvfInputContext *const c, const char *const file,
             break; // EOF
         fseeko(c->f, rl32(data) + 8, SEEK_CUR);
     }
-    fps[0] = timebase[0] * *num_frames;
-    fps[1] = timebase[1] * duration;
+
+    uint64_t fps_num = (uint64_t) timebase[0] * *num_frames;
+    uint64_t fps_den = (uint64_t) timebase[1] * duration;
+    if (fps_num && fps_den) { /* Reduce fraction */
+        uint64_t gcd = fps_num;
+        for (uint64_t a = fps_den, b; (b = a % gcd); a = gcd, gcd = b);
+        fps_num /= gcd;
+        fps_den /= gcd;
+
+        while ((fps_num | fps_den) > UINT_MAX) {
+            fps_num >>= 1;
+            fps_den >>= 1;
+        }
+    }
+    if (fps_num && fps_den) {
+        fps[0] = (unsigned) fps_num;
+        fps[1] = (unsigned) fps_den;
+    } else {
+        fps[0] = fps[1] = 0;
+    }
+
     fseeko(c->f, 32, SEEK_SET);
 
     return 0;