From 0d5c8e53861adbb54ee9952ed76c2e8dd029ca96 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 12:43:13 +0530 Subject: [PATCH 01/16] arm: Determine the abi from compiler if not specified on command line If ARM abi is not explicitly mentioned on the command line, then set the arm abi to softfp or hard according to the compiler environment. This assumes that compiler sets the defines __ARM_PCS and __ARM_PCS_VFP accordingly. --- Makefile.arm | 23 +++-------------------- Makefile.system | 24 ++++++++++++++++-------- c_check | 16 +++++++++++++++- common_arm.h | 5 ----- 4 files changed, 34 insertions(+), 34 deletions(-) diff --git a/Makefile.arm b/Makefile.arm index c189b0c479..eedd39b736 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,5 +1,4 @@ -#ifeq logical or -ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) +ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(OSNAME), Android) CCOMMON_OPT += -mfpu=neon -march=armv7-a FCOMMON_OPT += -mfpu=neon -march=armv7-a @@ -9,28 +8,12 @@ FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a endif endif -ifeq ($(CORE), ARMV7) -ifeq ($(OSNAME), Android) -ifeq ($(ARM_SOFTFP_ABI), 1) -CCOMMON_OPT += -mfpu=neon -march=armv7-a -FCOMMON_OPT += -mfpu=neon -march=armv7-a -else -CCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch -FCOMMON_OPT += -mfpu=neon -march=armv7-a -Wl,--no-warn-mismatch -endif -else -CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a -FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a -endif -endif - ifeq ($(CORE), ARMV6) CCOMMON_OPT += -mfpu=vfp -march=armv6 FCOMMON_OPT += -mfpu=vfp -march=armv6 endif - ifeq ($(CORE), ARMV5) -CCOMMON_OPT += -marm -march=armv5 -FCOMMON_OPT += -marm -march=armv5 +CCOMMON_OPT += -march=armv5 +FCOMMON_OPT += -march=armv5 endif diff --git a/Makefile.system b/Makefile.system index 29d3efd53b..2cae5f1c9d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -242,6 +242,10 @@ EXTRALIB += -lm NO_EXPRECISION = 1 endif +ifeq ($(OSNAME), Android) +EXTRALIB += -lm +endif + ifeq ($(OSNAME), AIX) EXTRALIB += -lm endif @@ -483,16 +487,20 @@ ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 -CCOMMON_OPT += -marm -FCOMMON_OPT += -marm - -ifeq ($(ARM_SOFTFP_ABI), 1) -CCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI -FCOMMON_OPT += -mfloat-abi=softfp -DARM_SOFTFP_ABI +# If ABI is specified on command line use it. Else use the automatically detected ABI. +ifeq ($(ARM_SOFTFP_ABI),1) +ARM_ABI = softfp else -CCOMMON_OPT += -mfloat-abi=hard -FCOMMON_OPT += -mfloat-abi=hard +ifeq ($(ARM_HARD_ABI),1) +ARM_ABI = hard +else +ARM_ABI=$(ARM_ABI_AUTO) +endif endif +export ARM_ABI_AUTO +CCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) +FCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) + endif ifeq ($(ARCH), arm64) diff --git a/c_check b/c_check index 20da288bec..2e7e08cfba 100644 --- a/c_check +++ b/c_check @@ -94,7 +94,17 @@ if ($architecture eq "mips64") { $defined = 1; } -if (($architecture eq "arm") || ($architecture eq "arm64")) { +if ($architecture eq "arm") { + $defined = 1; + $data = `$compiler_name -dM -E ctest2.c | grep -w __ARM_PCS_VFP`; + if ($data ne "") { + $abi = "hard"; + } else { + $abi = "softfp"; + } +} + +if ($architecture eq "arm64") { $defined = 1; } @@ -287,6 +297,10 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; +if ($architecture eq "arm") { + print MAKEFILE "ARM_ABI_AUTO=$abi\n"; +} + $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; $compiler =~ tr/[a-z]/[A-Z]/; diff --git a/common_arm.h b/common_arm.h index a17acb4487..27fa76b765 100644 --- a/common_arm.h +++ b/common_arm.h @@ -111,11 +111,6 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define PROFCODE -#ifdef __ARM_PCS -//-mfloat-abi=softfp -#define SOFT_FLOAT_ABI -#endif - #endif From da7f0ff425a4ccd9a4f32b7fff33b9ef807ad0f4 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 12:46:18 +0530 Subject: [PATCH 02/16] generic: add some generic gemm and trmm kernels Added generic 4x4 and 4x2 gemm kernels Added generic 4x2 trmm kernel --- kernel/generic/gemmkernel_4x2.c | 317 ++++++++++++++++++ kernel/generic/gemmkernel_4x4.c | 571 ++++++++++++++++++++++++++++++++ kernel/generic/trmmkernel_4x2.c | 528 +++++++++++++++++++++++++++++ 3 files changed, 1416 insertions(+) create mode 100644 kernel/generic/gemmkernel_4x2.c create mode 100644 kernel/generic/gemmkernel_4x4.c create mode 100644 kernel/generic/trmmkernel_4x2.c diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c new file mode 100644 index 0000000000..1d15de1d7b --- /dev/null +++ b/kernel/generic/gemmkernel_4x2.c @@ -0,0 +1,317 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + + for (j=0; j<(bn/2); j+=2) + { + C0 = C; + C1 = C0+ldc; + + ptrba = ba; + + for (i=0; i + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + + for (j=0; j + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + + BLASLONG off, temp; + + bool left; + bool transposed; + bool backwards; + +#ifdef LEFT + left = true; +#else + left = false; +#endif + +#ifdef TRANSA + transposed = true; +#else + transposed = false; +#endif + + backwards = left != transposed; + + if (!left) { + off = -offset; + } + + for (j=0; j<(bn/2); j+=2) // do the Mx2 loops + { + C0 = C; + C1 = C0+ldc; + +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + + + ptrba = ba; + + for (i=0; i Date: Fri, 30 Jun 2017 13:06:38 +0530 Subject: [PATCH 03/16] arm: Use assembly implementations based on the ARM abi In case of softfp abi, assembly implementations of only those APIs are used which doesnt have a floating point argument or return value. In case of hard abi, all assembly implementations are used. --- kernel/arm/KERNEL.ARMV6 | 112 +++++++++++++++-------------------- kernel/arm/KERNEL.ARMV7 | 125 +++++----------------------------------- 2 files changed, 60 insertions(+), 177 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 16bde105b3..a2dd4806de 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,7 +1,5 @@ +include $(KERNELDIR)/KERNEL.ARMV5 - - -############################################################################### SAMAXKERNEL = iamax_vfp.S DAMAXKERNEL = iamax_vfp.S CAMAXKERNEL = iamax_vfp.S @@ -34,6 +32,45 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S +SGEMMKERNEL = ../generic/gemmkernel_4x2.c +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = sgemm_ncopy_4_vfp.S +SGEMMITCOPY = sgemm_tcopy_4_vfp.S +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +endif +SGEMMONCOPY = sgemm_ncopy_2_vfp.S +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_4x2.c +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) +DGEMMINCOPY = dgemm_ncopy_4_vfp.S +DGEMMITCOPY = dgemm_tcopy_4_vfp.S +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +endif +DGEMMONCOPY = dgemm_ncopy_2_vfp.S +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +STRMMKERNEL = ../generic/trmmkernel_4x2.c +DTRMMKERNEL = ../generic/trmmkernel_4x2.c + +CGEMMONCOPY = cgemm_ncopy_2_vfp.S +CGEMMOTCOPY = cgemm_tcopy_2_vfp.S +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMONCOPY = zgemm_ncopy_2_vfp.S +ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +ifeq ($(ARM_ABI),hard) + SASUMKERNEL = asum_vfp.S DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S @@ -44,11 +81,6 @@ DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c - SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S @@ -64,11 +96,6 @@ DROTKERNEL = rot_vfp.S CROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S @@ -84,63 +111,14 @@ DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S -STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S -ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S SGEMMKERNEL = sgemm_kernel_4x2_vfp.S -SGEMMINCOPY = sgemm_ncopy_4_vfp.S -SGEMMITCOPY = sgemm_tcopy_4_vfp.S -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPY = sgemm_ncopy_2_vfp.S -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - DGEMMKERNEL = dgemm_kernel_4x2_vfp.S -DGEMMINCOPY = dgemm_ncopy_4_vfp.S -DGEMMITCOPY = dgemm_tcopy_4_vfp.S -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPY = dgemm_ncopy_2_vfp.S -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - CGEMMKERNEL = cgemm_kernel_2x2_vfp.S -CGEMMONCOPY = cgemm_ncopy_2_vfp.S -CGEMMOTCOPY = cgemm_tcopy_2_vfp.S -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S -ZGEMMONCOPY = zgemm_ncopy_2_vfp.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - +endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d5cd94fbdb..d4829faa32 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,86 +1,29 @@ +include $(KERNELDIR)/KERNEL.ARMV6 -################################################################################# -SAMAXKERNEL = iamax_vfp.S -DAMAXKERNEL = iamax_vfp.S -CAMAXKERNEL = iamax_vfp.S -ZAMAXKERNEL = iamax_vfp.S +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_4x4.c -SAMINKERNEL = iamax_vfp.S -DAMINKERNEL = iamax_vfp.S -CAMINKERNEL = iamax_vfp.S -ZAMINKERNEL = iamax_vfp.S - -SMAXKERNEL = iamax_vfp.S -DMAXKERNEL = iamax_vfp.S - -SMINKERNEL = iamax_vfp.S -DMINKERNEL = iamax_vfp.S - -ISAMAXKERNEL = iamax_vfp.S -IDAMAXKERNEL = iamax_vfp.S -ICAMAXKERNEL = iamax_vfp.S -IZAMAXKERNEL = iamax_vfp.S - -ISAMINKERNEL = iamax_vfp.S -IDAMINKERNEL = iamax_vfp.S -ICAMINKERNEL = iamax_vfp.S -IZAMINKERNEL = iamax_vfp.S - -ISMAXKERNEL = iamax_vfp.S -IDMAXKERNEL = iamax_vfp.S - -ISMINKERNEL = iamax_vfp.S -IDMINKERNEL = iamax_vfp.S - -SSWAPKERNEL = swap_vfp.S -DSWAPKERNEL = swap_vfp.S -CSWAPKERNEL = swap_vfp.S -ZSWAPKERNEL = swap_vfp.S - -SASUMKERNEL = asum_vfp.S -DASUMKERNEL = asum_vfp.S -CASUMKERNEL = asum_vfp.S -ZASUMKERNEL = asum_vfp.S - -SAXPYKERNEL = axpy_vfp.S -DAXPYKERNEL = axpy_vfp.S -CAXPYKERNEL = axpy_vfp.S -ZAXPYKERNEL = axpy_vfp.S +SGEMMKERNEL = ../generic/gemmkernel_4x4.c +SGEMMONCOPY = sgemm_ncopy_4_vfp.S +SGEMMOTCOPY = sgemm_tcopy_4_vfp.S +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o -SCOPYKERNEL = copy.c -DCOPYKERNEL = copy.c -CCOPYKERNEL = zcopy.c -ZCOPYKERNEL = zcopy.c +DGEMMKERNEL = ../generic/gemmkernel_4x4.c +DGEMMONCOPY = dgemm_ncopy_4_vfp.S +DGEMMOTCOPY = dgemm_tcopy_4_vfp.S +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o -SDOTKERNEL = sdot_vfp.S -DDOTKERNEL = ddot_vfp.S -CDOTKERNEL = cdot_vfp.S -ZDOTKERNEL = zdot_vfp.S +ifeq ($(ARM_ABI),hard) SNRM2KERNEL = nrm2_vfpv3.S DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S -SROTKERNEL = rot_vfp.S -DROTKERNEL = rot_vfp.S -CROTKERNEL = rot_vfp.S -ZROTKERNEL = rot_vfp.S - -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c - SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -CGEMVNKERNEL = cgemv_n_vfp.S -ZGEMVNKERNEL = zgemv_n_vfp.S - -SGEMVTKERNEL = gemv_t_vfp.S -DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = cgemv_t_vfp.S -ZGEMVTKERNEL = zgemv_t_vfp.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S @@ -88,47 +31,9 @@ CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S -SGEMMONCOPY = sgemm_ncopy_4_vfp.S -SGEMMOTCOPY = sgemm_tcopy_4_vfp.S -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S -DGEMMONCOPY = dgemm_ncopy_4_vfp.S -DGEMMOTCOPY = dgemm_tcopy_4_vfp.S -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S -CGEMMONCOPY = cgemm_ncopy_2_vfp.S -CGEMMOTCOPY = cgemm_tcopy_2_vfp.S -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ZGEMMONCOPY = zgemm_ncopy_2_vfp.S -ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - +endif From aa5edebc80200a590362dc2229e2751d399c04aa Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 13:12:05 +0530 Subject: [PATCH 04/16] arm: add softfp support in kernel/arm/asum_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/asum_vfp.S | 8 ++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index a2dd4806de..1da6e4e1fd 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -32,6 +32,11 @@ IDMAXKERNEL = iamax_vfp.S ISMINKERNEL = iamax_vfp.S IDMINKERNEL = iamax_vfp.S +SASUMKERNEL = asum_vfp.S +DASUMKERNEL = asum_vfp.S +CASUMKERNEL = asum_vfp.S +ZASUMKERNEL = asum_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -71,11 +76,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SASUMKERNEL = asum_vfp.S -DASUMKERNEL = asum_vfp.S -CASUMKERNEL = asum_vfp.S -ZASUMKERNEL = asum_vfp.S - SAXPYKERNEL = axpy_vfp.S DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S diff --git a/kernel/arm/asum_vfp.S b/kernel/arm/asum_vfp.S index fe6242a5b8..5b08e5028c 100644 --- a/kernel/arm/asum_vfp.S +++ b/kernel/arm/asum_vfp.S @@ -475,6 +475,14 @@ asum_kernel_L999: vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif +#endif + bx lr EPILOGUE From 4f0773f07d07b9adad103e66d7b3abae108d9d31 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 20:06:29 +0530 Subject: [PATCH 05/16] arm: add softfp support in kernel/arm/axpy_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++--- kernel/arm/axpy_vfp.S | 71 ++++++++++++++++++++++++++++++++++------- 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 1da6e4e1fd..63867d2b79 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -37,6 +37,11 @@ DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S +SAXPYKERNEL = axpy_vfp.S +DAXPYKERNEL = axpy_vfp.S +CAXPYKERNEL = axpy_vfp.S +ZAXPYKERNEL = axpy_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -76,11 +81,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SAXPYKERNEL = axpy_vfp.S -DAXPYKERNEL = axpy_vfp.S -CAXPYKERNEL = axpy_vfp.S -ZAXPYKERNEL = axpy_vfp.S - SDOTKERNEL = sdot_vfp.S DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S diff --git a/kernel/arm/axpy_vfp.S b/kernel/arm/axpy_vfp.S index 8e5334f62c..a407b04bd3 100644 --- a/kernel/arm/axpy_vfp.S +++ b/kernel/arm/axpy_vfp.S @@ -38,18 +38,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard -#define OLD_INC_X [fp, #0 ] -#define OLD_Y [fp, #4 ] -#define OLD_INC_Y [fp, #8 ] -#else +#if !defined(__ARM_PCS_VFP) + +#if !defined(COMPLEX) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 #define OLD_X [fp, #0 ] #define OLD_INC_X [fp, #4 ] #define OLD_Y [fp, #8 ] #define OLD_INC_Y [fp, #12 ] +#else +#define OLD_ALPHA [fp, #0] +#define OLD_X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define OLD_Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] #endif - + +#else //COMPLEX + +#if !defined(DOUBLE) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define OLD_Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#else +#define OLD_ALPHAR [fp, #0] +#define OLD_ALPHAI [fp, #8] +#define OLD_X [fp, #16 ] +#define OLD_INC_X [fp, #20 ] +#define OLD_Y [fp, #24 ] +#define OLD_INC_Y [fp, #28 ] +#endif + +#endif //!defined(COMPLEX) + +#else //__ARM_PCS_VFP + +#define OLD_INC_X [fp, #0 ] +#define OLD_Y [fp, #4 ] +#define OLD_INC_Y [fp, #8 ] + +#endif //!defined(__ARM_PCS_VFP) + #define N r0 #define Y r1 #define INC_X r2 @@ -370,13 +404,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #8 sub sp, sp, #STACKSIZE // reserve stack -#ifdef ARM_SOFTFP_ABI -#ifndef DOUBLE - vmov s0, r3 //move alpha to s0 +#if !defined(__ARM_PCS_VFP) +#if !defined(COMPLEX) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA ldr X, OLD_X +#else + vldr d0, OLD_ALPHA + ldr X, OLD_X +#endif +#else //COMPLEX +#if !defined(DOUBLE) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr X, OLD_X +#else + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr X, OLD_X +#endif #endif #endif - + ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y From 0150fabdb6250748bc45d18ccbb782331526c5cd Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 21:52:32 +0530 Subject: [PATCH 06/16] arm: add softfp support in kernel/arm/rot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/rot_vfp.S | 19 ++++++++++++++++++- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 63867d2b79..e9fe6bedd4 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -42,6 +42,11 @@ DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S +SROTKERNEL = rot_vfp.S +DROTKERNEL = rot_vfp.S +CROTKERNEL = rot_vfp.S +ZROTKERNEL = rot_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -91,11 +96,6 @@ DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S ZNRM2KERNEL = nrm2_vfp.S -SROTKERNEL = rot_vfp.S -DROTKERNEL = rot_vfp.S -CROTKERNEL = rot_vfp.S -ZROTKERNEL = rot_vfp.S - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S diff --git a/kernel/arm/rot_vfp.S b/kernel/arm/rot_vfp.S index d053423b66..6e679ecf97 100644 --- a/kernel/arm/rot_vfp.S +++ b/kernel/arm/rot_vfp.S @@ -40,6 +40,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OLD_INC_Y [fp, #0 ] +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) +#define OLD_C [fp, #4] +#define OLD_S [fp, #8] +#else +#define OLD_C [fp, #8] +#define OLD_S [fp, #16] +#endif +#endif #define N r0 #define X r1 @@ -462,7 +471,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #8 ldr INC_Y , OLD_INC_Y - +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vldr s0, OLD_C + vldr s1, OLD_S +#else + vldr d0, OLD_C + vldr d1, OLD_S +#endif +#endif cmp N, #0 ble rot_kernel_L999 From 54915ce343dba7e00dba21c73b9fd35bcded0de3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Fri, 30 Jun 2017 23:46:02 +0530 Subject: [PATCH 07/16] arm: add softfp support in kernel/arm/*dot_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/cdot_vfp.S | 32 ++++++++++++++++++++++++++------ kernel/arm/ddot_vfp.S | 3 +++ kernel/arm/sdot_vfp.S | 13 ++++++------- kernel/arm/zdot_vfp.S | 32 ++++++++++++++++++++++++++------ 5 files changed, 66 insertions(+), 24 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e9fe6bedd4..022547c9b5 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -47,6 +47,11 @@ DROTKERNEL = rot_vfp.S CROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S +SDOTKERNEL = sdot_vfp.S +DDOTKERNEL = ddot_vfp.S +CDOTKERNEL = cdot_vfp.S +ZDOTKERNEL = zdot_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -86,11 +91,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SDOTKERNEL = sdot_vfp.S -DDOTKERNEL = ddot_vfp.S -CDOTKERNEL = cdot_vfp.S -ZDOTKERNEL = zdot_vfp.S - SNRM2KERNEL = nrm2_vfp.S DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S diff --git a/kernel/arm/cdot_vfp.S b/kernel/arm/cdot_vfp.S index 0497b6d83e..e5a6e4d35a 100644 --- a/kernel/arm/cdot_vfp.S +++ b/kernel/arm/cdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -179,7 +188,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -191,8 +200,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vmov s2, s0 vmov s3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y +#endif cmp N, #0 ble cdot_kernel_L999 @@ -265,7 +283,6 @@ cdot_kernel_S10: cdot_kernel_L999: - sub r3, fp, #128 vldm r3, { s8 - s15} // restore floating point registers @@ -276,8 +293,11 @@ cdot_kernel_L999: vadd.f32 s0 , s0, s2 vsub.f32 s1 , s1, s3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {s0 - s1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/ddot_vfp.S b/kernel/arm/ddot_vfp.S index f28acbae3a..fb294d8b46 100644 --- a/kernel/arm/ddot_vfp.S +++ b/kernel/arm/ddot_vfp.S @@ -246,6 +246,9 @@ ddot_kernel_L999: vldm r3, { d8 - d15} // restore floating point registers vadd.f64 d0 , d0, d1 // set return value +#if !defined(__ARM_PCS_VFP) + vmov r0, r1, d0 +#endif sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/sdot_vfp.S b/kernel/arm/sdot_vfp.S index f3abdc197e..5f4f424bfc 100644 --- a/kernel/arm/sdot_vfp.S +++ b/kernel/arm/sdot_vfp.S @@ -329,20 +329,19 @@ sdot_kernel_L999: vldm r3, { s8 - s15} // restore floating point registers #if defined(DSDOT) - vadd.f64 d0 , d0, d1 // set return value - -#ifdef ARM_SOFTFP_ABI - vmov r0, r1, d0 +#else + vadd.f32 s0 , s0, s1 // set return value #endif +#if !defined(__ARM_PCS_VFP) +#if defined(DSDOT) + vmov r0, r1, d0 #else - - vadd.f32 s0 , s0, s1 // set return value -#ifdef ARM_SOFTFP_ABI vmov r0, s0 #endif #endif + sub sp, fp, #24 pop {r4 - r9, fp} bx lr diff --git a/kernel/arm/zdot_vfp.S b/kernel/arm/zdot_vfp.S index 936ce9f60f..43f2c0c0bf 100644 --- a/kernel/arm/zdot_vfp.S +++ b/kernel/arm/zdot_vfp.S @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r0 #define X r1 #define INC_X r2 -#define OLD_Y r3 - /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -50,7 +48,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * registers *******************************************************/ -#define OLD_INC_Y [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_RETURN_ADDR r0 +#define OLD_N r1 +#define OLD_X r2 +#define OLD_INC_X r3 +#define OLD_Y [fp, #0 ] +#define OLD_INC_Y [fp, #4 ] +#define RETURN_ADDR r8 +#else +#define OLD_Y r3 +#define OLD_INC_Y [fp, #0 ] +#endif #define I r5 #define Y r6 @@ -181,7 +190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .align 5 push {r4 - r9, fp} - add fp, sp, #24 + add fp, sp, #28 sub sp, sp, #STACKSIZE // reserve stack sub r4, fp, #128 @@ -194,9 +203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vcvt.f64.f32 d2, s0 vcvt.f64.f32 d3, s0 +#if !defined(__ARM_PCS_VFP) + mov RETURN_ADDR, OLD_RETURN_ADDR + mov N, OLD_N + mov X, OLD_X + mov INC_X, OLD_INC_X + ldr Y, OLD_Y + ldr INC_Y, OLD_INC_Y +#else mov Y, OLD_Y ldr INC_Y, OLD_INC_Y - +#endif cmp N, #0 ble zdot_kernel_L999 @@ -280,8 +297,11 @@ zdot_kernel_L999: vadd.f64 d0 , d0, d2 vsub.f64 d1 , d1, d3 #endif +#if !defined(__ARM_PCS_VFP) + vstm RETURN_ADDR, {d0 - d1} +#endif - sub sp, fp, #24 + sub sp, fp, #28 pop {r4 - r9, fp} bx lr From e25f4c01d60b76ab97252e475abfb8fa7e65c0f9 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sat, 1 Jul 2017 19:57:28 +0530 Subject: [PATCH 08/16] arm: add softfp support in kernel/arm/nrm2_vfp*.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/KERNEL.ARMV7 | 10 +++++----- kernel/arm/nrm2_vfp.S | 7 +++++++ kernel/arm/nrm2_vfpv3.S | 9 +++++++-- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 022547c9b5..be51e83b8b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -52,6 +52,11 @@ DDOTKERNEL = ddot_vfp.S CDOTKERNEL = cdot_vfp.S ZDOTKERNEL = zdot_vfp.S +SNRM2KERNEL = nrm2_vfp.S +DNRM2KERNEL = nrm2_vfp.S +CNRM2KERNEL = nrm2_vfp.S +ZNRM2KERNEL = nrm2_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -91,11 +96,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SNRM2KERNEL = nrm2_vfp.S -DNRM2KERNEL = nrm2_vfp.S -CNRM2KERNEL = nrm2_vfp.S -ZNRM2KERNEL = nrm2_vfp.S - SSWAPKERNEL = swap_vfp.S DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index d4829faa32..f4823b70ab 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -1,5 +1,10 @@ include $(KERNELDIR)/KERNEL.ARMV6 +SNRM2KERNEL = nrm2_vfpv3.S +DNRM2KERNEL = nrm2_vfpv3.S +CNRM2KERNEL = nrm2_vfpv3.S +ZNRM2KERNEL = nrm2_vfpv3.S + STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c @@ -17,11 +22,6 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SNRM2KERNEL = nrm2_vfpv3.S -DNRM2KERNEL = nrm2_vfpv3.S -CNRM2KERNEL = nrm2_vfpv3.S -ZNRM2KERNEL = nrm2_vfpv3.S - SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S diff --git a/kernel/arm/nrm2_vfp.S b/kernel/arm/nrm2_vfp.S index b3bd28152c..16ac5a6324 100644 --- a/kernel/arm/nrm2_vfp.S +++ b/kernel/arm/nrm2_vfp.S @@ -573,6 +573,13 @@ nrm2_kernel_L999: #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 +#endif +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov r0, s0 +#else + vmov r0, r1, d0 +#endif #endif bx lr diff --git a/kernel/arm/nrm2_vfpv3.S b/kernel/arm/nrm2_vfpv3.S index 7af9668953..84977901d7 100644 --- a/kernel/arm/nrm2_vfpv3.S +++ b/kernel/arm/nrm2_vfpv3.S @@ -503,8 +503,13 @@ nrm2_kernel_L999: #else vsqrt.f32 s1, s1 vmul.f32 s0, s0, s1 -#ifdef ARM_SOFTFP_ABI - vmov r0, s0 +#endif + +#if !defined(__ARM_PCS_VFP) +#if defined(DOUBLE) + vmov r0, r1, d0 +#else + vmov r0, s0 #endif #endif From 83bd547517e0a7df5b60ae1e6165c9d3528a07e4 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sat, 1 Jul 2017 20:37:40 +0530 Subject: [PATCH 09/16] arm: add softfp support in kernel/arm/swap_vfp.S --- kernel/arm/KERNEL.ARMV6 | 10 +++++----- kernel/arm/swap_vfp.S | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index be51e83b8b..86d3dabaa6 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -57,6 +57,11 @@ DNRM2KERNEL = nrm2_vfp.S CNRM2KERNEL = nrm2_vfp.S ZNRM2KERNEL = nrm2_vfp.S +SSWAPKERNEL = swap_vfp.S +DSWAPKERNEL = swap_vfp.S +CSWAPKERNEL = swap_vfp.S +ZSWAPKERNEL = swap_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -96,11 +101,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SSWAPKERNEL = swap_vfp.S -DSWAPKERNEL = swap_vfp.S -CSWAPKERNEL = swap_vfp.S -ZSWAPKERNEL = swap_vfp.S - SGEMVNKERNEL = gemv_n_vfp.S DGEMVNKERNEL = gemv_n_vfp.S CGEMVNKERNEL = cgemv_n_vfp.S diff --git a/kernel/arm/swap_vfp.S b/kernel/arm/swap_vfp.S index 3528751887..76661da797 100644 --- a/kernel/arm/swap_vfp.S +++ b/kernel/arm/swap_vfp.S @@ -38,9 +38,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 +#if !defined(__ARM_PCS_VFP) + +#if !defined(COMPLEX) + +#if !defined(DOUBLE) +#define OLD_X [fp, #0 ] +#define OLD_INC_X [fp, #4 ] +#define OLD_Y [fp, #8 ] +#define OLD_INC_Y [fp, #12 ] +#else +#define OLD_X [fp, #8 ] +#define OLD_INC_X [fp, #12] +#define OLD_Y [fp, #16] +#define OLD_INC_Y [fp, #20] +#endif + +#else //COMPLEX + +#if !defined(DOUBLE) +#define OLD_X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define OLD_Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#else +#define OLD_X [fp, #16] +#define OLD_INC_X [fp, #20] +#define OLD_Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#endif // !defined(__ARM_PCS_VFP) + +#else #define OLD_INC_X [fp, #0 ] #define OLD_Y [fp, #4 ] #define OLD_INC_Y [fp, #8 ] +#endif #define N r0 @@ -229,6 +263,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. push {r4 , fp} add fp, sp, #8 +#if !defined(__ARM_PCS_VFP) + ldr X, OLD_X +#endif ldr INC_X , OLD_INC_X ldr Y, OLD_Y ldr INC_Y , OLD_INC_Y From 8f83d3f961f57fb002d8c5359c32a8db50dcab5d Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 00:38:44 +0530 Subject: [PATCH 10/16] arm: add softfp support in vfp gemv kernels --- kernel/arm/KERNEL.ARMV6 | 20 +++++++------- kernel/arm/KERNEL.ARMV7 | 6 ++-- kernel/arm/cgemv_n_vfp.S | 28 +++++++++++++++---- kernel/arm/cgemv_t_vfp.S | 28 +++++++++++++++---- kernel/arm/gemv_n_vfp.S | 44 +++++++++++++++++++++++++---- kernel/arm/gemv_n_vfpv3.S | 58 +++++++++++++++++++++++---------------- kernel/arm/gemv_t_vfp.S | 54 ++++++++++++++++++++++-------------- kernel/arm/gemv_t_vfpv3.S | 44 +++++++++++++++++++++++++---- kernel/arm/zgemv_n_vfp.S | 28 +++++++++++++++---- kernel/arm/zgemv_t_vfp.S | 28 +++++++++++++++---- 10 files changed, 252 insertions(+), 86 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 86d3dabaa6..022a931832 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -62,6 +62,16 @@ DSWAPKERNEL = swap_vfp.S CSWAPKERNEL = swap_vfp.S ZSWAPKERNEL = swap_vfp.S +SGEMVNKERNEL = gemv_n_vfp.S +DGEMVNKERNEL = gemv_n_vfp.S +CGEMVNKERNEL = cgemv_n_vfp.S +ZGEMVNKERNEL = zgemv_n_vfp.S + +SGEMVTKERNEL = gemv_t_vfp.S +DGEMVTKERNEL = gemv_t_vfp.S +CGEMVTKERNEL = cgemv_t_vfp.S +ZGEMVTKERNEL = zgemv_t_vfp.S + SGEMMKERNEL = ../generic/gemmkernel_4x2.c ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -101,16 +111,6 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SGEMVNKERNEL = gemv_n_vfp.S -DGEMVNKERNEL = gemv_n_vfp.S -CGEMVNKERNEL = cgemv_n_vfp.S -ZGEMVNKERNEL = zgemv_n_vfp.S - -SGEMVTKERNEL = gemv_t_vfp.S -DGEMVTKERNEL = gemv_t_vfp.S -CGEMVTKERNEL = cgemv_t_vfp.S -ZGEMVTKERNEL = zgemv_t_vfp.S - STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index f4823b70ab..0872cb8cd8 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -5,6 +5,9 @@ DNRM2KERNEL = nrm2_vfpv3.S CNRM2KERNEL = nrm2_vfpv3.S ZNRM2KERNEL = nrm2_vfpv3.S +SGEMVNKERNEL = gemv_n_vfpv3.S +DGEMVNKERNEL = gemv_n_vfpv3.S + STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c @@ -22,9 +25,6 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -SGEMVNKERNEL = gemv_n_vfpv3.S -DGEMVNKERNEL = gemv_n_vfpv3.S - STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/cgemv_n_vfp.S b/kernel/arm/cgemv_n_vfp.S index 5d27486443..4a1cd2d458 100644 --- a/kernel/arm/cgemv_n_vfp.S +++ b/kernel/arm/cgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_A_SOFTFP [fp, #4 ] +#define OLD_LDA [fp, #8 ] +#define X [fp, #12 ] +#define OLD_INC_X [fp, #16 ] +#define Y [fp, #20 ] +#define OLD_INC_Y [fp, #24 ] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -462,6 +474,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble cgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr s0 , ALPHA_R diff --git a/kernel/arm/cgemv_t_vfp.S b/kernel/arm/cgemv_t_vfp.S index 76c8a8f189..e1c750c859 100644 --- a/kernel/arm/cgemv_t_vfp.S +++ b/kernel/arm/cgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR r3 +#define OLD_ALPHAI [fp, #0 ] +#define OLD_A_SOFTFP [fp, #4 ] +#define OLD_LDA [fp, #8 ] +#define X [fp, #12 ] +#define OLD_INC_X [fp, #16 ] +#define Y [fp, #20 ] +#define OLD_INC_Y [fp, #24 ] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -359,6 +371,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble cgemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vmov s0, OLD_ALPHAR + vldr s1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/gemv_n_vfp.S b/kernel/arm/gemv_n_vfp.S index 385370b7fe..7c154d7418 100644 --- a/kernel/arm/gemv_n_vfp.S +++ b/kernel/arm/gemv_n_vfp.S @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + #define OLD_A r3 #define OLD_M r0 @@ -508,6 +533,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble gemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA +#endif + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M diff --git a/kernel/arm/gemv_n_vfpv3.S b/kernel/arm/gemv_n_vfpv3.S index 93bf23e497..54f958b7be 100644 --- a/kernel/arm/gemv_n_vfpv3.S +++ b/kernel/arm/gemv_n_vfpv3.S @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] -#define OLD_A r3 -#else -#define OLD_A_SOFTFP [fp, #0 ] -#define OLD_LDA [fp, #4 ] -#define X [fp, #8 ] -#define OLD_INC_X [fp, #12 ] -#define Y [fp, #16 ] -#define OLD_INC_Y [fp, #20 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) #define OLD_ALPHA r3 -#define OLD_A r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + #endif +#define OLD_A r3 #define OLD_M r0 #define AO1 r0 @@ -565,18 +577,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble gemvn_kernel_L999 -#ifndef DOUBLE -#ifdef ARM_SOFTFP_ABI - - vmov s0, OLD_ALPHA - ldr OLD_A, OLD_A_SOFTFP +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA #endif + ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A str OLD_M, M - - + ldr INC_X , OLD_INC_X ldr INC_Y , OLD_INC_Y diff --git a/kernel/arm/gemv_t_vfp.S b/kernel/arm/gemv_t_vfp.S index 816be54ff8..9559d18296 100644 --- a/kernel/arm/gemv_t_vfp.S +++ b/kernel/arm/gemv_t_vfp.S @@ -38,25 +38,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#ifndef ARM_SOFTFP_ABI -//hard abi -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] -#define OLD_A r3 -#else -#define OLD_A_SOFTFP [fp, #0 ] -#define OLD_LDA [fp, #4 ] -#define X [fp, #8 ] -#define OLD_INC_X [fp, #12 ] -#define Y [fp, #16 ] -#define OLD_INC_Y [fp, #20 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) #define OLD_ALPHA r3 -#define OLD_A r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] #endif +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + +#define OLD_A r3 #define OLD_N r1 #define M r0 @@ -518,11 +530,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble gemvt_kernel_L999 -#ifndef DOUBLE -#ifdef ARM_SOFTFP_ABI - vmov s0, OLD_ALPHA - ldr OLD_A, OLD_A_SOFTFP +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA #endif + ldr OLD_A, OLD_A_SOFTFP #endif str OLD_A, A diff --git a/kernel/arm/gemv_t_vfpv3.S b/kernel/arm/gemv_t_vfpv3.S index 7ae5799bc8..b1d3dadf16 100644 --- a/kernel/arm/gemv_t_vfpv3.S +++ b/kernel/arm/gemv_t_vfpv3.S @@ -38,11 +38,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) + +#if !defined(DOUBLE) +#define OLD_ALPHA r3 +#define OLD_A_SOFTFP [fp, #0 ] +#define OLD_LDA [fp, #4 ] +#define X [fp, #8 ] +#define OLD_INC_X [fp, #12 ] +#define Y [fp, #16 ] +#define OLD_INC_Y [fp, #20 ] +#else +#define OLD_ALPHA [fp, #0 ] +#define OLD_A_SOFTFP [fp, #8 ] +#define OLD_LDA [fp, #12] +#define X [fp, #16] +#define OLD_INC_X [fp, #20] +#define Y [fp, #24] +#define OLD_INC_Y [fp, #28] +#endif + +#else + +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] + +#endif + #define OLD_A r3 #define OLD_N r1 @@ -476,6 +501,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble gemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) +#if !defined(DOUBLE) + vmov s0, OLD_ALPHA +#else + vldr d0, OLD_ALPHA +#endif + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N diff --git a/kernel/arm/zgemv_n_vfp.S b/kernel/arm/zgemv_n_vfp.S index da9a91043e..7d55678491 100644 --- a/kernel/arm/zgemv_n_vfp.S +++ b/kernel/arm/zgemv_n_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_M r0 @@ -465,6 +477,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp N, #0 ble zgemvn_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_M, M vstr d0 , ALPHA_R diff --git a/kernel/arm/zgemv_t_vfp.S b/kernel/arm/zgemv_t_vfp.S index 211fa07011..407026166a 100644 --- a/kernel/arm/zgemv_t_vfp.S +++ b/kernel/arm/zgemv_t_vfp.S @@ -38,11 +38,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STACKSIZE 256 -#define OLD_LDA [fp, #0 ] -#define X [fp, #4 ] -#define OLD_INC_X [fp, #8 ] -#define Y [fp, #12 ] -#define OLD_INC_Y [fp, #16 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR [fp, #0 ] +#define OLD_ALPHAI [fp, #8 ] +#define OLD_A_SOFTFP [fp, #16] +#define OLD_LDA [fp, #20] +#define X [fp, #24] +#define OLD_INC_X [fp, #28] +#define Y [fp, #32] +#define OLD_INC_Y [fp, #36] +#else +#define OLD_LDA [fp, #0 ] +#define X [fp, #4 ] +#define OLD_INC_X [fp, #8 ] +#define Y [fp, #12 ] +#define OLD_INC_Y [fp, #16 ] +#endif + #define OLD_A r3 #define OLD_N r1 @@ -360,6 +372,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp OLD_N, #0 ble zgemvt_kernel_L999 +#if !defined(__ARM_PCS_VFP) + vldr d0, OLD_ALPHAR + vldr d1, OLD_ALPHAI + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_A, A str OLD_N, N From eda9e8632ab7d94d609006612a4b760214dfa847 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:00:48 +0530 Subject: [PATCH 11/16] generic: Bug fixes in generic 4x2 and 4x4 gemm kernels --- kernel/generic/gemmkernel_4x2.c | 30 ++++----- kernel/generic/gemmkernel_4x4.c | 104 ++++++++++++++++---------------- 2 files changed, 67 insertions(+), 67 deletions(-) diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c index 1d15de1d7b..8c784e2f12 100644 --- a/kernel/generic/gemmkernel_4x2.c +++ b/kernel/generic/gemmkernel_4x2.c @@ -154,11 +154,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; res1_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; - C1[0] = res1_0; - C1[1] = res1_1; + C1[0] += res1_0; + C1[1] += res1_1; C0 = C0+2; C1 = C1+2; @@ -190,12 +190,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; - C1[0] = res1_0; + C1[0] += res1_0; - C0 = C0+1; - C1 = C1+1; + C0 += C0+1; + C1 += C1+1; } @@ -245,10 +245,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_2 *= alpha; res0_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; C0 = C0+4; @@ -278,8 +278,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_0 *= alpha; res0_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; C0 = C0+2; @@ -306,7 +306,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL C0[0] = res0_0; - C0 = C0+1; + C0 += C0+1; } k = (bk<<0); diff --git a/kernel/generic/gemmkernel_4x4.c b/kernel/generic/gemmkernel_4x4.c index bd67b3fc8a..99bd9c1ef8 100644 --- a/kernel/generic/gemmkernel_4x4.c +++ b/kernel/generic/gemmkernel_4x4.c @@ -152,25 +152,25 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res3_2 *= alpha; res3_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; - - C1[0] = res1_0; - C1[1] = res1_1; - C1[2] = res1_2; - C1[3] = res1_3; - - C2[0] = res2_0; - C2[1] = res2_1; - C2[2] = res2_2; - C2[3] = res2_3; - - C3[0] = res3_0; - C3[1] = res3_1; - C3[2] = res3_2; - C3[3] = res3_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; + + C1[0] += res1_0; + C1[1] += res1_1; + C1[2] += res1_2; + C1[3] += res1_3; + + C2[0] += res2_0; + C2[1] += res2_1; + C2[2] += res2_2; + C2[3] += res2_3; + + C3[0] += res3_0; + C3[1] += res3_1; + C3[2] += res3_2; + C3[3] += res3_3; C0 = C0+4; C1 = C1+4; @@ -230,17 +230,17 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res3_0 *= alpha; res3_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; - C1[0] = res1_0; - C1[1] = res1_1; + C1[0] += res1_0; + C1[1] += res1_1; - C2[0] = res2_0; - C2[1] = res2_1; + C2[0] += res2_0; + C2[1] += res2_1; - C3[0] = res3_0; - C3[1] = res3_1; + C3[0] += res3_0; + C3[1] += res3_1; C0 = C0+2; C1 = C1+2; @@ -283,13 +283,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res3_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; - C1[0] = res1_0; + C1[0] += res1_0; - C2[0] = res2_0; + C2[0] += res2_0; - C3[0] = res3_0; + C3[0] += res3_0; C0 = C0+1; C1 = C1+1; @@ -360,15 +360,15 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_2 *= alpha; res1_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; - C1[0] = res1_0; - C1[1] = res1_1; - C1[2] = res1_2; - C1[3] = res1_3; + C1[0] += res1_0; + C1[1] += res1_1; + C1[2] += res1_2; + C1[3] += res1_3; C0 = C0+4; C1 = C1+4; @@ -408,11 +408,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; res1_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; - C1[0] = res1_0; - C1[1] = res1_1; + C1[0] += res1_0; + C1[1] += res1_1; C0 = C0+2; C1 = C1+2; @@ -444,9 +444,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res1_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; - C1[0] = res1_0; + C1[0] += res1_0; C0 = C0+1; C1 = C1+1; @@ -499,10 +499,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_2 *= alpha; res0_3 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; - C0[2] = res0_2; - C0[3] = res0_3; + C0[0] += res0_0; + C0[1] += res0_1; + C0[2] += res0_2; + C0[3] += res0_3; C0 = C0+4; @@ -532,8 +532,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_0 *= alpha; res0_1 *= alpha; - C0[0] = res0_0; - C0[1] = res0_1; + C0[0] += res0_0; + C0[1] += res0_1; C0 = C0+2; @@ -558,7 +558,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL res0_0 *= alpha; - C0[0] = res0_0; + C0[0] += res0_0; C0 = C0+1; From 872a11a2bfd90225d5ace725b0ec4f59bd9291f3 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:05:48 +0530 Subject: [PATCH 12/16] arm: add softfp support in sgemm/strmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 4 ++-- kernel/arm/KERNEL.ARMV7 | 3 +-- kernel/arm/sgemm_kernel_4x2_vfp.S | 12 ++++++++++++ kernel/arm/sgemm_kernel_4x4_vfpv3.S | 29 +++++++++++------------------ kernel/arm/strmm_kernel_4x2_vfp.S | 13 +++++++++++++ kernel/arm/strmm_kernel_4x4_vfpv3.S | 13 +++++++++++++ 6 files changed, 52 insertions(+), 22 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 022a931832..18d9869de6 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -73,6 +73,7 @@ CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S SGEMMKERNEL = ../generic/gemmkernel_4x2.c +SGEMMKERNEL = sgemm_kernel_4x2_vfp.S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S SGEMMITCOPY = sgemm_tcopy_4_vfp.S @@ -97,6 +98,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o STRMMKERNEL = ../generic/trmmkernel_4x2.c +STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = ../generic/trmmkernel_4x2.c CGEMMONCOPY = cgemm_ncopy_2_vfp.S @@ -111,12 +113,10 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -SGEMMKERNEL = sgemm_kernel_4x2_vfp.S DGEMMKERNEL = dgemm_kernel_4x2_vfp.S CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 0872cb8cd8..e2044133db 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -11,7 +11,7 @@ DGEMVNKERNEL = gemv_n_vfpv3.S STRMMKERNEL = ../generic/trmmkernel_4x4.c DTRMMKERNEL = ../generic/trmmkernel_4x4.c -SGEMMKERNEL = ../generic/gemmkernel_4x4.c +SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMONCOPYOBJ = sgemm_oncopy.o @@ -30,7 +30,6 @@ DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/sgemm_kernel_4x2_vfp.S b/kernel/arm/sgemm_kernel_4x2_vfp.S index e8b44b742d..1f21e5a1f8 100644 --- a/kernel/arm/sgemm_kernel_4x2_vfp.S +++ b/kernel/arm/sgemm_kernel_4x2_vfp.S @@ -62,9 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -416,6 +424,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/sgemm_kernel_4x4_vfpv3.S b/kernel/arm/sgemm_kernel_4x4_vfpv3.S index 86198ac906..6491d35718 100644 --- a/kernel/arm/sgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/sgemm_kernel_4x4_vfpv3.S @@ -58,14 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OLD_M r0 #define OLD_N r1 #define OLD_K r2 - -#ifdef ARM_SOFTFP_ABI -#define OLD_ALPHA r3 -//#define OLD_A -#else //hard #define OLD_A r3 #define OLD_ALPHA s0 -#endif /****************************************************** * [fp, #-128] - [fp, #-64] is reserved @@ -77,10 +71,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define M [fp, #-256 ] #define N [fp, #-260 ] #define K [fp, #-264 ] - -#ifndef ARM_SOFTFP_ABI #define A [fp, #-268 ] -#endif #define FP_ZERO [fp, #-240] #define FP_ZERO_0 [fp, #-240] @@ -88,17 +79,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] -#ifdef ARM_SOFTFP_ABI -#define A [fp, #4 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] #define B [fp, #8 ] #define C [fp, #12 ] #define OLD_LDC [fp, #16 ] -#else //hard +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #endif - + #define I r0 #define J r1 #define L r2 @@ -867,16 +859,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif + str OLD_M, M str OLD_N, N str OLD_K, K - -#ifdef ARM_SOFTFP_ABI - str OLD_ALPHA, ALPHA -#else //hard str OLD_A, A vstr OLD_ALPHA, ALPHA -#endif + sub r3, fp, #128 vstm r3, { s8 - s31} // store floating point registers diff --git a/kernel/arm/strmm_kernel_4x2_vfp.S b/kernel/arm/strmm_kernel_4x2_vfp.S index 8f97644eca..635b1dd13d 100644 --- a/kernel/arm/strmm_kernel_4x2_vfp.S +++ b/kernel/arm/strmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define OLD_C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -395,6 +404,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/strmm_kernel_4x4_vfpv3.S b/kernel/arm/strmm_kernel_4x4_vfpv3.S index 0dd03ac850..e24d24ebad 100644 --- a/kernel/arm/strmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/strmm_kernel_4x4_vfpv3.S @@ -64,10 +64,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP r3 +#define OLD_A_SOFTFP [fp, #4 ] +#define B [fp, #8 ] +#define C [fp, #12 ] +#define OLD_LDC [fp, #16 ] +#define OFFSET [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -782,6 +791,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 09bc6ebe5b26aecd405a25dad2fa2934642fc827 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:24:38 +0530 Subject: [PATCH 13/16] arm: add softfp support in dgemm/dtrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 8 ++------ kernel/arm/KERNEL.ARMV7 | 10 +++------- kernel/arm/dgemm_kernel_4x2_vfp.S | 13 ++++++++++++- kernel/arm/dgemm_kernel_4x4_vfpv3.S | 12 ++++++++++++ kernel/arm/dtrmm_kernel_4x2_vfp.S | 13 +++++++++++++ kernel/arm/dtrmm_kernel_4x4_vfpv3.S | 13 +++++++++++++ 6 files changed, 55 insertions(+), 14 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 18d9869de6..622085b457 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -72,7 +72,6 @@ DGEMVTKERNEL = gemv_t_vfp.S CGEMVTKERNEL = cgemv_t_vfp.S ZGEMVTKERNEL = zgemv_t_vfp.S -SGEMMKERNEL = ../generic/gemmkernel_4x2.c SGEMMKERNEL = sgemm_kernel_4x2_vfp.S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) SGEMMINCOPY = sgemm_ncopy_4_vfp.S @@ -85,7 +84,7 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_4x2.c +DGEMMKERNEL = dgemm_kernel_4x2_vfp.S ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) DGEMMINCOPY = dgemm_ncopy_4_vfp.S DGEMMITCOPY = dgemm_tcopy_4_vfp.S @@ -97,9 +96,8 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -STRMMKERNEL = ../generic/trmmkernel_4x2.c STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = ../generic/trmmkernel_4x2.c +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S @@ -113,11 +111,9 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -DGEMMKERNEL = dgemm_kernel_4x2_vfp.S CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index e2044133db..63c468e66f 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -8,8 +8,8 @@ ZNRM2KERNEL = nrm2_vfpv3.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -STRMMKERNEL = ../generic/trmmkernel_4x4.c -DTRMMKERNEL = ../generic/trmmkernel_4x4.c +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S @@ -17,7 +17,7 @@ SGEMMOTCOPY = sgemm_tcopy_4_vfp.S SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_4x4.c +DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S DGEMMONCOPY = dgemm_ncopy_4_vfp.S DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMONCOPYOBJ = dgemm_oncopy.o @@ -25,13 +25,9 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o ifeq ($(ARM_ABI),hard) -STRMMKERNEL = strmm_kernel_4x4_vfpv3.S -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -DGEMMKERNEL = dgemm_kernel_4x4_vfpv3.S - CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S diff --git a/kernel/arm/dgemm_kernel_4x2_vfp.S b/kernel/arm/dgemm_kernel_4x2_vfp.S index 183269d1be..001a6050c7 100644 --- a/kernel/arm/dgemm_kernel_4x2_vfp.S +++ b/kernel/arm/dgemm_kernel_4x2_vfp.S @@ -62,10 +62,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] - +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -429,6 +436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dgemm_kernel_4x4_vfpv3.S b/kernel/arm/dgemm_kernel_4x4_vfpv3.S index b14052e068..1744b54d8a 100644 --- a/kernel/arm/dgemm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dgemm_kernel_4x4_vfpv3.S @@ -79,9 +79,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -878,6 +886,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dtrmm_kernel_4x2_vfp.S b/kernel/arm/dtrmm_kernel_4x2_vfp.S index c578d2b1e6..3d6fbf8e9d 100644 --- a/kernel/arm/dtrmm_kernel_4x2_vfp.S +++ b/kernel/arm/dtrmm_kernel_4x2_vfp.S @@ -65,10 +65,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define OLD_C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#define OFFSET [fp, #28 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -404,6 +413,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S index c7e455f160..c0c6a16777 100644 --- a/kernel/arm/dtrmm_kernel_4x4_vfpv3.S +++ b/kernel/arm/dtrmm_kernel_4x4_vfpv3.S @@ -66,10 +66,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA [fp, #-276 ] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHA_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #12 ] +#define B [fp, #16 ] +#define OLD_C [fp, #20 ] +#define OLD_LDC [fp, #24 ] +#define OFFSET [fp, #28 ] +#else #define B [fp, #4 ] #define OLD_C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -846,6 +855,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA, OLD_ALPHA_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 305cd2e8b41f4daccdfa1e6631bce7f7133faf92 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:42:32 +0530 Subject: [PATCH 14/16] arm: add softfp support in cgemm/ctrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 4 ++-- kernel/arm/KERNEL.ARMV7 | 5 +++-- kernel/arm/cgemm_kernel_2x2_vfp.S | 14 ++++++++++++++ kernel/arm/cgemm_kernel_2x2_vfpv3.S | 14 ++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfp.S | 15 +++++++++++++++ kernel/arm/ctrmm_kernel_2x2_vfpv3.S | 15 +++++++++++++++ 6 files changed, 63 insertions(+), 4 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 622085b457..e8fc3df73b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -98,7 +98,9 @@ DGEMMOTCOPYOBJ = dgemm_otcopy.o STRMMKERNEL = strmm_kernel_4x2_vfp.S DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S +CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o @@ -111,10 +113,8 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy.o ifeq ($(ARM_ABI),hard) -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -CGEMMKERNEL = cgemm_kernel_2x2_vfp.S ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 63c468e66f..4bfe18d1d9 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -10,6 +10,7 @@ DGEMVNKERNEL = gemv_n_vfpv3.S STRMMKERNEL = strmm_kernel_4x4_vfpv3.S DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S @@ -23,12 +24,12 @@ DGEMMOTCOPY = dgemm_tcopy_4_vfp.S DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o +CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S + ifeq ($(ARM_ABI),hard) -CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S endif diff --git a/kernel/arm/cgemm_kernel_2x2_vfp.S b/kernel/arm/cgemm_kernel_2x2_vfp.S index f0517cb47e..512eea387d 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfp.S +++ b/kernel/arm/cgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -816,6 +825,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/cgemm_kernel_2x2_vfpv3.S b/kernel/arm/cgemm_kernel_2x2_vfpv3.S index cf132a1849..42eb53a55e 100644 --- a/kernel/arm/cgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/cgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -873,6 +882,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ctrmm_kernel_2x2_vfp.S b/kernel/arm/ctrmm_kernel_2x2_vfp.S index 8cb7ede9da..95578b10a4 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfp.S @@ -67,10 +67,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#define OFFSET [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -826,6 +836,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S index 97bd88c69c..18beb4e476 100644 --- a/kernel/arm/ctrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ctrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP r3 +#define OLD_ALPHAI_SOFTFP [fp, #4] +#define OLD_A_SOFTFP [fp, #8 ] +#define B [fp, #12 ] +#define C [fp, #16 ] +#define OLD_LDC [fp, #20 ] +#define OFFSET [fp, #24 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -846,6 +856,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vmov OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 97d671eb610de8cd73fa90923bfbed87d1d8ffef Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 02:54:32 +0530 Subject: [PATCH 15/16] arm: add softfp support in zgemm/ztrmm vfp kernels --- kernel/arm/KERNEL.ARMV6 | 13 ++++--------- kernel/arm/KERNEL.ARMV7 | 13 ++++--------- kernel/arm/zgemm_kernel_2x2_vfp.S | 14 ++++++++++++++ kernel/arm/zgemm_kernel_2x2_vfpv3.S | 14 ++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfp.S | 15 +++++++++++++++ kernel/arm/ztrmm_kernel_2x2_vfpv3.S | 15 +++++++++++++++ 6 files changed, 66 insertions(+), 18 deletions(-) diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index e8fc3df73b..960dae67b0 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -96,25 +96,20 @@ DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -STRMMKERNEL = strmm_kernel_4x2_vfp.S -DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S - CGEMMKERNEL = cgemm_kernel_2x2_vfp.S CGEMMONCOPY = cgemm_ncopy_2_vfp.S CGEMMOTCOPY = cgemm_tcopy_2_vfp.S CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o +ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S ZGEMMONCOPY = zgemm_ncopy_2_vfp.S ZGEMMOTCOPY = zgemm_tcopy_2_vfp.S ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o -ifeq ($(ARM_ABI),hard) - +STRMMKERNEL = strmm_kernel_4x2_vfp.S +DTRMMKERNEL = dtrmm_kernel_4x2_vfp.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfp.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfp.S -ZGEMMKERNEL = zgemm_kernel_2x2_vfp.S - -endif diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7 index 4bfe18d1d9..5e0b4cfb81 100644 --- a/kernel/arm/KERNEL.ARMV7 +++ b/kernel/arm/KERNEL.ARMV7 @@ -8,10 +8,6 @@ ZNRM2KERNEL = nrm2_vfpv3.S SGEMVNKERNEL = gemv_n_vfpv3.S DGEMVNKERNEL = gemv_n_vfpv3.S -STRMMKERNEL = strmm_kernel_4x4_vfpv3.S -DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S -CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S - SGEMMKERNEL = sgemm_kernel_4x4_vfpv3.S SGEMMONCOPY = sgemm_ncopy_4_vfp.S SGEMMOTCOPY = sgemm_tcopy_4_vfp.S @@ -25,11 +21,10 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_2x2_vfpv3.S +ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S -ifeq ($(ARM_ABI),hard) - +STRMMKERNEL = strmm_kernel_4x4_vfpv3.S +DTRMMKERNEL = dtrmm_kernel_4x4_vfpv3.S +CTRMMKERNEL = ctrmm_kernel_2x2_vfpv3.S ZTRMMKERNEL = ztrmm_kernel_2x2_vfpv3.S -ZGEMMKERNEL = zgemm_kernel_2x2_vfpv3.S - -endif diff --git a/kernel/arm/zgemm_kernel_2x2_vfp.S b/kernel/arm/zgemm_kernel_2x2_vfp.S index 46507c4d21..618f09781d 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfp.S +++ b/kernel/arm/zgemm_kernel_2x2_vfp.S @@ -64,9 +64,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -863,6 +872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/zgemm_kernel_2x2_vfpv3.S b/kernel/arm/zgemm_kernel_2x2_vfpv3.S index 5a99f792ff..0fe0c19932 100644 --- a/kernel/arm/zgemm_kernel_2x2_vfpv3.S +++ b/kernel/arm/zgemm_kernel_2x2_vfpv3.S @@ -80,9 +80,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] +#endif #define I r0 #define J r1 @@ -909,6 +918,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ztrmm_kernel_2x2_vfp.S b/kernel/arm/ztrmm_kernel_2x2_vfp.S index dc80b17b87..78d09a9c7e 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfp.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfp.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -882,6 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K diff --git a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S index 5a808ccbc1..bf72ce605c 100644 --- a/kernel/arm/ztrmm_kernel_2x2_vfpv3.S +++ b/kernel/arm/ztrmm_kernel_2x2_vfpv3.S @@ -66,10 +66,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ALPHA_I [fp, #-272] #define ALPHA_R [fp, #-280] +#if !defined(__ARM_PCS_VFP) +#define OLD_ALPHAR_SOFTFP [fp, #4] +#define OLD_ALPHAI_SOFTFP [fp, #12] +#define OLD_A_SOFTFP [fp, #20 ] +#define B [fp, #24 ] +#define C [fp, #28 ] +#define OLD_LDC [fp, #32 ] +#define OFFSET [fp, #36 ] +#else #define B [fp, #4 ] #define C [fp, #8 ] #define OLD_LDC [fp, #12 ] #define OFFSET [fp, #16 ] +#endif #define I r0 #define J r1 @@ -883,6 +893,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add fp, sp, #24 sub sp, sp, #STACKSIZE // reserve stack +#if !defined(__ARM_PCS_VFP) + vldr OLD_ALPHA_R, OLD_ALPHAR_SOFTFP + vldr OLD_ALPHA_I, OLD_ALPHAI_SOFTFP + ldr OLD_A, OLD_A_SOFTFP +#endif str OLD_M, M str OLD_N, N str OLD_K, K From 37efb5bc1d9b78e5e612b5aad896981d58a5d18f Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Sun, 2 Jul 2017 03:06:36 +0530 Subject: [PATCH 16/16] arm: Remove unnecessary files/code Since softfp code has been added to all required vfp kernels, the code for auto detection of abi is no longer required. The option to force softfp ABI on make command line by giving ARM_SOFTFP_ABI=1 is retained. But there is no need to give this option anymore. Also the newly added C versions of 4x4/4x2 gemm/trmm kernels are removed. These are longer required. Moreover these kernels has bugs. --- Makefile.system | 19 +- c_check | 16 +- kernel/generic/gemmkernel_4x2.c | 317 ------------------ kernel/generic/gemmkernel_4x4.c | 571 -------------------------------- kernel/generic/trmmkernel_4x2.c | 528 ----------------------------- 5 files changed, 8 insertions(+), 1443 deletions(-) delete mode 100644 kernel/generic/gemmkernel_4x2.c delete mode 100644 kernel/generic/gemmkernel_4x4.c delete mode 100644 kernel/generic/trmmkernel_4x2.c diff --git a/Makefile.system b/Makefile.system index 2cae5f1c9d..4face0e51b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -487,19 +487,14 @@ ifeq ($(ARCH), arm) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 -# If ABI is specified on command line use it. Else use the automatically detected ABI. -ifeq ($(ARM_SOFTFP_ABI),1) -ARM_ABI = softfp -else -ifeq ($(ARM_HARD_ABI),1) -ARM_ABI = hard -else -ARM_ABI=$(ARM_ABI_AUTO) -endif +CCOMMON_OPT += -marm +FCOMMON_OPT += -marm + +# If softfp abi is mentioned on the command line, force it. +ifeq ($(ARM_SOFTFP_ABI), 1) +CCOMMON_OPT += -mfloat-abi=softfp +FCOMMON_OPT += -mfloat-abi=softfp endif -export ARM_ABI_AUTO -CCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) -FCOMMON_OPT += -marm -mfloat-abi=$(ARM_ABI) endif diff --git a/c_check b/c_check index 2e7e08cfba..20da288bec 100644 --- a/c_check +++ b/c_check @@ -94,17 +94,7 @@ if ($architecture eq "mips64") { $defined = 1; } -if ($architecture eq "arm") { - $defined = 1; - $data = `$compiler_name -dM -E ctest2.c | grep -w __ARM_PCS_VFP`; - if ($data ne "") { - $abi = "hard"; - } else { - $abi = "softfp"; - } -} - -if ($architecture eq "arm64") { +if (($architecture eq "arm") || ($architecture eq "arm64")) { $defined = 1; } @@ -297,10 +287,6 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; -if ($architecture eq "arm") { - print MAKEFILE "ARM_ABI_AUTO=$abi\n"; -} - $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; $compiler =~ tr/[a-z]/[A-Z]/; diff --git a/kernel/generic/gemmkernel_4x2.c b/kernel/generic/gemmkernel_4x2.c deleted file mode 100644 index 8c784e2f12..0000000000 --- a/kernel/generic/gemmkernel_4x2.c +++ /dev/null @@ -1,317 +0,0 @@ -/*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - - for (j=0; j<(bn/2); j+=2) - { - C0 = C; - C1 = C0+ldc; - - ptrba = ba; - - for (i=0; i - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT res2_0; - FLOAT res2_1; - FLOAT res2_2; - FLOAT res2_3; - - FLOAT res3_0; - FLOAT res3_1; - FLOAT res3_2; - FLOAT res3_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - FLOAT b2; - FLOAT b3; - - - for (j=0; j - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - - FLOAT res0_0; - FLOAT res0_1; - FLOAT res0_2; - FLOAT res0_3; - - FLOAT res1_0; - FLOAT res1_1; - FLOAT res1_2; - FLOAT res1_3; - - FLOAT a0; - FLOAT a1; - - FLOAT b0; - FLOAT b1; - - BLASLONG off, temp; - - bool left; - bool transposed; - bool backwards; - -#ifdef LEFT - left = true; -#else - left = false; -#endif - -#ifdef TRANSA - transposed = true; -#else - transposed = false; -#endif - - backwards = left != transposed; - - if (!left) { - off = -offset; - } - - for (j=0; j<(bn/2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - - ptrba = ba; - - for (i=0; i