Skip to content
Browse files

slightly speed up nmod_mat arithmetic with a full limb modulus by add…

…ing an add_sssaaaaaa macro to longlong.h (warning: not tested on 32-bit)
  • Loading branch information...
1 parent 4bdf90e commit 9824bad714eda764a362e3027da8ba83dbc7ab38 @fredrik-johansson fredrik-johansson committed Apr 8, 2011
Showing with 108 additions and 15 deletions.
  1. +6 −0 doc/longlong.txt
  2. +23 −0 longlong.h
  3. +3 −8 nmod_mat/mul_3.c
  4. +2 −7 nmod_mat/solve_lu_precomp.c
  5. +74 −0 test/t-add_sssaaaaaa.c
View
6 doc/longlong.txt
@@ -23,6 +23,7 @@
Copyright (C) 1991, 1992, 1993, 1994, 1996, 1997, 1999, 2000, 2001,
2002, 2003, 2004, 2005 Free Software Foundation, Inc.
Copyright (C) 2009 William Hart
+ Copyright (C) 2011 Fredrik Johansson
******************************************************************************/
@@ -77,6 +78,11 @@ add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
\code{LOW_SUM}. Overflow, i.e.\ carry out, is not stored anywhere,
and is lost.
+add_sssaaaaaa(high_sum, mid_sum, low_sum, high_addend_1, mid_addend_1,
+ low_addend_1, high_addend_2, mid_addend_2, low_addend_2)
+
+ Adds two three limb integers. Carry out is lost.
+
sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
high_subtrahend, low_subtrahend)
View
23 longlong.h
@@ -3,6 +3,7 @@
2004, 2005 Free Software Foundation, Inc.
Copyright 2009 William Hart
+ Copyright 2011 Fredrik Johansson
This file is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
@@ -30,6 +31,13 @@
/* x86 : 64 bit */
#if (__GMP_BITS_PER_MP_LIMB == 64 && defined (__amd64__))
+#define add_sssaaaaaa(sh, sm, sl, ah, am, al, bh, bm, bl) \
+ __asm__ ("addq %8,%q2\n\tadcq %6,%q1\n\tadcq %4,%q0" \
+ : "=r" (sh), "=r" (sm), "=&r" (sl) \
+ : "0" ((mp_limb_t)(ah)), "rme" ((mp_limb_t)(bh)), \
+ "1" ((mp_limb_t)(am)), "rme" ((mp_limb_t)(bm)), \
+ "2" ((mp_limb_t)(al)), "rme" ((mp_limb_t)(bl))) \
+
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
__asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
: "=r" (sh), "=&r" (sl) \
@@ -79,6 +87,13 @@
/* x86 : 32 bit */
#if (__GMP_BITS_PER_MP_LIMB == 32 && (defined (__i386__) || defined (__i486__) || defined(__amd64__)))
+#define add_sssaaaaaa(sh, sm, sl, ah, am, al, bh, bm, bl) \
+ __asm__ ("addl %8,%k2\n\tadcl %6,%k1\n\tadcl %4,%k0" \
+ : "=r" (sh), "=r" (sm), "=&r" (sl) \
+ : "0" ((mp_limb_t)(ah)), "g" ((mp_limb_t)(bh)), \
+ "1" ((mp_limb_t)(am)), "g" ((mp_limb_t)(bm)), \
+ "2" ((mp_limb_t)(al)), "g" ((mp_limb_t)(bl))) \
+
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
__asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
: "=r" (sh), "=&r" (sl) \
@@ -141,6 +156,14 @@
(sl) = __x; \
} while (0)
+#define add_sssaaaaaa(sh, sm, sl, ah, am, al, bh, bm, bl) \
+ do { \
+ mp_limb_t __x, __y; \
+ add_ssaaaa(__x, sl, (mp_limb_t) 0, al, (mp_limb_t) 0, bl); \
+ add_ssaaaa(__y, sm, (mp_limb_t) 0, am, (mp_limb_t) 0, bm); \
+ add_ssaaaa(sh, sm, sh, sm, __y, __x); \
+ } while (0)
+
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
do { \
mp_limb_t __x; \
View
11 nmod_mat/mul_3.c
@@ -28,6 +28,7 @@
#include "flint.h"
#include "nmod_mat.h"
#include "nmod_vec.h"
+#include "longlong.h"
void
@@ -37,7 +38,6 @@ _nmod_mat_mul_3(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B)
register mp_limb_t s0, s1, s2;
register mp_limb_t t0, t1;
- register mp_limb_t c1, c2;
for (i = 0; i < A->r; i++)
{
@@ -48,9 +48,7 @@ _nmod_mat_mul_3(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B)
for (k = 0; k < A->c; k++)
{
umul_ppmm(t1, t0, A->rows[i][k], B->rows[k][j]);
- add_ssaaaa(c1, s0, (mp_limb_t) 0, s0, (mp_limb_t) 0, t0);
- add_ssaaaa(c2, s1, (mp_limb_t) 0, s1, (mp_limb_t) 0, t1);
- add_ssaaaa(s2, s1, s2, s1, c2, c1);
+ add_sssaaaaaa(s2, s1, s0, s2, s1, s0, 0, t1, t0);
}
NMOD_RED(s2, s2, C->mod);
@@ -67,7 +65,6 @@ _nmod_mat_mul_transpose_3(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B)
register mp_limb_t s0, s1, s2;
register mp_limb_t t0, t1;
- register mp_limb_t c1, c2;
for (i = 0; i < A->r; i++)
{
@@ -78,9 +75,7 @@ _nmod_mat_mul_transpose_3(nmod_mat_t C, const nmod_mat_t A, const nmod_mat_t B)
for (k = 0; k < A->c; k++)
{
umul_ppmm(t1, t0, A->rows[i][k], B->rows[j][k]);
- add_ssaaaa(c1, s0, (mp_limb_t) 0, s0, (mp_limb_t) 0, t0);
- add_ssaaaa(c2, s1, (mp_limb_t) 0, s1, (mp_limb_t) 0, t1);
- add_ssaaaa(s2, s1, s2, s1, c2, c1);
+ add_sssaaaaaa(s2, s1, s0, s2, s1, s0, 0, t1, t0);
}
NMOD_RED(s2, s2, C->mod);
View
9 nmod_mat/solve_lu_precomp.c
@@ -53,7 +53,6 @@ _nmod_mat_solve_lu_precomp(mp_limb_t * b, mp_limb_t ** const LU, long n,
int size;
long i, j;
register mp_limb_t s0, s1, s2;
- register mp_limb_t c1, c2;
register mp_limb_t t0, t1;
size = bound_size(n, mod.n - 1);
@@ -83,9 +82,7 @@ _nmod_mat_solve_lu_precomp(mp_limb_t * b, mp_limb_t ** const LU, long n,
for (j = 0; j < i; j++)
{
umul_ppmm(t1, t0, LU[i][j], b[j]);
- add_ssaaaa(c1, s0, (mp_limb_t) 0, s0, (mp_limb_t) 0, t0);
- add_ssaaaa(c2, s1, (mp_limb_t) 0, s1, (mp_limb_t) 0, t1);
- add_ssaaaa(s2, s1, s2, s1, c2, c1);
+ add_sssaaaaaa(s2, s1, s0, s2, s1, s0, 0, t1, t0);
}
NMOD_RED(s2, s2, mod);
NMOD_RED3(s0, s2, s1, s0, mod);
@@ -120,9 +117,7 @@ _nmod_mat_solve_lu_precomp(mp_limb_t * b, mp_limb_t ** const LU, long n,
for (j = i + 1; j < n; j++)
{
umul_ppmm(t1, t0, LU[i][j], b[j]);
- add_ssaaaa(c1, s0, (mp_limb_t) 0, s0, (mp_limb_t) 0, t0);
- add_ssaaaa(c2, s1, (mp_limb_t) 0, s1, (mp_limb_t) 0, t1);
- add_ssaaaa(s2, s1, s2, s1, c2, c1);
+ add_sssaaaaaa(s2, s1, s0, s2, s1, s0, 0, t1, t0);
}
NMOD_RED(s2, s2, mod);
NMOD_RED3(s0, s2, s1, s0, mod);
View
74 test/t-add_sssaaaaaa.c
@@ -0,0 +1,74 @@
+/*=============================================================================
+
+ This file is part of FLINT.
+
+ FLINT is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ FLINT is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with FLINT; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+=============================================================================*/
+/******************************************************************************
+
+ Copyright (C) 2009 William Hart
+ Copyright (C) 2011 Fredrik Johansson
+
+******************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpir.h>
+#include "flint.h"
+#include "ulong_extras.h"
+
+int main(void)
+{
+ int i, j, result;
+ flint_rand_t state;
+ flint_randinit(state);
+
+ printf("add_sssaaaaaa....");
+ fflush(stdout);
+
+ for (i = 0; i < 1000000; i++)
+ {
+ mp_limb_t s[3], t[3], a[3], b[3];
+
+ for (j = 0; j < 3; j++)
+ {
+ s[j] = n_randtest(state);
+ t[j] = n_randtest(state);
+ a[j] = n_randtest(state);
+ b[j] = n_randtest(state);
+ }
+
+ add_sssaaaaaa(s[2], s[1], s[0], a[2], a[1], a[0], b[2], b[1], b[0]);
+
+ mpn_add_n(t, a, b, 3);
+
+ result = ((s[2] == t[2]) && (s[1] == t[1]) && (s[0] == t[0]));
+ if (!result)
+ {
+ printf("FAIL:\n");
+ printf("a[2] = %lu, a[1] = %lu, a[0] = %lu\n", a[2], a[1], a[0]);
+ printf("b[2] = %lu, b[1] = %lu, b[0] = %lu\n", b[2], b[1], b[0]);
+ printf("s[2] = %lu, s[1] = %lu, s[0] = %lu\n", s[2], s[1], s[0]);
+ printf("t[2] = %lu, t[1] = %lu, t[0] = %lu\n", t[2], t[1], t[0]);
+ abort();
+ }
+ }
+
+ flint_randclear(state);
+
+ printf("PASS\n");
+ return 0;
+}

0 comments on commit 9824bad

Please sign in to comment.
Something went wrong with that request. Please try again.