-
Notifications
You must be signed in to change notification settings - Fork 0
/
hybrid.c
134 lines (115 loc) · 4.53 KB
/
hybrid.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include<stdio.h>
#include<stdlib.h>
#include<stdint.h>
typedef uint64_t utype64;
typedef int64_t type64;
typedef __int128 type128;
const utype64 lower58bits = 0x3ffffffffffffff;
const utype64 lower57bits = 0x1ffffffffffffff;
__inline__ utype64 rdtsc() {
uint32_t lo, hi;
__asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"::: "%rax", "%rbx", "%rcx", "%rdx");
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return (utype64)hi << 32 | lo;
}
__inline__ utype64 rdtscp() {
uint32_t lo, hi;
__asm__ __volatile__ ("rdtscp": "=a" (lo), "=d" (hi) :: "%rcx");
__asm__ __volatile__ ("cpuid"::: "%rax", "%rbx", "%rcx", "%rdx");
return (utype64)hi << 32 | lo;
}
/*Residue multiplication modulo 521-bit Mersenne prime
FG = Z (mod pow(2,521) - 1) where F and G comprise of 9-limb
F[i] and G[i] are unsigned 58-bit where i=1,2,3,4,5,6,7
F[0], G[0], and Z[0] are at most unsigned 59-bit
F[8], G[8], and Z[8] are unsigned 57-bit*/
void TMV_product(type64 *F, type64 *G, type64 *Z){
type64 T1[5]={0,0,0,2*F[6],2*F[5]}, T6[5];
type64 T5=2*F[8], c=2*F[7];
T1[0]=G[0]-G[3]; T1[1]=G[1]-G[4]; T1[2]=G[2]-G[5];
type128 X0=(type128)F[3]*T1[0]+(type128)F[2]*T1[1]+(type128)F[1]*T1[2];
type128 X1 = (type128)F[4]*T1[0]+(type128)F[3]*T1[1]+(type128)F[2]*T1[2];
type128 X2 = (type128)F[5]*T1[0]+(type128)F[4]*T1[1]+(type128)F[3]*T1[2];
T1[0]=G[3]-G[6]; T1[1]=G[4]-G[7]; T1[2]=G[5]-G[8];
type128 X6 = (type128)T1[3]*T1[0]+(type128)T1[4]*T1[1]+(type128)(2*F[4])*T1[2];
type128 X7 = (type128)c*T1[0]+(type128)T1[3]*T1[1]+(type128)T1[4]*T1[2];
type128 X8 = (type128)T5*T1[0]+(type128)c*T1[1]+(type128)T1[3]*T1[2];
T1[0]=G[0]-G[6]; T1[1]=G[1]-G[7]; T1[2]=G[2]-G[8];
type128 X3 = (type128)F[0]*T1[0]+(type128)T5*T1[1]+(type128)c*T1[2];
type128 X4 = (type128)F[1]*T1[0]+(type128)F[0]*T1[1]+(type128)T5*T1[2];
type128 X5 = (type128)F[2]*T1[0]+(type128)F[1]*T1[1]+(type128)F[0]*T1[2];
T6[0]=F[4]+F[1]; T6[1]=F[5]+F[2]; T6[2]=F[6]+F[3];
T6[3]=F[7]+F[4]; T6[4]=F[8]+F[5];
T1[0]=T6[0]+c; T1[1]=T6[1]+T5; T1[2]=T6[2]+F[0];
T1[3]=T6[3]+F[1]; T1[4]=T6[4]+F[2];
T6[0]+=T1[0]; T6[1]+=T1[1]; T6[2]+=T1[2];
T6[3]+=T1[3]; T6[4]+=T1[4];
T5=T1[2]+F[6];
type128 C = ((type128)T1[2]*G[2])+((type128)T1[3]*G[1])+((type128)T1[4]*G[0]) - X2 - X5;
c = ((type64)C)&lower57bits;
C = (((type128)T6[0]*G[8])+((type128)T6[1]*G[7])+((type128)T6[2]*G[6]) + X3 + X6) + (C>>57);
Z[0] = ((type64)C)&lower58bits;
C = (((type128)T6[1]*G[8])+((type128)T6[2]*G[7])+((type128)T6[3]*G[6]) + X4 + X7) + (C>>58);
Z[1] = ((type64)C)&lower58bits;
C = (((type128)T6[2]*G[8])+((type128)T6[3]*G[7])+((type128)T6[4]*G[6]) + X5 + X8) + (C>>58);
Z[2] = ((type64)C)&lower58bits;
C = (((type128)T6[3]*G[5])+((type128)T6[4]*G[4])+((type128)T5*G[3]) + X0 - X6) + (C>>58);
Z[3] = ((type64)C)&lower58bits;
C = (((type128)T6[4]*G[5])+((type128)T5*G[4])+((type128)T1[0]*G[3]) + X1 - X7) + (C>>58);
Z[4] = ((type64)C)&lower58bits;
C = (((type128)T5*G[5])+((type128)T1[0]*G[4])+((type128)T1[1]*G[3]) + X2 - X8) + (C>>58);
Z[5] = ((type64)C)&lower58bits;
C = (((type128)T1[0]*G[2])+((type128)T1[1]*G[1])+((type128)T1[2]*G[0]) - X0 - X3) + (C>>58);
Z[6] = ((type64)C)&lower58bits;
C = (((type128)T1[1]*G[2])+((type128)T1[2]*G[1])+((type128)T1[3]*G[0]) - X1 - X4) + (C>>58);
Z[7] = ((type64)C)&lower58bits;
c += ((type64)(C>>58));
Z[8] = c&lower57bits;
Z[0]+= (c>>57);
}
//////////////////////////////////////////////////////////////////////////////////////////////
int main(){
FILE *Fptr, *Gptr, *Zptr;
char file_F[] = "Fdata.txt", file_G[] = "Gdata.txt", file_Z[] = "Zdata.txt";
utype64 start, end, min_ccycle=10000;
type64 F[9], G[9], z[9];
unsigned int i, j, jj, k, values=1000;
if((Fptr=fopen(file_F, "r"))==NULL || (Gptr=fopen(file_G, "r"))==NULL){
printf("Cannot open the file(s) for reading.\n");
return 0;
}
if((Zptr=fopen(file_Z, "w"))==NULL ){
printf("Cannot open the file(s) for writing.\n");
return 0;
}
start=rdtsc();
end=rdtscp();
start=rdtsc();
end=rdtscp();
for(j=0; j<values; j++){
for(i=0; i<9; i++){
fscanf(Fptr,"%lu",&F[i]);
fscanf(Gptr,"%lu",&G[i]);
}
start=rdtsc();
for (jj=0;jj<1000;jj++)
{
TMV_product(F, G, z);
TMV_product(z, G, F);
}
end=rdtscp();
if((end - start)/2000 < min_ccycle)
min_ccycle = (end - start)/2000;
if(j%100 == 0){
printf("So far, the minimum cycle count is :: %lu\n", min_ccycle);
for(i=0; i<9; i++)
fprintf(Zptr, "%lu ", z[i]);
fprintf(Zptr, "\n");
}
}
fclose(Fptr);
fclose(Gptr);
fclose(Zptr);
printf("The minimum number of clock cycles is :: %lu\n", min_ccycle);
return 0;
}