/
global.h
616 lines (569 loc) · 16.8 KB
/
global.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
/* This file is part of the Vc library. {{{
Copyright © 2009-2015 Matthias Kretz <kretz@kde.org>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the names of contributing organizations nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}}}*/
#ifndef VC_GLOBAL_H_
#define VC_GLOBAL_H_
#include <cstdint>
#ifdef DOXYGEN
// Doxygen-only section: every macro below is defined solely so that its documentation
// is picked up, and is removed again right away with #undef so that none of them stays
// defined once this section has been processed.
/**
 * \name Compiler Identification Macros
 * \ingroup Utilities
 */
//@{
/**
 * \ingroup Utilities
 * This macro is defined to a number identifying the ICC version if the current
 * translation unit is compiled with the Intel compiler.
 *
 * For any other compiler this macro is not defined.
 */
#define Vc_ICC __INTEL_COMPILER_BUILD_DATE
#undef Vc_ICC
/**
 * \ingroup Utilities
 * This macro is defined to a number identifying the Clang version if the current
 * translation unit is compiled with the Clang compiler.
 *
 * For any other compiler this macro is not defined.
 */
#define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#undef Vc_CLANG
/**
 * \ingroup Utilities
 * This macro is defined to a number identifying the Apple Clang version if the current
 * translation unit is compiled with the Apple Clang compiler.
 *
 * For any other compiler this macro is not defined.
 */
#define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#undef Vc_APPLECLANG
/**
 * \ingroup Utilities
 * This macro is defined to a number identifying the GCC version if the current
 * translation unit is compiled with the GCC compiler.
 *
 * For any other compiler this macro is not defined.
 */
#define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
#undef Vc_GCC
/**
 * \ingroup Utilities
 * This macro is defined to a number identifying the Microsoft Visual C++ version if
 * the current translation unit is compiled with the Visual C++ (MSVC) compiler.
 *
 * For any other compiler this macro is not defined.
 */
#define Vc_MSVC _MSC_FULL_VER
#undef Vc_MSVC
//@}
#else // DOXYGEN
// Compiler identification.
// Exactly one of Vc_ICC, Vc_OPEN64, Vc_APPLECLANG, Vc_CLANG, Vc_GCC, Vc_MSVC or
// Vc_UNSUPPORTED_COMPILER is defined by this chain. The first matching test wins, so
// e.g. a compiler that defines both __clang__ and __GNUC__ is identified as Clang.
#if defined(__INTEL_COMPILER)
#  define Vc_ICC __INTEL_COMPILER_BUILD_DATE
#elif defined(__OPENCC__)
#  define Vc_OPEN64 1
#elif defined(__clang__)
#  if defined(__APPLE__) && __clang_major__ >= 6
     // Fragile check that is expected to break eventually.
#    define Vc_APPLECLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#  else
#    define Vc_CLANG (__clang_major__ * 0x10000 + __clang_minor__ * 0x100 + __clang_patchlevel__)
#  endif
#elif defined(__GNUC__)
#  define Vc_GCC (__GNUC__ * 0x10000 + __GNUC_MINOR__ * 0x100 + __GNUC_PATCHLEVEL__)
#elif defined(_MSC_VER)
#  define Vc_MSVC _MSC_FULL_VER
#else
#  define Vc_UNSUPPORTED_COMPILER 1
#endif
// Language level checks.
// C++11 is mandatory; the __cplusplus test is waived for MSVC with _MSC_VER >= 1900.
#if __cplusplus < 201103 && (!defined Vc_MSVC || _MSC_VER < 1900)
#  error "Vc requires support for C++11."
#endif
// Vc_CXX14 is set from C++14 onwards.
#if __cplusplus >= 201402L
#  define Vc_CXX14 1
#endif
// Vc_CXX17 is set for any __cplusplus value above 201700 (which implies Vc_CXX14 as well).
#if __cplusplus > 201700L
#  define Vc_CXX17 1
#endif
// GNU-style inline assembly is used whenever __GNUC__ is defined, unless the user opts out
// by defining Vc_NO_INLINE_ASM.
#if !defined(Vc_NO_INLINE_ASM) && defined(__GNUC__)
#  define Vc_GNU_ASM 1
#endif

// max_align_t availability: GCC gets Vc_HAVE_MAX_ALIGN_T; every compiler other than
// GCC, Clang and ICC gets Vc_HAVE_STD_MAX_ALIGN_T.
#if defined(Vc_GCC)
#  define Vc_HAVE_MAX_ALIGN_T 1
#elif !(defined(Vc_CLANG) || defined(Vc_ICC))
   // Clang/ICC don't provide max_align_t at all
#  define Vc_HAVE_STD_MAX_ALIGN_T 1
#endif

// GCC, Clang and Apple Clang use the compiler's builtin vector types.
#if defined(Vc_APPLECLANG) || defined(Vc_CLANG) || defined(Vc_GCC)
#  define Vc_USE_BUILTIN_VECTOR_TYPES 1
#endif

// Calling-convention keywords: only MSVC gets real keywords, everywhere else the macros
// expand to nothing.
#ifndef Vc_MSVC
#  define Vc_CDECL
#  define Vc_VDECL
#else
#  define Vc_CDECL __cdecl
#  define Vc_VDECL __vectorcall
#endif
/* Temporarily define the following names as unique integers, because integers are the only
* thing the preprocessor can compare. This makes it possible to pass e.g. -DVc_IMPL=SSE3: the
* preprocessor then considers Vc_IMPL and SSE3 to be equal. These names are very generic, so
* it is important to undefine them again later on!
*/
// Implementation IDs: all of their set bits lie inside IMPL_MASK.
#define Scalar 0x00100000
#define SSE 0x00200000
#define SSE2 0x00300000
#define SSE3 0x00400000
#define SSSE3 0x00500000
#define SSE4_1 0x00600000
#define SSE4_2 0x00700000
#define AVX 0x00800000
#define AVX2 0x00900000
#define MIC 0x00A00000
// Extension flags: one distinct bit each, all inside EXT_MASK, so they can be tested
// individually with `Vc_IMPL & <flag>`.
#define XOP 0x00000001
#define FMA4 0x00000002
#define F16C 0x00000004
#define POPCNT 0x00000008
#define SSE4a 0x00000010
#define FMA 0x00000020
#define BMI2 0x00000040
// Masks separating the implementation ID bits from the extension flag bits.
#define IMPL_MASK 0xFFF00000
#define EXT_MASK 0x000FFFFF
// MSVC: synthesize the __SSE__ / __SSE2__ macros that the selection logic below tests,
// based on _M_IX86_FP (32-bit targets) or _M_AMD64. Already defined macros are left alone.
#ifdef Vc_MSVC
#  if defined(_M_IX86_FP)
#    if _M_IX86_FP >= 1 && !defined(__SSE__)
#      define __SSE__ 1
#    endif
#    if _M_IX86_FP >= 2 && !defined(__SSE2__)
#      define __SSE2__ 1
#    endif
#  elif defined(_M_AMD64)
     // If the target is x86_64 then SSE2 is guaranteed
#    if !defined(__SSE__)
#      define __SSE__ 1
#    endif
#    if !defined(__SSE2__)
#      define __SSE2__ 1
#    endif
#  endif
#endif

// ICC: define __POPCNT__ when it is missing but SSE4.2 or SSE4a is enabled.
#if defined(Vc_ICC) && !defined(__POPCNT__) && (defined(__SSE4_2__) || defined(__SSE4A__))
#  define __POPCNT__ 1
#endif
// Reject the old spelling of the selection macro (upper-case 'C').
#ifdef VC_IMPL
#error "You are using the old VC_IMPL macro. Since Vc 1.0 all Vc macros start with Vc_, i.e. a lower-case 'c'"
#endif
// Implementation selection. The result is a set of Vc_IMPL_* macros, each defined to 1.
//  - Vc_IMPL not defined: everything is derived from the compiler's predefined macros.
//  - Vc_IMPL defined:     its value is decoded with the temporary ID/flag macros and
//                         IMPL_MASK; Vc_IMPL itself is undefined afterwards.
#ifndef Vc_IMPL
// ---- automatic selection ----
// MIC, AVX2 and AVX are tested first, in that order; AVX2 also enables Vc_IMPL_AVX.
# if defined(__MIC__)
# define Vc_IMPL_MIC 1
# elif defined(__AVX2__)
# define Vc_IMPL_AVX2 1
# define Vc_IMPL_AVX 1
# elif defined(__AVX__)
# define Vc_IMPL_AVX 1
# else
// Neither MIC nor AVX: enable every SSE level the compiler reports. Vc_IMPL_SSE records
// that at least one level was found (repeating the identical #define is harmless).
# if defined(__SSE4_2__)
# define Vc_IMPL_SSE 1
# define Vc_IMPL_SSE4_2 1
# endif
# if defined(__SSE4_1__)
# define Vc_IMPL_SSE 1
# define Vc_IMPL_SSE4_1 1
# endif
# if defined(__SSE3__)
# define Vc_IMPL_SSE 1
# define Vc_IMPL_SSE3 1
# endif
# if defined(__SSSE3__)
# define Vc_IMPL_SSE 1
# define Vc_IMPL_SSSE3 1
# endif
# if defined(__SSE2__)
# define Vc_IMPL_SSE 1
# define Vc_IMPL_SSE2 1
# endif
// No SSE level at all means the Scalar implementation is used.
# if defined(Vc_IMPL_SSE)
// nothing
# else
# define Vc_IMPL_Scalar 1
# endif
# endif
// Extension instruction sets are only picked up for non-Scalar implementations.
# if !defined(Vc_IMPL_Scalar)
# ifdef __FMA4__
# define Vc_IMPL_FMA4 1
# endif
# ifdef __XOP__
# define Vc_IMPL_XOP 1
# endif
# ifdef __F16C__
# define Vc_IMPL_F16C 1
# endif
# ifdef __POPCNT__
# define Vc_IMPL_POPCNT 1
# endif
# ifdef __SSE4A__
# define Vc_IMPL_SSE4a 1
# endif
# ifdef __FMA__
# define Vc_IMPL_FMA 1
# endif
# ifdef __BMI2__
# define Vc_IMPL_BMI2 1
# endif
# endif
#else // Vc_IMPL
// ---- explicit selection ----
// The bits of Vc_IMPL under IMPL_MASK name the requested implementation.
# if (Vc_IMPL & IMPL_MASK) == MIC // MIC supersedes everything else
# define Vc_IMPL_MIC 1
# ifdef __POPCNT__
# define Vc_IMPL_POPCNT 1
# endif
# elif (Vc_IMPL & IMPL_MASK) == AVX2 // AVX2 supersedes SSE
# define Vc_IMPL_AVX2 1
# define Vc_IMPL_AVX 1
# elif (Vc_IMPL & IMPL_MASK) == AVX // AVX supersedes SSE
# define Vc_IMPL_AVX 1
# elif (Vc_IMPL & IMPL_MASK) == Scalar
# define Vc_IMPL_Scalar 1
// Requesting a specific SSE level enables that level and every lower one.
# elif (Vc_IMPL & IMPL_MASK) == SSE4_2
# define Vc_IMPL_SSE4_2 1
# define Vc_IMPL_SSE4_1 1
# define Vc_IMPL_SSSE3 1
# define Vc_IMPL_SSE3 1
# define Vc_IMPL_SSE2 1
# define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE4_1
# define Vc_IMPL_SSE4_1 1
# define Vc_IMPL_SSSE3 1
# define Vc_IMPL_SSE3 1
# define Vc_IMPL_SSE2 1
# define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSSE3
# define Vc_IMPL_SSSE3 1
# define Vc_IMPL_SSE3 1
# define Vc_IMPL_SSE2 1
# define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE3
# define Vc_IMPL_SSE3 1
# define Vc_IMPL_SSE2 1
# define Vc_IMPL_SSE 1
# elif (Vc_IMPL & IMPL_MASK) == SSE2
# define Vc_IMPL_SSE2 1
# define Vc_IMPL_SSE 1
// Plain "SSE": enable Vc_IMPL_SSE and take the individual levels from the compiler's
// predefined macros.
# elif (Vc_IMPL & IMPL_MASK) == SSE
# define Vc_IMPL_SSE 1
# if defined(__SSE4_2__)
# define Vc_IMPL_SSE4_2 1
# endif
# if defined(__SSE4_1__)
# define Vc_IMPL_SSE4_1 1
# endif
# if defined(__SSE3__)
# define Vc_IMPL_SSE3 1
# endif
# if defined(__SSSE3__)
# define Vc_IMPL_SSSE3 1
# endif
# if defined(__SSE2__)
# define Vc_IMPL_SSE2 1
# endif
# elif (Vc_IMPL & IMPL_MASK) == 0 && (Vc_IMPL & SSE4a)
// this is for backward compatibility only where SSE4a was included in the main
// line of available SIMD instruction sets
# define Vc_IMPL_SSE3 1
# define Vc_IMPL_SSE2 1
# define Vc_IMPL_SSE 1
# endif
// Extension flags: each flag bit set in Vc_IMPL enables the matching Vc_IMPL_* macro.
# if (Vc_IMPL & XOP)
# define Vc_IMPL_XOP 1
# endif
# if (Vc_IMPL & FMA4)
# define Vc_IMPL_FMA4 1
# endif
# if (Vc_IMPL & F16C)
# define Vc_IMPL_F16C 1
# endif
// POPCNT is additionally enabled when the compiler reports __POPCNT__, unless the Scalar
// implementation was requested.
# if (!defined(Vc_IMPL_Scalar) && defined(__POPCNT__)) || (Vc_IMPL & POPCNT)
# define Vc_IMPL_POPCNT 1
# endif
# if (Vc_IMPL & SSE4a)
# define Vc_IMPL_SSE4a 1
# endif
# if (Vc_IMPL & FMA)
# define Vc_IMPL_FMA 1
# endif
# if (Vc_IMPL & BMI2)
# define Vc_IMPL_BMI2 1
# endif
// Vc_IMPL has been decoded into Vc_IMPL_* macros; remove it.
# undef Vc_IMPL
#endif // Vc_IMPL
// With AVX enabled in the compiler, SIMD instructions are emitted with VEX coding.
#if defined(__AVX__)
#  define Vc_USE_VEX_CODING 1
#endif

// An AVX implementation implies that all SSE intrinsics are available, too.
#if defined(Vc_IMPL_AVX)
#  define Vc_IMPL_SSE 1
#  define Vc_IMPL_SSE2 1
#  define Vc_IMPL_SSE3 1
#  define Vc_IMPL_SSSE3 1
#  define Vc_IMPL_SSE4_1 1
#  define Vc_IMPL_SSE4_2 1
#endif

// clang 3.6.x workaround: drop AVX/AVX2 again. The SSE macros defined just above stay in
// place, which is what makes the fallback to SSE4 work, so this must come after them.
#if defined(Vc_IMPL_AVX) && defined(Vc_CLANG) && Vc_CLANG >= 0x30600 && Vc_CLANG < 0x30700
#  warning "clang 3.6.x miscompiles AVX code, frequently losing 50% of the data. Vc will fall back to SSE4 instead."
#  undef Vc_IMPL_AVX
#  undef Vc_IMPL_AVX2
#endif
// Sanity checks on the outcome of the implementation selection. The two conditions
// cannot both hold: the first requires Vc_IMPL_SSE to be absent, the second requires it.
#if !(defined(Vc_IMPL_Scalar) || defined(Vc_IMPL_SSE) || defined(Vc_IMPL_AVX) || defined(Vc_IMPL_MIC))
#  error "No suitable Vc implementation was selected! Probably Vc_IMPL was set to an invalid value."
#endif
#if defined(Vc_IMPL_SSE) && !defined(Vc_IMPL_SSE2)
#  error "SSE requested but no SSE2 support. Vc needs at least SSE2!"
#endif
// Remove the temporary helper names again; they were only needed to decode Vc_IMPL and
// are far too generic to stay defined.
// Implementation IDs:
#undef Scalar
#undef SSE
#undef SSE2
#undef SSE3
#undef SSSE3
#undef SSE4_1
#undef SSE4_2
#undef AVX
#undef AVX2
#undef MIC
// Extension flags:
#undef XOP
#undef FMA4
#undef F16C
#undef POPCNT
#undef SSE4a
#undef FMA
#undef BMI2
// Masks:
#undef IMPL_MASK
#undef EXT_MASK
#ifdef Vc_IMPL_MIC
#define Vc_DEFAULT_IMPL_MIC
#elif defined Vc_IMPL_AVX2
#define Vc_DEFAULT_IMPL_AVX2
#elif defined Vc_IMPL_AVX
#define Vc_DEFAULT_IMPL_AVX
#elif defined Vc_IMPL_SSE
#define Vc_DEFAULT_IMPL_SSE
#elif defined Vc_IMPL_Scalar
#define Vc_DEFAULT_IMPL_Scalar
#else
#error "Preprocessor logic broken. Please report a bug."
#endif
// All Vc declarations live in the versioned namespace Vc_1.
#define Vc_VERSIONED_NAMESPACE Vc_1
// Declare the namespace (empty for now) so that the alias below can refer to it.
namespace Vc_VERSIONED_NAMESPACE {}
// `Vc` is an alias for the versioned namespace.
namespace Vc = Vc_VERSIONED_NAMESPACE;
#endif // DOXYGEN
namespace Vc_VERSIONED_NAMESPACE
{
// Sized integer aliases spelled in terms of the built-in types.
// NOTE(review): this assumes 8-bit char, 16-bit short, 32-bit int and 64-bit long long;
// nothing here checks those widths.
using int8_t = signed char;
using uint8_t = unsigned char;
using int16_t = signed short;
using uint16_t = unsigned short;
using int32_t = signed int;
using uint32_t = unsigned int;
using int64_t = signed long long;
using uint64_t = unsigned long long;
/**
 * \ingroup Utilities
 *
 * Selects the alignment and padding policy that Vc::malloc applies to an allocation.
 */
enum MallocAlignment {
    /**
     * Alignment on the vector size boundary (e.g. 16 Bytes on SSE platforms), padded such
     * that vector access is possible up to the end. The allocated memory therefore
     * contains a multiple of VectorAlignment bytes.
     */
    AlignOnVector = 0,
    /**
     * Alignment on the cache line size boundary (e.g. 64 Bytes on x86), padded such that
     * full cache line access is possible up to the end. The allocated memory therefore
     * contains a multiple of 64 bytes.
     */
    AlignOnCacheline = 1,
    /**
     * Alignment on the page size boundary (e.g. 4096 Bytes on x86), padded such that full
     * page access is possible up to the end. The allocated memory therefore contains a
     * multiple of 4096 bytes.
     */
    AlignOnPage = 2
};
/**
 * \ingroup Utilities
 *
 * Identifies one SIMD instruction set.
 *
 * The currently active implementation is available via \ref CurrentImplementation.
 * The enumerators are numbered consecutively, so they can be compared for ordering;
 * ImplementationMask selects the bits that hold such a value.
 *
 * \see ExtraInstructions
 */
enum Implementation : std::uint_least32_t { // TODO: make enum class
    ScalarImpl = 0,             ///< uses only fundamental types
    SSE2Impl = 1,               ///< x86 SSE + SSE2
    SSE3Impl = 2,               ///< x86 SSE + SSE2 + SSE3
    SSSE3Impl = 3,              ///< x86 SSE + SSE2 + SSE3 + SSSE3
    SSE41Impl = 4,              ///< x86 SSE + SSE2 + SSE3 + SSSE3 + SSE4.1
    SSE42Impl = 5,              ///< x86 SSE + SSE2 + SSE3 + SSSE3 + SSE4.1 + SSE4.2
    AVXImpl = 6,                ///< x86 AVX
    AVX2Impl = 7,               ///< x86 AVX + AVX2
    MICImpl = 8,                ///< Intel Xeon Phi
    ImplementationMask = 0xfff
};
/**
 * \ingroup Utilities
 *
 * A linear list of instruction sets cannot describe everything a CPU may offer. On x86
 * each entry of the following sequence includes all of its predecessors:
 * SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2
 *
 * Instructions that are not implied by a position in that sequence are covered by the
 * flags of this enum. Every flag occupies one bit; ExtraInstructionsMask selects the bits
 * used for such flags.
 */
enum ExtraInstructions : std::uint_least32_t { // TODO: make enum class
    Float16cInstructions = 1u << 12,  //!< Support for float16 conversions in hardware
    Fma4Instructions = 1u << 13,      //!< Support for FMA4 instructions
    XopInstructions = 1u << 14,       //!< Support for XOP instructions
    PopcntInstructions = 1u << 15,    //!< Support for the population count instruction
    Sse4aInstructions = 1u << 16,     //!< Support for SSE4a instructions
    FmaInstructions = 1u << 17,       //!< Support for FMA instructions (3 operand variant)
    VexInstructions = 1u << 18,       //!< Support for ternary instruction coding (VEX)
    Bmi2Instructions = 1u << 19,      //!< Support for BMI2 instructions
    // PclmulqdqInstructions,
    // AesInstructions,
    // RdrandInstructions
    ExtraInstructionsMask = 0xfffff000u
};
/**
 * \ingroup Utilities
 * Represents, as a type, the specific implementation %Vc uses in the current translation
 * unit.
 *
 * Most importantly, \ref CurrentImplementation is the instantiation of this class
 * template with the bitmask that identifies the current implementation. The static
 * member functions give access to the contents of that bitmask.
 *
 * \tparam Features Bitmask combining one Vc::Implementation value (the bits selected by
 *                  ImplementationMask) with any number of Vc::ExtraInstructions flags
 *                  (the bits selected by ExtraInstructionsMask).
 */
template <unsigned int Features> struct ImplementationT {
    /// Returns the Vc::Implementation encoded in \p Features.
    static constexpr Implementation current()
    {
        return static_cast<Implementation>(ImplementationMask & Features);
    }

    /// Returns \c true if \p impl equals the current Vc::Implementation.
    static constexpr bool is(Implementation impl) { return current() == impl; }

    /**
     * Returns \c true if the current Vc::Implementation lies in the closed range from
     * \p low to \p high, i.e. it implements at least \p low and at most \p high.
     */
    static constexpr bool is_between(Implementation low, Implementation high)
    {
        return low <= current() && current() <= high;
    }

    /**
     * Returns \c true if the current code would run on a CPU providing
     * \p extraInstructions, i.e. if no extra-instruction flag set in \p Features is
     * missing from \p extraInstructions.
     */
    static constexpr bool runs_on(unsigned int extraInstructions)
    {
        return ((Features & ExtraInstructionsMask) & ~extraInstructions) == 0;
    }
};
/**
* \ingroup Utilities
* Identifies the %Vc implementation used in the current translation unit.
*
* The template argument is assembled by the preprocessor: one Implementation value,
* chosen by the first Vc_IMPL_* macro that is defined, plus one ExtraInstructions flag
* for each enabled extension.
*
* \see ImplementationT
*/
using CurrentImplementation = ImplementationT<
// Base implementation. The macros are tested in exactly this order, so Scalar wins over
// everything else and higher SSE levels win over lower ones.
#ifdef Vc_IMPL_Scalar
ScalarImpl
#elif defined(Vc_IMPL_MIC)
MICImpl
#elif defined(Vc_IMPL_AVX2)
AVX2Impl
#elif defined(Vc_IMPL_AVX)
AVXImpl
#elif defined(Vc_IMPL_SSE4_2)
SSE42Impl
#elif defined(Vc_IMPL_SSE4_1)
SSE41Impl
#elif defined(Vc_IMPL_SSSE3)
SSSE3Impl
#elif defined(Vc_IMPL_SSE3)
SSE3Impl
#elif defined(Vc_IMPL_SSE2)
SSE2Impl
#endif
// Extra instruction flags. Because of the nesting below, XOP is only added when SSE4a is
// enabled, and FMA4 only when both SSE4a and XOP are enabled.
// NOTE(review): Float16cInstructions is never added here, although the selection logic can
// define Vc_IMPL_F16C - confirm whether that omission is intentional.
#ifdef Vc_IMPL_SSE4a
+ Vc::Sse4aInstructions
#ifdef Vc_IMPL_XOP
+ Vc::XopInstructions
#ifdef Vc_IMPL_FMA4
+ Vc::Fma4Instructions
#endif
#endif
#endif
#ifdef Vc_IMPL_POPCNT
+ Vc::PopcntInstructions
#endif
#ifdef Vc_IMPL_FMA
+ Vc::FmaInstructions
#endif
#ifdef Vc_IMPL_BMI2
+ Vc::Bmi2Instructions
#endif
// Set whenever the compiler itself has AVX enabled (Vc_USE_VEX_CODING).
#ifdef Vc_USE_VEX_CODING
+ Vc::VexInstructions
#endif
>;
} // namespace Vc
#include "version.h"
#endif // VC_GLOBAL_H_
// vim: foldmethod=marker