Skip to content

Commit

Permalink
Merge branch 'master' into armv7-vfpv4-gemm
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui committed Feb 2, 2024
2 parents 7701506 + 5b536af commit 69cbe65
Show file tree
Hide file tree
Showing 3 changed files with 267 additions and 192 deletions.
81 changes: 78 additions & 3 deletions src/cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@
#include <sys/system_properties.h> // __system_property_get()
#include <dlfcn.h>
#endif
#include <ctype.h>
#include <stdint.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>
#endif
Expand All @@ -69,6 +72,7 @@
#include <mach/thread_act.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#include <unistd.h>
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#define __IOS__ 1
Expand Down Expand Up @@ -181,7 +185,69 @@ static int g_cpu_is_arm_a53_a55;
#endif // __aarch64__
#endif // defined __ANDROID__ || defined __linux__

static bool g_is_being_debugged = false;
static bool is_being_debugged()
{
#if defined _WIN32
return IsDebuggerPresent();
#elif defined __ANDROID__ || defined __linux__
// https://stackoverflow.com/questions/3596781/how-to-detect-if-the-current-process-is-being-run-by-gdb
int status_fd = open("/proc/self/status", O_RDONLY);
if (status_fd == -1)
return false;

char buf[4096];
ssize_t num_read = read(status_fd, buf, sizeof(buf) - 1);
close(status_fd);

if (num_read <= 0)
return false;

buf[num_read] = '\0';
const char tracerPidString[] = "TracerPid:";
const char* tracer_pid_ptr = strstr(buf, tracerPidString);
if (!tracer_pid_ptr)
return false;

for (const char* ch = tracer_pid_ptr + sizeof(tracerPidString) - 1; ch <= buf + num_read; ++ch)
{
if (isspace(*ch))
continue;

return isdigit(*ch) != 0 && *ch != '0';
}

return false;
#elif defined __APPLE__
// https://stackoverflow.com/questions/2200277/detecting-debugger-on-mac-os-x
struct kinfo_proc info;
info.kp_proc.p_flag = 0;

int mib[4];
mib[0] = CTL_KERN;
mib[1] = KERN_PROC;
mib[2] = KERN_PROC_PID;
mib[3] = getpid();

size_t size = sizeof(info);
sysctl(mib, sizeof(mib) / sizeof(*mib), &info, &size, NULL, 0);

return ((info.kp_proc.p_flag & P_TRACED) != 0);
#else
// unknown platform :(
fprintf(stderr, "unknown platform!\n");
return false;
#endif
}

#if defined _WIN32
#if WINAPI_FAMILY == WINAPI_FAMILY_APP
static int detectisa(const void* /*some_inst*/)
{
// uwp does not support seh :(
return 0;
}
#else // WINAPI_FAMILY == WINAPI_FAMILY_APP
static int g_sigill_caught = 0;
static jmp_buf g_jmpbuf;

Expand All @@ -198,6 +264,9 @@ static LONG CALLBACK catch_sigill(struct _EXCEPTION_POINTERS* ExceptionInfo)

static int detectisa(const void* some_inst)
{
if (g_is_being_debugged)
return 0;

g_sigill_caught = 0;

PVOID eh = AddVectoredExceptionHandler(1, catch_sigill);
Expand All @@ -211,6 +280,7 @@ static int detectisa(const void* some_inst)

return g_sigill_caught ? 0 : 1;
}
#endif // WINAPI_FAMILY == WINAPI_FAMILY_APP

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#ifdef _MSC_VER
Expand Down Expand Up @@ -244,6 +314,9 @@ static void catch_sigill(int /*signo*/, siginfo_t* /*si*/, void* /*data*/)

static int detectisa(void (*some_inst)())
{
if (g_is_being_debugged)
return 0;

g_sigill_caught = 0;

struct sigaction sa;
Expand Down Expand Up @@ -1962,11 +2035,13 @@ static void initialize_global_cpu_info()
g_powersave = 0;
initialize_cpu_thread_affinity_mask(g_cpu_affinity_mask_all, g_cpu_affinity_mask_little, g_cpu_affinity_mask_big);

g_is_being_debugged = is_being_debugged();

#if defined _WIN32
#if __aarch64__
g_cpu_support_arm_cpuid = detectisa(some_cpuid);
g_cpu_support_arm_asimdhp = detectisa(some_asimdhp);
g_cpu_support_arm_asimddp = detectisa(some_asimddp);
g_cpu_support_arm_asimdhp = detectisa(some_asimdhp) || IsProcessorFeaturePresent(43); // dp implies hp
g_cpu_support_arm_asimddp = detectisa(some_asimddp) || IsProcessorFeaturePresent(43); // 43 is PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
g_cpu_support_arm_asimdfhm = detectisa(some_asimdfhm);
g_cpu_support_arm_bf16 = detectisa(some_bf16);
g_cpu_support_arm_i8mm = detectisa(some_i8mm);
Expand All @@ -1977,7 +2052,7 @@ static void initialize_global_cpu_info()
g_cpu_support_arm_svef32mm = detectisa(some_svef32mm);
#elif __arm__
g_cpu_support_arm_edsp = detectisa(some_edsp);
g_cpu_support_arm_neon = detectisa(some_neon);
g_cpu_support_arm_neon = 1; // all modern windows arm devices have neon
g_cpu_support_arm_vfpv4 = detectisa(some_vfpv4);
#endif // __aarch64__ || __arm__
#elif defined __ANDROID__ || defined __linux__
Expand Down
60 changes: 30 additions & 30 deletions src/layer/x86/convolution_3x3_winograd_int8.h
Original file line number Diff line number Diff line change
Expand Up @@ -3544,10 +3544,10 @@ static inline void conv3x3s1_winograd23_transform_input_tile_int8(const Mat& bot
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
_r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, sizeof(signed char))));
if (tj * 2 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), sizeof(signed char))));
if (tj * 2 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), sizeof(signed char))));
if (tj * 2 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), sizeof(signed char))));
_r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, 1)));
if (tj * 2 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), 1)));
if (tj * 2 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), 1)));
if (tj * 2 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), 1)));
}
}

Expand Down Expand Up @@ -3653,28 +3653,28 @@ static inline void conv3x3s1_winograd23_transform_input_tile_int8(const Mat& bot
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
_r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char))));
if (tj * 2 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char))));
if (tj * 2 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char))));
if (tj * 2 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char))));
_r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1)));
if (tj * 2 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1)));
if (tj * 2 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1)));
if (tj * 2 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1)));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
__m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char)), _sindex88);
__m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1), _sindex88);
_r0 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1)));
if (tj * 2 + 1 < w)
{
__m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char)), _sindex88);
__m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1), _sindex88);
_r1 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1)));
}
if (tj * 2 + 2 < w)
{
__m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char)), _sindex88);
__m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1), _sindex88);
_r2 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1)));
}
if (tj * 2 + 3 < w)
{
__m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char)), _sindex88);
__m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1), _sindex88);
_r3 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val3_32, 0), _mm256_extracti128_si256(_val3_32, 1)));
}
#endif // __AVX512F__
Expand Down Expand Up @@ -4768,12 +4768,12 @@ static inline void conv3x3s1_winograd43_transform_input_tile_int8(const Mat& bot
{
__m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
_vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N));
_r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, sizeof(signed char))));
if (tj * 4 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), sizeof(signed char))));
if (tj * 4 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), sizeof(signed char))));
if (tj * 4 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), sizeof(signed char))));
if (tj * 4 + 4 < w) _r4 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 4), sizeof(signed char))));
if (tj * 4 + 5 < w) _r5 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 5), sizeof(signed char))));
_r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, 1)));
if (tj * 4 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), 1)));
if (tj * 4 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), 1)));
if (tj * 4 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), 1)));
if (tj * 4 + 4 < w) _r4 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 4), 1)));
if (tj * 4 + 5 < w) _r5 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 5), 1)));
}
}

Expand Down Expand Up @@ -4919,40 +4919,40 @@ static inline void conv3x3s1_winograd43_transform_input_tile_int8(const Mat& bot
__m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
_vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N));
#if __AVX512F__
_r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char))));
if (tj * 4 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char))));
if (tj * 4 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char))));
if (tj * 4 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char))));
if (tj * 4 + 4 < w) _r4 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, sizeof(signed char))));
if (tj * 4 + 5 < w) _r5 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, sizeof(signed char))));
_r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1)));
if (tj * 4 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1)));
if (tj * 4 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1)));
if (tj * 4 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1)));
if (tj * 4 + 4 < w) _r4 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, 1)));
if (tj * 4 + 5 < w) _r5 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, 1)));
#else
__m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
__m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1);
__m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char)), _sindex88);
__m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, 1), _sindex88);
_r0 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1)));
if (tj * 4 + 1 < w)
{
__m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char)), _sindex88);
__m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, 1), _sindex88);
_r1 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1)));
}
if (tj * 4 + 2 < w)
{
__m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char)), _sindex88);
__m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, 1), _sindex88);
_r2 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1)));
}
if (tj * 4 + 3 < w)
{
__m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char)), _sindex88);
__m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, 1), _sindex88);
_r3 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val3_32, 0), _mm256_extracti128_si256(_val3_32, 1)));
}
if (tj * 4 + 4 < w)
{
__m256i _val4_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, sizeof(signed char)), _sindex88);
__m256i _val4_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, 1), _sindex88);
_r4 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val4_32, 0), _mm256_extracti128_si256(_val4_32, 1)));
}
if (tj * 4 + 5 < w)
{
__m256i _val5_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, sizeof(signed char)), _sindex88);
__m256i _val5_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, 1), _sindex88);
_r5 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val5_32, 0), _mm256_extracti128_si256(_val5_32, 1)));
}
#endif // __AVX512F__
Expand Down

0 comments on commit 69cbe65

Please sign in to comment.