Skip to content

Commit

Permalink
wip
Browse files: browse the repository at this point in the history
  • Loading branch information
nihui committed Apr 28, 2024
1 parent cacf762 commit 74d4f73
Showing 1 changed file with 12 additions and 12 deletions.
24 changes: 12 additions & 12 deletions src/layer/arm/gru_int8.h
Original file line number Diff line number Diff line change
Expand Up @@ -542,8 +542,8 @@ static void gru_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_de
for (; i + 7 < size; i += 8)
{
int32x2_t _xi01 = vreinterpret_s32_s8(vld1_s8(x + i));
- int8x16_t _xi0 = vreinterpretq_s8_s16(vdupq_lane_s32(_xi01, 0));
- int8x16_t _xi1 = vreinterpretq_s8_s16(vdupq_lane_s32(_xi01, 1));
+ int8x16_t _xi0 = vreinterpretq_s8_s32(vdupq_lane_s32(_xi01, 0));
+ int8x16_t _xi1 = vreinterpretq_s8_s32(vdupq_lane_s32(_xi01, 1));
int8x16_t _w0 = vld1q_s8(kptr);
int8x16_t _w1 = vld1q_s8(kptr + 16);
int8x16_t _w2 = vld1q_s8(kptr + 32);
Expand All @@ -561,7 +561,7 @@ static void gru_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_de
for (; i + 3 < size; i += 4)
{
#if __ARM_FEATURE_DOTPROD
- int8x16_t _xi = vreinterpretq_s8_s16(vdupq_lane_s32(vreinterpret_s32_s8(vld1_s8(x + i)), 0));
+ int8x16_t _xi = vreinterpretq_s8_s32(vdupq_lane_s32(vreinterpret_s32_s8(vld1_s8(x + i)), 0));
int8x16_t _w0 = vld1q_s8(kptr);
int8x16_t _w1 = vld1q_s8(kptr + 16);
_gru_Rx0 = vdotq_s32(_gru_Rx0, _w0, _xi);
Expand Down Expand Up @@ -618,8 +618,8 @@ static void gru_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_de
for (; i + 7 < num_output; i += 8)
{
int32x2_t _h_cont01 = vreinterpret_s32_s8(vld1_s8(hs + i));
- int8x16_t _h_cont0 = vreinterpretq_s8_s16(vdupq_lane_s32(_h_cont01, 0));
- int8x16_t _h_cont1 = vreinterpretq_s8_s16(vdupq_lane_s32(_h_cont01, 1));
+ int8x16_t _h_cont0 = vreinterpretq_s8_s32(vdupq_lane_s32(_h_cont01, 0));
+ int8x16_t _h_cont1 = vreinterpretq_s8_s32(vdupq_lane_s32(_h_cont01, 1));
int8x16_t _w0 = vld1q_s8(kptr);
int8x16_t _w1 = vld1q_s8(kptr + 16);
int8x16_t _w2 = vld1q_s8(kptr + 32);
Expand All @@ -637,7 +637,7 @@ static void gru_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_de
for (; i + 3 < num_output; i += 4)
{
#if __ARM_FEATURE_DOTPROD
- int8x16_t _h_cont = vreinterpretq_s8_s16(vdupq_lane_s32(vreinterpret_s32_s8(vld1_s8(hs + i)), 0));
+ int8x16_t _h_cont = vreinterpretq_s8_s32(vdupq_lane_s32(vreinterpret_s32_s8(vld1_s8(hs + i)), 0));
int8x16_t _w0 = vld1q_s8(kptr);
int8x16_t _w1 = vld1q_s8(kptr + 16);
_gru_Rh0 = vdotq_s32(_gru_Rh0, _w0, _h_cont);
Expand Down Expand Up @@ -717,8 +717,8 @@ static void gru_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_de
for (; i + 7 < num_output; i += 8)
{
int32x2_t _h_cont01 = vreinterpret_s32_s8(vld1_s8(hs + i));
- int8x16_t _h_cont0 = vreinterpretq_s8_s16(vdupq_lane_s32(_h_cont01, 0));
- int8x16_t _h_cont1 = vreinterpretq_s8_s16(vdupq_lane_s32(_h_cont01, 1));
+ int8x16_t _h_cont0 = vreinterpretq_s8_s32(vdupq_lane_s32(_h_cont01, 0));
+ int8x16_t _h_cont1 = vreinterpretq_s8_s32(vdupq_lane_s32(_h_cont01, 1));
int8x16_t _w0 = vld1q_s8(kptr);
int8x16_t _w1 = vld1q_s8(kptr + 16);
_gru_Nh0 = vdotq_s32(_gru_Nh0, _w0, _h_cont0);
Expand All @@ -731,7 +731,7 @@ static void gru_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_de
for (; i + 3 < num_output; i += 4)
{
#if __ARM_FEATURE_DOTPROD
- int8x16_t _h_cont = vreinterpretq_s8_s16(vdupq_lane_s32(vreinterpret_s32_s8(vld1_s8(hs + i)), 0));
+ int8x16_t _h_cont = vreinterpretq_s8_s32(vdupq_lane_s32(vreinterpret_s32_s8(vld1_s8(hs + i)), 0));
int8x16_t _w = vld1q_s8(kptr);
_gru_Nh0 = vdotq_s32(_gru_Nh0, _w, _h_cont);
#else
Expand Down Expand Up @@ -775,8 +775,8 @@ static void gru_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_de
for (; i + 7 < size; i += 8)
{
int32x2_t _xi01 = vreinterpret_s32_s8(vld1_s8(x + i));
- int8x16_t _xi0 = vreinterpretq_s8_s16(vdupq_lane_s32(_xi01, 0));
- int8x16_t _xi1 = vreinterpretq_s8_s16(vdupq_lane_s32(_xi01, 1));
+ int8x16_t _xi0 = vreinterpretq_s8_s32(vdupq_lane_s32(_xi01, 0));
+ int8x16_t _xi1 = vreinterpretq_s8_s32(vdupq_lane_s32(_xi01, 1));
int8x16_t _w0 = vld1q_s8(kptr);
int8x16_t _w1 = vld1q_s8(kptr + 16);
_gru_Nx0 = vdotq_s32(_gru_Nx0, _w0, _xi0);
Expand All @@ -789,7 +789,7 @@ static void gru_int8(const Mat& bottom_blob_int8, const Mat& bottom_blob_int8_de
for (; i + 3 < size; i += 4)
{
#if __ARM_FEATURE_DOTPROD
- int8x16_t _xi = vreinterpretq_s8_s16(vdupq_lane_s32(vreinterpret_s32_s8(vld1_s8(x + i)), 0));
+ int8x16_t _xi = vreinterpretq_s8_s32(vdupq_lane_s32(vreinterpret_s32_s8(vld1_s8(x + i)), 0));
int8x16_t _w = vld1q_s8(kptr);
_gru_Nx0 = vdotq_s32(_gru_Nx0, _w, _xi);
#else
Expand Down

0 comments on commit 74d4f73

Please sign in to comment.