<a href="https://colab.research.google.com/github/Rinoahu/test_copilot/blob/main/test_omp_simd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash
# Install the OpenMP development package. -y keeps apt non-interactive (the
# notebook cell has no terminal to answer prompts), and >> appends so that
# log.txt keeps the output of every step instead of only the last one.
apt-get update > log.txt 2>&1 && apt-get upgrade -y >> log.txt 2>&1 && apt-get install -y libomp-dev >> log.txt 2>&1

In [None]:
%%file test.c
#include <omp.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <immintrin.h>

/*
 * Propagates positive values forward through A with a stride of 32:
 * for every i >= 32, A[i] is replaced by the (already updated) A[i-32]
 * when that value is positive, and left unchanged otherwise.
 *
 * A: array of at least N bytes, updated in place.
 * N: number of elements; nothing is modified when N <= 32.
 * Returns 1 unconditionally.
 */
int data_dep(int8_t *A, int N){
  enum { OFF = 32 };
  /* Iteration i reads what iteration i-OFF wrote, so no more than OFF
     consecutive iterations may run concurrently; safelen states that limit
     (it must be a literal, keep it equal to OFF). */
  #pragma omp simd simdlen(8) safelen(32)
  for(int i = OFF; i < N; i++)
  {
      A[i] = (A[i-OFF] > 0) ? A[i-OFF] : A[i];
  }
  return 1;
}

/*
 * Scans the first `size` bytes of `array` and reports whether any of them
 * equals 1 (note: despite the function name, the value tested is 1).
 * The flag is printed before and after the scan.
 *
 * Returns 1 if a matching byte exists, 0 otherwise.
 */
int is_any_zero(int8_t *array, int size) {
  int found = 0;
  printf("before result: %d\n", found);

  /* Bitwise-OR reduction: every lane contributes its own 0/1 comparison. */
  #pragma omp simd reduction(|:found)
  for (int k = 0; k < size; ++k) {
    found |= (array[k] == 1);
  }

  printf("after result: %d\n", found);
  return found;
}

/*
 * Scalar reference version with early exit: returns 1 as soon as a byte
 * equal to 1 is seen among the first `size` bytes of `array`, otherwise 0.
 */
int is_any_zero_0(int8_t *array, int size) {
    int pos = 0;
    while (pos < size) {
        if (array[pos] == 1) {
            return 1;
        }
        ++pos;
    }
    return 0;
}


//#pragma omp declare simd inbranch
/*
 * Reports whether any of the first N bytes of A equals -1.
 *
 * The array is scanned in blocks of 32 bytes; each block is reduced with an
 * OpenMP SIMD bitwise-OR reduction and the scan stops after the first block
 * that contains a match. The flag is printed before the scan, and the flag
 * plus the start index of the last block examined are printed afterwards.
 *
 * Returns 1 if a byte equal to -1 exists, 0 otherwise.
 */
int is_any_zero_1(int8_t *A, int N) {
  const int off = 32;
  int i, result = 0;
  printf("before result: %d\n", result);

  for (i = 0; i < N; i += off) {
    /* Clamp the last block so that bytes at or beyond N are never read. */
    const int end = (N - i < off) ? N : i + off;

    #pragma omp simd simdlen(32) reduction(|:result)
    for (int j = i; j < end; j++) {
      result |= (A[j] == -1);
    }

    if (result != 0) {
      break;
    }
  }

  printf("after result: %d %d\n", result, i);

  return result;
}

/*
 * Reports whether any of the first `size` bytes of `array` equals 1
 * (the value tested is 1, not 0, despite the function name).
 * The whole range is always scanned; there is no early exit.
 * The flag is printed before the scan; the flag and the final loop index
 * are printed afterwards.
 *
 * Returns 1 if a matching byte exists, 0 otherwise.
 */
int is_any_zero_2(int8_t *array, int size) {
  int i, result = 0;
  printf("before result: %d\n", result);

  /* The loop body is only the reduction update; a write-only helper flag
     that used to be set here had no effect and has been removed. */
  #pragma omp simd reduction(|:result)
  for (i = 0; i < size; i++) {
    result |= (array[i] == 1);
  }

  printf("after result: %d %d\n", result, i);

  return result;
}

//#pragma omp declare simd inbranch
/*
 * Reports whether any of the first N bytes of A equals -1, scanning in
 * blocks of 64 bytes and stopping after the first block with a match.
 *
 * A: must be 32-byte aligned, because the SIMD loop declares aligned(A:32).
 * N: number of bytes to examine.
 *
 * The flag is printed before the scan; the flag and the start index of the
 * last block examined are printed afterwards.
 * Returns 1 if a byte equal to -1 exists, 0 otherwise.
 */
int is_any_zero_3(int8_t *A, int N) {
  const int off = 64;
  int i, result = 0;
  printf("before result: %d\n", result);

  for (i = 0; i < N; i += off) {
    /* Clamp the last block so that bytes at or beyond N are never read. */
    const int end = (N - i < off) ? N : i + off;

    #pragma omp simd simdlen(32), reduction(|:result), aligned(A:32), safelen(64)
    for (int j = i; j < end; j++) {
      result |= (A[j] == -1);
    }

    if (result != 0) {
      break;
    }
  }

  printf("after result: %d %d\n", result, i);

  return result;
}

/*
 * SSE2 version: reports whether any of the first N bytes of A equals -1.
 *
 * Whole 64-byte chunks are checked with four aligned 16-byte loads each;
 * the scan stops after the first chunk that contains a match. Any bytes
 * left over when N is not a multiple of 64 are checked one by one, so
 * nothing at or beyond index N is read.
 *
 * A: must be 16-byte aligned (required by _mm_load_si128).
 * N: number of bytes to examine.
 *
 * The flag is printed before the scan; the flag and the index at which the
 * scan stopped are printed afterwards.
 * Returns 0 when no byte equals -1 and a non-zero value otherwise (the
 * compare bit mask for a vector hit, 1 for a hit in the scalar tail).
 */
int is_any_zero_4(int8_t *A, int N) {
  enum { CHUNK = 64, LANES = 16 };
  int i, result = 0;
  const __m128i flag = _mm_set1_epi8(-1);
  printf("before result: %d\n", result);

  for (i = 0; N - i >= CHUNK; i += CHUNK) {
    __m128i vresult = _mm_setzero_si128();

    for (int j = i; j < i + CHUNK; j += LANES) {
      const __m128i va = _mm_load_si128((const __m128i *)(A + j));
      vresult = _mm_or_si128(vresult, _mm_cmpeq_epi8(va, flag));
    }

    /* One mask bit per byte lane; non-zero means some lane matched. */
    result |= _mm_movemask_epi8(vresult);

    if (result != 0) {
      break;
    }
  }

  if (result == 0) {
    /* Scalar tail for the final N % 64 bytes. */
    for (; i < N; i++) {
      if (A[i] == -1) {
        result = 1;
        break;
      }
    }
  }

  printf("after result: %d %d\n", result, i);

  return result;
}

/*
 * Reports whether any of the first N bytes of A equals -1, scanning in
 * blocks of 32 bytes. Each block is reduced into its own partial flag with
 * an OpenMP SIMD bitwise-OR reduction; the scan stops after the first block
 * with a match.
 *
 * A: must be 32-byte aligned, because the SIMD loop declares aligned(A:32)
 *    (this matches the aligned_alloc(32, ...) buffer used by main).
 * N: number of bytes to examine.
 *
 * The flag is printed before the scan; the flag and the start index of the
 * last block examined are printed afterwards.
 * Returns 1 if a byte equal to -1 exists, 0 otherwise.
 */
int is_any_zero_5(int8_t *A, int N) {
  const int off = 32;
  int i, result = 0;
  printf("before result: %d\n", result);

  for (i = 0; i < N; i += off) {
    /* Clamp the last block so that bytes at or beyond N are never read. */
    const int end = (N - i < off) ? N : i + off;
    int partial_result = 0;

    #pragma omp simd reduction(|:partial_result) aligned(A:32)
    for (int j = i; j < end; j++) {
      partial_result |= (A[j] == -1);
    }

    result |= partial_result;
    if (result != 0) {
      break;
    }
  }

  printf("after result: %d %d\n", result, i);
  return result;
}

//int is_any_zero_6(int8_t *A, int N) {
/*
 * SSE2 version with a single flat loop: reports whether any of the first N
 * bytes of A equals -1, checking 16 bytes per aligned load and stopping at
 * the first vector that contains a match. Bytes left over when N is not a
 * multiple of 16 are checked one by one, so nothing at or beyond index N is
 * read.
 *
 * A: must be 16-byte aligned (required by _mm_load_si128).
 * N: number of bytes to examine.
 *
 * The flag is printed before the scan; the flag and the index at which the
 * scan stopped are printed afterwards.
 * Returns 0 when no byte equals -1 and a non-zero value otherwise (the
 * compare bit mask for a vector hit, 1 for a hit in the scalar tail).
 */
int test_6(int8_t *A, int N) {
  enum { LANES = 16 };
  int i, result = 0;
  const __m128i flag = _mm_set1_epi8(-1);
  printf("before result: %d\n", result);

  for (i = 0; N - i >= LANES; i += LANES) {
    const __m128i va = _mm_load_si128((const __m128i *)(A + i));
    /* One mask bit per byte lane; non-zero means some lane matched. */
    result = _mm_movemask_epi8(_mm_cmpeq_epi8(va, flag));
    if (result != 0) {
      break;
    }
  }

  if (result == 0) {
    /* Scalar tail for the final N % 16 bytes. */
    for (; i < N; i++) {
      if (A[i] == -1) {
        result = 1;
        break;
      }
    }
  }

  printf("after result: %d %d\n", result, i);

  return result;
}

int test_7(int8_t *A, int N) {
    int i, j, flag = 0;
    __m256i vecA, cmpr, minus_one = _mm256_set1_epi8(-1);

    for (i = 0; i < N; i += 32) {
        vecA = _mm256_loadu_si256((__m256i *)(A + i));
        cmpr = _mm256_cmpeq_epi8(vecA, minus_one);
        flag = _mm256_movemask_epi8(cmpr);
        if (flag != 0) {
            break;
        }
        j+=1;
    }

    printf("after result: %d %d %d\n", flag, i, j);
    return flag;
}


int simd_set_examples(){
    // Declare an array of 32 integers
    int arr[32];
    // Create a vector with all elements set to 42
    __m128i vec = _mm_set1_epi32(42);
    // Store the vector into the array using a loop
    for (int i = 4; i < 32; i += 4) {
        // Cast the array pointer to __m128i*
        //_mm_store_si128((__m128i*)&arr[i], _mm_set1_epi32(42));
        _mm_store_si128((__m128i*)&arr[i], _mm_setr_epi32(1,2,3,4));
    }

    int array[64] = { /* some values */ };
    int indices[4] = {1, 34, 45, 55}; // the positions to fetch
    __m256i vindex = _mm256_loadu_si256((__m256i*)indices); // load index vector
    __m256i vresult = _mm256_i32gather_epi32(array, vindex, 4); // gather values with scale factor of 4


    //use intrinsic avx2 functions to set the values of index 1, 4, 8, 19 in an int array to 3,2,3,4 in C code
    __m256i avx_arr = _mm256_loadu_si256((__m256i*)arr);

    __m256i indices = _mm256_set_epi32(19, 8, 4, 1, 0, 0, 0, 0);
    __m256i values = _mm256_set_epi32(4, 3, 2, 1, 0, 0, 0, 0);

    avx_arr = _mm256_blendv_epi8(avx_arr, values, _mm256_cmpeq_epi32(indices, _mm256_set_epi32(19, 8, 4, 1, -1, -1 ,-1 ,-1)));

    //use intrinsic avx2 functions to set the values of index 1, 19, 4, 7 in an int array to 1,2,3,4 in C code
    __m256i indices = _mm256_set_epi32(7, 4, 19, 1, 0, 0, 0, 0);
    __m256i values = _mm256_set_epi32(4, 3, 2, 1, 0, 0, 0, 0);
    _mm256_i32scatter_epi32(arr, indices, values, sizeof(int));

    return 0;
}

/* Prints one benchmark line: function id, the raw clock() readings, the
   elapsed CPU time in seconds and the array size. */
static void print_timing(int id, clock_t start, clock_t end, int n)
{
  printf("fuc %d %ld %ld The simd took %f seconds to execute. %d\n",
         id, (long)start, (long)end, (double)(end - start) / CLOCKS_PER_SEC, n);
}

/*
 * Benchmark driver: fills a large byte array with values in 0..14 and times
 * each scanning variant on it with clock(). None of the generated values is
 * -1, so every variant searching for -1 has to scan the whole array.
 *
 * Returns 0 on success, 1 if the array cannot be allocated.
 */
int main() {

  int N = 1280000000; // size of the array (a multiple of 64, the largest chunk any variant uses)
  int8_t *A = aligned_alloc(32, (size_t)N * sizeof *A); // the array to check
  if (A == NULL) {
    fprintf(stderr, "aligned_alloc of %d bytes failed\n", N);
    return 1;
  }

  clock_t s0, e0;

  // initialize the array with some values; rand() keeps hidden state, so
  // this loop is intentionally left scalar
  for (int i = 0; i < N; i++) {
    A[i] = rand() % 0xf;
  }

  printf("%ld %d\n", (long)CLOCKS_PER_SEC, N);

  s0 = clock();
  is_any_zero_1(A, N);
  e0 = clock();
  print_timing(1, s0, e0, N);

  s0 = clock();
  is_any_zero_3(A, N);
  e0 = clock();
  print_timing(3, s0, e0, N);

  s0 = clock();
  is_any_zero_4(A, N);
  e0 = clock();
  print_timing(4, s0, e0, N);

  s0 = clock();
  is_any_zero_5(A, N);
  e0 = clock();
  print_timing(5, s0, e0, N);

  s0 = clock();
  test_6(A, N);
  e0 = clock();
  print_timing(6, s0, e0, N);

  s0 = clock();
  test_7(A, N);
  e0 = clock();
  print_timing(7, s0, e0, N);

  free(A);
  return 0;
}

Writing test.c


In [None]:
%%bash
# Build test.c with gcc using AVX2 and OpenMP (-mavx2 -fopenmp) at -O2, dump
# the binary's sections and disassembly to test.log, then run the benchmark.
# The commented-out lines are alternative builds (no OpenMP, or clang).
#apt update && apt upgrade && apt install libomp-dev

#gcc -mavx2 -O2 -o test test.c && objdump -s -d ./test > test.log && ./test
gcc -mavx2 -fopenmp -O2 -o test test.c && objdump -s -d ./test > test.log && ./test
#clang -mavx2 -fopenmp -O2 -o test test.c && objdump -s -d ./test > test.log && ./test
#clang -mavx2 -O2 -o test test.c && objdump -s -d ./test > test.log && ./test

In [None]:
%%file test.c
#include <omp.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <immintrin.h>

/*
 * Scalar baseline: reports whether any of the first `size` bytes of `array`
 * equals 0. The whole range is always scanned (there is no early exit), and
 * the final loop index is printed afterwards.
 *
 * Returns 1 if a zero byte exists, 0 otherwise.
 */
int is_any_zero_0(int8_t *array, int size) {
    int i, flag = 0;
    for (i = 0; i < size; i++) {
        /* Accumulate with OR: a plain assignment would only remember the
           last element. */
        flag |= (array[i] == 0);
    }
    printf("i is %d\n", i);
    return flag;
}

/*
 * SSE2 version: reports whether any of the first `size` bytes of `array`
 * equals 0, checking 16 bytes per unaligned load and returning as soon as a
 * match is found. Bytes left over when `size` is not a multiple of 16 are
 * checked one by one, so nothing at or beyond index `size` is read.
 *
 * When no zero byte is found, the final index is printed before returning.
 * Returns 1 if a zero byte exists, 0 otherwise.
 */
int is_any_zero_1(int8_t *array, int size) {
    enum { LANES = 16 };
    /* Compare against 0, the same value the scalar variants test for. */
    const __m128i zero = _mm_setzero_si128();
    int i;
    for (i = 0; size - i >= LANES; i += LANES) {
        const __m128i data = _mm_loadu_si128((const __m128i *)(array + i));
        const __m128i cmp = _mm_cmpeq_epi8(data, zero);
        /* One mask bit per byte lane; non-zero means some lane matched. */
        if (_mm_movemask_epi8(cmp) != 0) {
            return 1;
        }
    }
    /* Scalar tail for the final size % 16 bytes. */
    for (; i < size; i++) {
        if (array[i] == 0) {
            return 1;
        }
    }
    printf("i is %d\n", i);
    return 0;
}
int is_any_zero_2(int8_t *array, int size) {
    __m256i zero = _mm256_set1_epi8(-1);
    int i;
    for (i = 0; i < size; i += 32) {
        __m256i data = _mm256_loadu_si256((__m256i *)(array + i));
        __m256i cmp = _mm256_cmpeq_epi8(data, zero);
        int mask = _mm256_movemask_epi8(cmp);
        if (mask != 0) {
            return 1;
        }
    }
    printf("i is %d\n", i);
    return 0;
}

/*
 * Reports whether any of the first `size` bytes of `array` equals 0.
 * The whole range is always scanned (no early exit) and the final loop
 * index is printed afterwards. The update is written as an OR accumulation,
 * which is the form the (currently disabled) OpenMP reduction below expects.
 *
 * Returns 1 if a zero byte exists, 0 otherwise.
 */
int is_any_zero_3(int8_t *array, int size) {
    int i, result = 0;
    //#pragma omp simd reduction(|:result)
    for (i=0; i<size; i++) {
        /* Accumulate with OR: a plain assignment would only remember the
           last element. */
        result |= (array[i] == 0);
    }
    printf("i is %d\n", i);
    return result;
}

/* Prints one benchmark line: function id, the raw clock() readings, the
   elapsed CPU time in seconds and the array size. */
static void print_timing(int id, clock_t start, clock_t end, int n)
{
  printf("fuc %d %ld %ld The simd took %f seconds to execute. %d\n",
         id, (long)start, (long)end, (double)(end - start) / CLOCKS_PER_SEC, n);
}

/*
 * Benchmark driver: fills a large byte array with values in 1..15 (so it
 * contains no zero byte and every variant scans the whole array) and times
 * each scanning variant on it with clock().
 *
 * Returns 0 on success, 1 if the array cannot be allocated.
 */
int main() {

  int N = 1280000000; // size of the array
  int8_t *A = aligned_alloc(32, (size_t)N * sizeof *A); // the array to check
  if (A == NULL) {
    fprintf(stderr, "aligned_alloc of %d bytes failed\n", N);
    return 1;
  }

  clock_t s0, e0;

  // initialize the array with some values; rand() keeps hidden state, so
  // this loop is intentionally left scalar
  for (int i = 0; i < N; i++) {
    A[i] = 1 + rand() % 0xf;
  }

  printf("%ld %d\n", (long)CLOCKS_PER_SEC, N);

  s0 = clock();
  is_any_zero_0(A, N);
  e0 = clock();
  print_timing(0, s0, e0, N);

  s0 = clock();
  is_any_zero_1(A, N);
  e0 = clock();
  print_timing(1, s0, e0, N);

  s0 = clock();
  is_any_zero_2(A, N);
  e0 = clock();
  print_timing(2, s0, e0, N);

  s0 = clock();
  is_any_zero_3(A, N);
  e0 = clock();
  print_timing(3, s0, e0, N);

  free(A);
  return 0;
}

Overwriting test.c


In [None]:
%%bash
# Build test.c with gcc using AVX2 at -O2 (no -fopenmp on this command line),
# dump the binary's sections and disassembly to test.log, then run it.
# The commented-out clang line is the OpenMP-enabled alternative.
#apt update && apt upgrade && apt install libomp-dev

gcc  -mavx2 -O2 -o test test.c && objdump -s -d ./test > test.log && ./test
#clang -mavx2 -fopenmp -O2 -o test test.c && objdump -s -d ./test > test.log && ./test

In [None]:
%%bash
# Rebuild the same test.c with clang at -O1 (AVX2 enabled) and run it,
# for comparison with the gcc -O2 build.
clang -mavx2 -O1 -o test test.c && ./test

In [None]:
%%file test.c
#include <omp.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <immintrin.h> // for AVX2 intrinsics

/*
// Assume the input vector is aligned on a 32-byte boundary
int find_index(__m256i input, int64_t element) {
  // Create a vector that contains the target element in each slot
  __m256i target = _mm256_set1_epi64x(element);

  // Compare the target vector with the input vector and get a mask
  __m256i mask = _mm256_cmpeq_epi64(target, input);

  // Count the number of 1s in the mask and get the index
  int index = _mm_popcnt_u32(_mm256_movemask_pd(_mm256_castsi256_pd(mask)));

  // Return -1 if no match is found, or the index otherwise
  return index == 0 ? -1 : index - 1;
}

int find_index0(__m256i input, int64_t element) {
    int* base_addr = input;
    __m256i indices = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    // Define a scale factor
    int scale = 4;

    // Gather 4 integers from memory using the base address, indices, and scale
    __m256i result = _mm256_i32gather_epi32(base_addr, indices, scale);

    // Store the result in the output array
    _mm256_store_si256((__m256i*)output, result);
}
*/


/*
 * In-place forward pass with stride bw: for every i >= bw,
 *     A[i] = max(A[i], A[i-bw] - 1)
 * where A[i-bw] is the value already updated by this pass.
 *
 * A:  array of at least N bytes, updated in place.
 * N:  number of elements.
 * bw: distance between dependent elements; nothing is modified when
 *     bw <= 0 or N <= bw.
 *
 * Prints one diagnostic line and returns 0.
 */
int fuc_0(int8_t *A, int N, int bw)
{
    if (bw > 0) {
        /* Element i depends on element i-bw, so the array is processed in
           blocks of bw elements: inside one block every read targets the
           previous block, which makes the inner loop free of loop-carried
           dependences and therefore valid for `omp simd` whatever vector
           length the compiler picks. */
        for (int base = bw; base < N; ) {
            const int len = (N - base < bw) ? N - base : bw;

            #pragma omp simd
            for (int i = base; i < base + len; i++)
            {
                const int decayed = A[i-bw] - 1;
                A[i] = (int8_t)(A[i] > decayed ? A[i] : decayed);
            }

            base += len; /* never exceeds N, so the index cannot overflow */
        }
    }

    printf("1 < 2: %d\n", 1>2?1:2);
    return 0;
}

/*
 * Benchmark driver: fills a large byte array with values in 1..15 and times
 * six consecutive runs of fuc_0 with a stride of 16 using clock().
 *
 * Returns 0 on success, 1 if the array cannot be allocated.
 */
int main() {

  int N = 1280000000; // size of the array
  //int N = 128; // size of the array
  int8_t *A = aligned_alloc(32, (size_t)N * sizeof *A); // the array to process
  if (A == NULL) {
    fprintf(stderr, "aligned_alloc of %d bytes failed\n", N);
    return 1;
  }

  clock_t s0, e0;

  // initialize the array with some values; rand() keeps hidden state, so
  // this loop is intentionally left scalar
  for (int i = 0; i < N; i++) {
    A[i] = 1 + rand() % 0xf;
  }

  printf("%ld %d\n", (long)CLOCKS_PER_SEC, N);

  s0 = clock(); // get the start time
  for(int i = 0; i < 6; i++)
  {
    fuc_0(A, N, 16);
  }
  e0 = clock(); // get the end time
  printf("fuc 0 %ld %ld The simd took %f seconds to execute. %d\n",
         (long)s0, (long)e0, (double) (e0 - s0) / CLOCKS_PER_SEC, N);

  free(A);
  return 0;
}

Overwriting test.c


In [None]:
%%bash
# Build test.c with gcc using OpenMP at -O2 (no -mavx2 on this command line),
# dump the binary's sections and disassembly to test.log, then run it.
# The commented-out lines are alternative compiler/flag combinations.
#clang -mavx2 -O3 -o test test.c && objdump -s -d ./test > test.log && ./test
#gcc -O3 -o test test.c && objdump -s -d ./test > test.log && ./test
gcc -fopenmp -O2 -o test test.c && objdump -s -d ./test > test.log && ./test

#clang -O2 -o test test.c && objdump -s -d ./test > test.log && ./test
#clang -fopenmp -O2 -o test test.c && objdump -s -d ./test > test.log && ./test

In [47]:
%%file test.c
#include <omp.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <immintrin.h> // for AVX2 intrinsics

// test the avx2 intrinsic function for 256-bit gather operation
/*
 * Compares an AVX2 256-bit gather against a plain scalar loop: both fetch
 * the same eight elements of `values` N times, and each variant is timed
 * with clock(). The gathered index/value pairs are printed at the end.
 *
 * Returns 0.
 */
int main() {
    int indices[8] = {0, 1, 999999, 1123, 112234, 5, 6, 27};
    /* static keeps this 4 MB table off the stack; elements without an
       explicit initializer (everything past index 32) are zero. */
    static int values[1000000] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32};
    int output[8];
    int N = 100000000;
    clock_t s0, e0;

    /* NOTE(review): the inputs never change between iterations, so an
       optimizing compiler may hoist or drop repeated work in both timed
       loops; confirm against the disassembly before trusting the numbers. */
    s0 = clock(); // get the start time
    for(int i = 0; i < N; i++)
    {
        __m256i index_vec = _mm256_loadu_si256((__m256i*)indices); // Load the indices into a 256-bit vector
        __m256i result_vec = _mm256_i32gather_epi32(values, index_vec, 4); // Gather the values using the indices (scale 4 = sizeof(int))
        _mm256_storeu_si256((__m256i*)output, result_vec); // Store the result in the output array
    }
    e0 = clock(); // get the end time
    printf("fuc 0 %ld %ld The simd took %f seconds to execute. %d\n",
           (long)s0, (long)e0, (double) (e0 - s0) / CLOCKS_PER_SEC, N);


    s0 = clock(); // get the start time
    for(int i = 0; i < N; i++)
    {
        for(int j=0; j<8; j++)
        {
            output[j] = values[indices[j]];
        }
    }
    e0 = clock(); // get the end time
    printf("fuc 1 %ld %ld The simd took %f seconds to execute. %d\n",
           (long)s0, (long)e0, (double) (e0 - s0) / CLOCKS_PER_SEC, N);

    // Output should be: [0, 1, 0, 0, 0, 5, 6, 27]
    // (indices 999999, 1123 and 112234 refer to zero-initialized elements)
    for (int i = 0; i < 8; i++) {
        printf("idx:%d val:%d\n", indices[i], output[i]);
    }
    printf("\n");

    return 0;
}


Overwriting test.c


In [None]:
%%bash
# Build the gather benchmark with gcc at -O2 with AVX2 enabled, dump the
# binary's sections and disassembly to test.log, then run it.
gcc -O2 -mavx2 -o test test.c && objdump -s -d ./test > test.log && ./test