## Reduction

This section covers ways to perform reductions in parallel, task, taskloop, and SIMD regions.

### The `reduction` Clause

The following example demonstrates the `reduction` clause; note that some  reductions can be expressed in the loop in several ways, as shown for the `max`  and `min` reductions below:

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name: reduction.1c
* type: C
* version: omp_3.1
*/
#include <math.h>
/*
 * Reduces x and y inside one parallel loop: a accumulates the sum of x,
 * b the bitwise XOR of y, c the minimum of y, and d the maximum of x.
 * x[0] and y[0] are read unconditionally to seed d and c.
 * The results live in locals and are not returned to the caller.
 */
void reduction1(float *x, int *y, int n)
{
  int i, b, c;
  float a, d;
  a = 0.0;
  b = 0;
  c = y[0];
  d = x[0];
  /* One reduction clause per result, each naming its combining operator. */
  #pragma omp parallel for private(i) shared(x, y, n) \
                          reduction(+:a) reduction(^:b) \
                          reduction(min:c) reduction(max:d)
    for (i=0; i<n; i++) {
      a += x[i];
      b ^= y[i];
      if (c > y[i]) c = y[i];   /* min written as an explicit comparison */
      d = fmaxf(d,x[i]);        /* max written as a library call */
    }
}



In [None]:

! name: reduction.1f
! type: F-free
! Reduces X and Y inside one parallel loop: A receives the sum of X,
! B the bitwise exclusive OR of Y, C the minimum of Y, and D the maximum
! of X. X(1) and Y(1) are read unconditionally to seed D and C.
SUBROUTINE REDUCTION1(A, B, C, D, X, Y, N)
    REAL :: X(*), A, D
    INTEGER :: Y(*), N, B, C
    INTEGER :: I
    A = 0
    B = 0
    C = Y(1)
    D = X(1)
    ! One REDUCTION clause per result, each naming its combining operator.
    !$OMP PARALLEL DO PRIVATE(I) SHARED(X, Y, N) REDUCTION(+:A) &
    !$OMP& REDUCTION(IEOR:B) REDUCTION(MIN:C)  REDUCTION(MAX:D)
      DO I=1,N
        A = A + X(I)
        B = IEOR(B, Y(I))
        C = MIN(C, Y(I))                ! min written with the intrinsic
        IF (D < X(I)) D = X(I)          ! max written as a comparison
      END DO

END SUBROUTINE REDUCTION1



A common implementation of the preceding example is to treat it as if it had been  written as follows:

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name: reduction.2c
* type: C
*/
#include <limits.h>
#include <math.h>
/*
 * Hand-written form of the four reductions of reduction1: every thread
 * accumulates into private partial results (a_p, b_p, c_p, d_p) and then
 * merges them into the shared totals inside a critical section.
 * The totals are locals and are not returned to the caller.
 */
void reduction2(float *x, int *y, int n)
{
  int i, b, b_p, c, c_p;
  float a, a_p, d, d_p;
  a = 0.0f;
  b = 0;
  c = y[0];
  d = x[0];
  #pragma omp parallel shared(a, b, c, d, x, y, n) \
                          private(a_p, b_p, c_p, d_p)
  {
    /* Seed each private partial so that it cannot affect the result:
       0 for sum and XOR, the largest int for min, -HUGE_VALF for max. */
    a_p = 0.0f;
    b_p = 0;
    c_p = INT_MAX;
    d_p = -HUGE_VALF;
    #pragma omp for private(i)
    for (i=0; i<n; i++) {
      a_p += x[i];
      b_p ^= y[i];
      if (c_p > y[i]) c_p = y[i];
      d_p = fmaxf(d_p,x[i]);
    }
    /* Only one thread at a time folds its partials into the totals. */
    #pragma omp critical
    {
      a += a_p;
      b ^= b_p;
      if( c > c_p ) c = c_p;
      d = fmaxf(d,d_p);
    }
  }
}



In [None]:

! name: reduction.2f
! type: F-free
  ! Hand-written form of the four reductions of REDUCTION1: every thread
  ! accumulates into private partial results (A_P, B_P, C_P, D_P) and then
  ! merges them into A, B, C and D inside a critical section.
  SUBROUTINE REDUCTION2(A, B, C, D, X, Y, N)
    REAL :: X(*), A, D
    INTEGER :: Y(*), N, B, C
    REAL :: A_P, D_P
    INTEGER :: I, B_P, C_P
    A = 0
    B = 0
    C = Y(1)
    D = X(1)
    !$OMP PARALLEL SHARED(X, Y, A, B, C, D, N) &
    !$OMP&         PRIVATE(A_P, B_P, C_P, D_P)
      ! Seed each private partial so that it cannot affect the result:
      ! 0 for sum and IEOR, HUGE for min, -HUGE for max.
      A_P = 0.0
      B_P = 0
      C_P = HUGE(C_P)
      D_P = -HUGE(D_P)
      !$OMP DO PRIVATE(I)
      DO I=1,N
        A_P = A_P + X(I)
        B_P = IEOR(B_P, Y(I))
        C_P = MIN(C_P, Y(I))
        IF (D_P < X(I)) D_P = X(I)
      END DO
      ! Only one thread at a time folds its partials into the totals.
      !$OMP CRITICAL
        A = A + A_P
        B = IEOR(B, B_P)
        C = MIN(C, C_P)
        D = MAX(D, D_P)
      !$OMP END CRITICAL
    !$OMP END PARALLEL
  END SUBROUTINE REDUCTION2



The following program is non-conforming because the reduction is on the  **intrinsic procedure name** `MAX` but that name has been redefined to be the variable  named `MAX`.

In [None]:

! name: reduction.3f
! type: F-free
 PROGRAM REDUCTION_WRONG
 MAX = HUGE(0)
 M = 0

 !$OMP PARALLEL DO REDUCTION(MAX: M)
! MAX is no longer the intrinsic so this is non-conforming
 DO I = 1, 100
    CALL SUB(M,I)
 END DO

 END PROGRAM REDUCTION_WRONG

 SUBROUTINE SUB(M,I)
    M = MAX(M,I)
 END SUBROUTINE SUB



The following conforming program performs the reduction using the  **intrinsic procedure name** `MAX` even though the intrinsic `MAX` has been renamed  to `REN`.

In [None]:

! name: reduction.4f
! type: F-free
MODULE M
   INTRINSIC MAX
END MODULE M

PROGRAM REDUCTION3
   USE M, REN => MAX
   N = 0
!$OMP PARALLEL DO REDUCTION(REN: N)     ! still does MAX
   DO I = 1, 100
      N = MAX(N,I)
   END DO
END PROGRAM REDUCTION3



The following conforming program performs the reduction using the  _intrinsic procedure name_  `MAX` even though the intrinsic `MAX` has been renamed  to `MIN`.

In [None]:

! name: reduction.5f
! type: F-free
MODULE MOD
   INTRINSIC MAX, MIN
END MODULE MOD

PROGRAM REDUCTION4
   USE MOD, MIN=>MAX, MAX=>MIN
   REAL :: R
   R = -HUGE(0.0)

!$OMP PARALLEL DO REDUCTION(MIN: R)     ! still does MAX
   DO I = 1, 1000
      R = MIN(R, SIN(REAL(I)))
   END DO
   PRINT *, R
END PROGRAM REDUCTION4



The following example is non-conforming because the initialization (`a =  0`) of the original list item `a` is not synchronized with the update of  `a` as a result of the reduction computation in the `for` loop. Therefore,  the example may print an incorrect value for `a`.

To avoid this problem, the initialization of the original list item `a`  should complete before any update of `a` as a result of the `reduction`  clause. This can be achieved by adding an explicit barrier after the assignment  `a = 0`, or by enclosing the assignment `a = 0` in a `single`  directive (which has an implied barrier), or by initializing `a` before  the start of the `parallel` region.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name: reduction.6c
* type: C
*/
#include <stdio.h>

int main (void)
{
  int a, i;

  #pragma omp parallel shared(a) private(i)
  {
    #pragma omp master
    a = 0;

    // To avoid race conditions, add a barrier here.

    #pragma omp for reduction(+:a)
    for (i = 0; i < 10; i++) {
        a += i;
    }

    #pragma omp single
    printf ("Sum is %d\n", a);
  }
  return 0;
}



In [None]:

! name: reduction.6f
! type: F-fixed
      INTEGER A, I

!$OMP PARALLEL SHARED(A) PRIVATE(I)

!$OMP MASTER
      A = 0
!$OMP END MASTER

      ! To avoid race conditions, add a barrier here.

!$OMP DO REDUCTION(+:A)
      DO I= 0, 9
         A = A + I
      END DO

!$OMP SINGLE
      PRINT *, "Sum is ", A
!$OMP END SINGLE

!$OMP END PARALLEL
      END



The following example demonstrates the reduction of array  _a_ .  In C/C++ this is illustrated by the explicit use of an array section  _a[0:N]_  in the `reduction` clause.  The corresponding Fortran example uses array syntax supported in the base language.  As of the OpenMP 4.5 specification, the explicit use of an array section in the `reduction` clause in Fortran is not permitted.  This oversight has been fixed in the OpenMP 5.0 specification.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name: reduction.7c
* type: C
* version: omp_4.5
*/
#include <stdio.h>

#define N 100
void init(int n, float (*b)[N]);

int main(){

  int i,j;
  float a[N], b[N][N];

  init(N,b);

  for(i=0; i<N; i++) a[i]=0.0e0;

  #pragma omp parallel for reduction(+:a[0:N]) private(j)
  for(i=0; i<N; i++){
    for(j=0; j<N; j++){
       a[j] +=  b[i][j];
    }
  }
  printf(" a[0] a[N-1]: %f %f\n", a[0], a[N-1]);

  return 0;
}



In [None]:

! name: reduction.7f
! type: F-free
program array_red

  integer,parameter :: n=100
  integer           :: j
  real              :: a(n), b(n,n)

  call init(n,b)

  a(:) = 0.0e0

  !$omp parallel do reduction(+:a)
  do j = 1, n
     a(:) = a(:) + b(:,j)
  end do

  print*, " a(1) a(n): ", a(1), a(n)

end program



### Task Reduction

In OpenMP 5.0 the `task_reduction` clause was created for the `taskgroup` construct,  to allow reductions among explicit tasks that have an `in_reduction` clause.

In the  _task_reduction.1_  example below a reduction is performed as the algorithm traverses a linked list. The reduction statement is assigned to be an explicit task using a `task` construct and is specified to be a reduction participant with  the `in_reduction` clause. A `taskgroup` construct encloses the tasks participating in the reduction, and specifies, with the `task_reduction` clause, that the taskgroup has tasks participating in a reduction.  After the `taskgroup` region the original variable will contain  the final value of the reduction.

Note: The  _res_  variable is private in the  _linked_list_sum_  routine and is not required to be shared (as in the case of a `parallel` construct reduction).

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name:       task_reduction.1c
* type:       C
*/

#include<stdlib.h>
#include<stdio.h>
#define N 10

typedef struct node_tag {
    int val;
    struct node_tag *next;
} node_t;

/*
 * Returns the sum of the val fields of the list that starts at p.
 * One explicit task is created per node; the enclosing taskgroup
 * defines the reduction in which those tasks participate.
 */
int linked_list_sum(node_t *p)
{
    int res = 0;

    #pragma omp taskgroup task_reduction(+: res)
    {
        node_t* aux = p;
        while(aux != 0)
        {
            /* The task covers only the accumulation statement below. */
            #pragma omp task in_reduction(+: res)
            res += aux->val;

            aux = aux->next;
        }
    }
    /* After the taskgroup region res holds the final reduction value. */
    return res;
}


int main(int argc, char *argv[]) {
    int i;
//                           Create the root node.
    node_t* root = (node_t*) malloc(sizeof(node_t));
    root->val = 1;

    node_t* aux = root;

//                           Create N-1 more nodes.
    for(i=2;i<=N;++i){
        aux->next = (node_t*) malloc(sizeof(node_t));
        aux = aux->next;
        aux->val = i;
    }

    aux->next = 0;

    #pragma omp parallel
    #pragma omp single
    {
        int result = linked_list_sum(root);
        printf( "Calculated: %d  Analytic:%d\n", result, (N*(N+1)/2) );
    }

    return 0;
}




In [None]:

! name:       task_reduction.1f90
! type:       F-free

module m
    type node_t
        integer :: val
        type(node_t), pointer :: next
    end type
end module m

! Returns the sum of the val components of the list that starts at p.
! One explicit task is created per node; the enclosing taskgroup defines
! the reduction in which those tasks participate.
function linked_list_sum(p) result(res)
    use m
    implicit none
    type(node_t), pointer :: p
    type(node_t), pointer :: aux
    integer :: res

    res = 0

    !$omp taskgroup task_reduction(+: res)
        aux => p
        do while (associated(aux))
            ! The task covers only the accumulation statement.
            !$omp task in_reduction(+: res)
                res = res + aux%val
            !$omp end task
            aux => aux%next
        end do
    !$omp end taskgroup
    ! After the taskgroup region res holds the final reduction value.
end function linked_list_sum


program main
    use m
    implicit none
    type(node_t), pointer :: root, aux
    integer :: res, i
    integer, parameter :: N=10

    interface
        function linked_list_sum(p) result(res)
            use m
            implicit none
            type(node_t), pointer :: p
            integer :: res
        end function
    end interface
!                       Create the root node.
    allocate(root)
    root%val = 1
    aux => root

!                       Create N-1 more nodes.
    do i = 2,N
        allocate(aux%next)
        aux => aux%next
        aux%val = i
    end do

    aux%next => null()

    !$omp parallel
    !$omp single
        res = linked_list_sum(root)
        print *, "Calculated:", res, " Analytic:", (N*(N+1))/2
    !$omp end single
    !$omp end parallel

end program main




In OpenMP 5.0 the `task`  _reduction-modifier_  for the `reduction` clause was introduced to provide a means of performing reductions among implicit and explicit tasks.

The `reduction` clause of a `parallel` or worksharing construct may specify the `task`  _reduction-modifier_  to include explicit task reductions within their region, provided the reduction operators ( _reduction-identifiers_ ) and variables ( _list items_ ) of the participating tasks match those of the implicit tasks.

There are 2 reduction use cases (identified by USE CASE #) in the  _task_reduction.2_  example below.

In USE CASE 1 a `task` modifier in the `reduction` clause  of the `parallel` construct is used to include the reductions of any  participating tasks, those with an `in_reduction` clause and matching   _reduction-identifiers_  (`+`) and list items (`x`).

Note, a `taskgroup` construct (with a `task_reduction` clause) is not necessary to scope the explicit task reduction (as seen in the example above).  Hence, even without the implicit task reduction statement (without the C `x++`   and Fortran `x=x+1` statements), the `task`  _reduction-modifier_   in a `reduction` clause of the `parallel` construct can be used to avoid having to create a `taskgroup` construct  (and its `task_reduction` clause) around the task generating structure.

In USE CASE 2 tasks participating in the reduction are within a worksharing region (a parallel worksharing-loop construct). Here, too, no `taskgroup` is required, and the  _reduction-identifier_  (`+`) and list item (variable `x`) match as required.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name:       task_reduction.2.c
* type:       C
* version: omp_5.0
*/
#include <stdio.h>
int main(void){
   int N=100, M=10;
   int i, x;

// USE CASE 1  explicit-task reduction + parallel reduction clause
   x=0;
   #pragma omp parallel num_threads(M) reduction(task,+:x)
   {

     x++;                // implicit task reduction statement

     #pragma omp single
     for(i=0;i<N;i++)
       #pragma omp task in_reduction(+:x)
       x++;

   }
   printf("x=%d  =M+N\n",x);  // x= 110  =M+N


// USE CASE 2  task reduction +  worksharing reduction clause
   x=0;
   #pragma omp parallel for num_threads(M) reduction(task,+:x)
   for(i=0; i< N; i++){

      x++;

      if( i%2 == 0){
       #pragma omp task in_reduction(+:x)
       x--;
      }
   }
   printf("x=%d  =N-N/2\n",x);  // x= 50  =N-N/2

   return 0;
}



In [None]:

! name:       task_reduction.2.f90
! type:       F-free
! version:    omp_5.0

program task_modifier

   integer :: N=100, M=10
   integer :: i, x

! USE CASE 1  explicit-task reduction + parallel reduction clause
   x=0
   !$omp parallel num_threads(M) reduction(task,+:x)

     x=x+1                   !! implicit task reduction statement

     !$omp single
       do i = 1,N
         !$omp task in_reduction(+:x)
           x=x+1
         !$omp end task
       end do
     !$omp end single

   !$omp end parallel
   write(*,'("x=",I0," =M+N")') x   ! x= 110 =M+N


! USE CASE 2  task reduction +  worksharing reduction clause
   x=0
   !$omp parallel do num_threads(M) reduction(task,+:x)
     do i = 1,N

        x=x+1

        if( mod(i,2) == 0) then
           !$omp task in_reduction(+:x)
             x=x-1
           !$omp end task
        endif

     end do
   write(*,'("x=",I0,"  =N-N/2")') x   ! x= 50 =N-N/2

end program



### Reduction on Combined Target Constructs

When a `reduction` clause appears on a combined construct that combines  a `target` construct with another construct, there is an implicit map  of the list items with a `tofrom` map type for the `target` construct.  Otherwise, the list items (if they are scalar variables) would be  treated as firstprivate by default in the `target` construct, which  is unlikely to provide the intended behavior since the result of the reduction that is in the firstprivate variable would be discarded  at the end of the `target` region.

In the following example, the use of the `reduction` clause on `sum1` or `sum2` should, by default, result in an implicit `tofrom` map for that variable. So long as neither `sum1` nor `sum2` were already present on the device, the mapping behavior ensures the value for `sum1` computed in the first `target` construct is used in the second `target` construct.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name: target_reduction.1.c
* type: C
* version: omp_5.0
*/
#include <stdio.h>
int f(int);
int g(int);
/*
 * Runs two target reductions back to back. The reduction clause on each
 * combined target construct implicitly maps its list item tofrom, so the
 * value of sum1 produced by the first region is available on the host
 * before the second region reads it (provided sum1 was not already
 * present on the device).
 */
int main()
{
   int sum1=0, sum2=0;
   const int n = 100;

   #pragma omp target teams distribute reduction(+:sum1)
   for (int i = 0; i < n; i++) {
      sum1 += f(i);
   }

   /* sum1 is only read here; sum2 is the reduction variable. */
   #pragma omp target teams distribute reduction(+:sum2)
   for (int i = 0; i < n; i++) {
      sum2 += g(i) * sum1;
   }

   printf(  "sum1 = %d, sum2 = %d\n", sum1, sum2);
   //OUTPUT: sum1 = 9900, sum2 = 147015000
   return 0;
}

/* Returns twice the given value. */
int f(int res)
{
   return 2 * res;
}
/* Returns three times the given value. */
int g(int res)
{
   return 3 * res;
}



In [None]:

! name: target_reduction.1.f90
! type: F-free
! version: omp_5.0
program target_reduction_ex1
   interface
      function f(res)
             integer :: f, res
          end function
      function g(res)
             integer :: g, res
          end function
   end interface
   integer :: sum1, sum2, i
   integer, parameter :: n = 100
   sum1 = 0
   sum2 = 0
   !$omp target teams distribute reduction(+:sum1)
       do i=1,n
          sum1 = sum1 + f(i)
       end do
   !$omp target teams distribute reduction(+:sum2)
       do i=1,n
          sum2 = sum2 + g(i)*sum1
       end do
   print *, "sum1 = ", sum1, ", sum2 = ", sum2
   !!OUTPUT: sum1 =     10100 , sum2 = 153015000
end program


integer function f(res)
   integer :: res
   f = res*2
end function
integer function g(res)
   integer :: res
   g = res*3
end function



In the next example, the variables `sum1` and `sum2` remain on the device for the duration of the `target` `data` region so that it is their device copies that are updated by the reductions. Note the significance of mapping `sum1` on the second `target` construct; otherwise, it would be treated by default as firstprivate and the result computed for `sum1` in the prior `target` region may not be used. Alternatively, a `target` `update` construct could be used between the two `target` constructs to update the host version of `sum1` with the value that is in the corresponding device version after the completion of the first construct.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name: target_reduction.2.c
* type: C
* version: omp_5.0
*/
#include <stdio.h>
int f(int);
int g(int);
/*
 * Runs two target reductions inside one target data region. sum1 and
 * sum2 stay mapped for the whole region, so the reductions update their
 * device copies. The explicit map(sum1) on the second construct keeps
 * sum1 from being treated as firstprivate there.
 */
int main()
{
   int sum1=0, sum2=0;
   const int n = 100;

   #pragma omp target data map(sum1,sum2)
   {
      #pragma omp target teams distribute reduction(+:sum1)
      for (int i = 0; i < n; i++) {
         sum1 += f(i);
      }

      /* map(sum1) makes this region read the mapped device copy. */
      #pragma omp target teams distribute map(sum1) reduction(+:sum2)
      for (int i = 0; i < n; i++) {
         sum2 += g(i) * sum1;
      }
   }
   printf(  "sum1 = %d, sum2 = %d\n", sum1, sum2);
   //OUTPUT: sum1 = 9900, sum2 = 147015000
   return 0;
}

int f(int res){ return res*2; }
int g(int res){ return res*3; }



In [None]:

! name: target_reduction.2.f90
! type: F-free
! version: omp_5.0

program target_reduction_ex2
   interface
      function f(res)
             integer :: f, res
          end function
      function g(res)
             integer :: g, res
          end function
   end interface
   integer :: sum1, sum2, i
   integer, parameter :: n = 100
   sum1 = 0
   sum2 = 0
   !$omp target data map(sum1, sum2)
       !$omp target teams distribute reduction(+:sum1)
           do i=1,n
              sum1 = sum1 + f(i)
           end do
       !$omp target teams distribute map(sum1) reduction(+:sum2)
           do i=1,n
              sum2 = sum2 + g(i)*sum1
           end do
   !$omp end target data
   print *, "sum1 = ", sum1, ", sum2 = ", sum2
   !!OUTPUT: sum1 =     10100 , sum2 = 153015000
end program


integer function f(res)
   integer :: res
   f = res*2
end function
integer function g(res)
   integer :: res
   g = res*3
end function



### Task Reduction with Target Constructs

The following examples illustrate how task reductions can apply to target tasks that result from a `target` construct with the `in_reduction` clause. Here, the `in_reduction` clause specifies that the target task participates in the task reduction defined in the scope of the enclosing `taskgroup` construct. Partial results from all tasks participating in the task reduction will be combined (in some order) into the original variable listed in the `task_reduction` clause before exiting the `taskgroup` region.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name: target_task_reduction.1.c
* type: C
* version: omp_5.0
*/
#include <stdio.h>
#pragma omp declare target to(device_compute)
void device_compute(int *);
void host_compute(int *);
int main()
{
   int sum = 0;

   #pragma omp parallel master
   #pragma omp taskgroup task_reduction(+:sum)
   {
      #pragma omp target in_reduction(+:sum) nowait
          device_compute(&sum);

      #pragma omp task in_reduction(+:sum)
          host_compute(&sum);
   }
   printf(  "sum = %d\n", sum);
   //OUTPUT: sum = 2
   return 0;
}

void device_compute(int *sum){ *sum = 1; }
void   host_compute(int *sum){ *sum = 1; }



In [None]:

! name: target_task_reduction.1.f90
! type: F-free
! version: omp_5.0
program target_task_reduction_ex1
   interface
      subroutine device_compute(res)
      !$omp declare target to(device_compute)
        integer :: res
      end subroutine device_compute
      subroutine host_compute(res)
        integer :: res
      end subroutine host_compute
   end interface
   integer :: sum
   sum = 0
   !$omp parallel master
      !$omp taskgroup task_reduction(+:sum)
         !$omp target in_reduction(+:sum) nowait
            call device_compute(sum)
         !$omp end target
         !$omp task in_reduction(+:sum)
            call host_compute(sum)
         !$omp end task
      !$omp end taskgroup
   !$omp end parallel master
   print *, "sum = ", sum
   !!OUTPUT: sum = 2
end program

subroutine device_compute(sum)
   integer :: sum
   sum = 1
end subroutine
subroutine host_compute(sum)
   integer :: sum
   sum = 1
end subroutine



In the next pair of examples, the task reduction is defined by a `reduction` clause with the `task` modifier, rather than a `task_reduction` clause on a `taskgroup` construct. Again, the partial results from the participating tasks will be combined in some order into the original reduction variable, `sum`.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name: target_task_reduction.2.c
* type: C
* version: omp_5.0
*/
#include <stdio.h>
#pragma omp declare target to(device_compute)
extern void device_compute(int *);
extern void host_compute(int *);
int main()
{
   int sum = 0;

   #pragma omp parallel sections reduction(task, +:sum)
   {
      #pragma omp section
          {
             #pragma omp target in_reduction(+:sum)
             device_compute(&sum);
          }
      #pragma omp section
          {
             host_compute(&sum);
          }
   }
   printf(  "sum = %d\n", sum);
   //OUTPUT: sum = 2
   return 0;
}

void device_compute(int *sum){ *sum = 1; }
void   host_compute(int *sum){ *sum = 1; }



In [None]:

! name: target_task_reduction.2.f90
! type: F-free
! version: omp_5.0
program target_task_reduction_ex2
   interface
      subroutine device_compute(res)
      !$omp declare target to(device_compute)
        integer :: res
      end subroutine device_compute
      subroutine host_compute(res)
        integer :: res
      end subroutine host_compute
   end interface
   integer :: sum
   sum = 0
   !$omp parallel sections reduction(task,+:sum)
      !$omp section
         !$omp target in_reduction(+:sum) nowait
           call device_compute(sum)
         !$omp end target
      !$omp section
         call host_compute(sum)
   !$omp end parallel sections
   print *, "sum = ", sum
   !!OUTPUT: sum = 2
end program

subroutine device_compute(sum)
   integer :: sum
   sum = 1
end subroutine
subroutine host_compute(sum)
   integer :: sum
   sum = 1
end subroutine



Next, the `task` modifier is again used to define a task reduction over participating tasks. This time, the participating tasks are a target task resulting from a `target` construct with the `in_reduction` clause, and the implicit task (executing on the master thread) that calls `host_compute`. As before, the partial results from these participating tasks are combined in some order into the original reduction variable.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name: target_task_reduction.2b.c
* type: C
* version: omp_5.0
*/
#include <stdio.h>
#pragma omp declare target to(device_compute)
extern void device_compute(int *);
extern void host_compute(int *);
int main()
{
   int sum = 0;

   #pragma omp parallel master reduction(task, +:sum)
   {
       #pragma omp target in_reduction(+:sum) nowait
       device_compute(&sum);

       host_compute(&sum);
   }
   printf(  "sum = %d\n", sum);
   //OUTPUT: sum = 2
   return 0;
}

void device_compute(int *sum){ *sum = 1; }
void   host_compute(int *sum){ *sum = 1; }



In [None]:

! name: target_task_reduction.2b.f90
! type: F-free
! version: omp_5.0
! Task reduction defined by the task modifier on a parallel master
! construct. The participants are the target task created by the target
! construct (through in_reduction) and the implicit task that calls
! host_compute.
program target_task_reduction_ex2b
   interface
      subroutine device_compute(res)
      !$omp declare target to(device_compute)
        integer :: res
      end subroutine device_compute
      subroutine host_compute(res)
        integer :: res
      end subroutine host_compute
   end interface
   integer :: sum
   sum = 0
   !$omp parallel master reduction(task,+:sum)
         ! nowait lets the target task run alongside the host_compute call.
         !$omp target in_reduction(+:sum) nowait
           call device_compute(sum)
         !$omp end target
         call host_compute(sum)
   ! The end directive has to name the construct that was opened above.
   !$omp end parallel master
   print *, "sum = ", sum
   !!OUTPUT: sum = 2
end program


subroutine device_compute(sum)
   integer :: sum
   sum = 1
end subroutine
subroutine host_compute(sum)
   integer :: sum
   sum = 1
end subroutine




### Taskloop Reduction

In the OpenMP 5.0 Specification the `taskloop` construct was extended to include the reductions.

The following two examples show how to implement a reduction over an array using taskloop reduction in two different ways. In the first example we apply the `reduction` clause to the `taskloop` construct. As it was explained above in the task reduction examples, a reduction over tasks is divided in two components: the scope of the reduction, which is defined by a `taskgroup` region, and the tasks that participate in the reduction. In this example, the `reduction` clause defines both semantics. First, it specifies that the implicit `taskgroup` region associated with the `taskloop` construct is the scope of the reduction, and second, it defines all tasks created by the `taskloop` construct as participants of the reduction. About the first property, it is important to note that if we add the `nogroup` clause to the `taskloop` construct the code will be nonconforming, basically because we have a set of tasks that participate in a reduction that has not been defined.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name:       taskloop_reduction.1.c
* type:       C
* version:    omp_5.0
*/
#include <stdio.h>

/*
 * Returns the sum of v[0] .. v[n-1], computed by the tasks of a taskloop.
 * The reduction clause makes the implicit taskgroup of the taskloop the
 * scope of the reduction and every generated task a participant.
 */
int array_sum(int n, int *v) {
    int i;
    int res = 0;

    #pragma omp taskloop reduction(+: res)
    for(i = 0; i < n; ++i)
        res += v[i];

    return res;
}

int main(int argc, char *argv[]) {
    int n = 10;
    int v[10] = {1,2,3,4,5,6,7,8,9,10};

    #pragma omp parallel
    #pragma omp single
    {
        int res = array_sum(n, v);
        printf("The result is %d\n", res);
    }
    return 0;
}



In [None]:

! name: taskloop_reduction.1.f90
! type: F-free
! version:    omp_5.0

! Returns the sum of the n elements of v, computed by the tasks of a
! taskloop. The reduction clause makes the implicit taskgroup of the
! taskloop the scope of the reduction and every generated task a
! participant.
function array_sum(n, v) result(res)
    implicit none
    integer :: n, v(n), res
    integer :: i

    res = 0
    !$omp taskloop reduction(+: res)
    do i=1, n
        res = res + v(i)
    end do
    !$omp end taskloop

end function array_sum

program main
    implicit none
    integer :: n, v(10), res
    integer :: i

    integer, external :: array_sum

    n = 10
    do i=1, n
        v(i) = i
    end do

    !$omp parallel
    !$omp single
    res = array_sum(n, v)
    print *, "The result is", res
    !$omp end single
    !$omp end parallel
end program main



The second example computes exactly the same value as in the preceding _taskloop_reduction.1_  code section, but in a very different way. First, in the  _array_sum_  function a `taskgroup` region is created  that defines the scope of a new reduction using the `task_reduction` clause. After that, a task and also the tasks generated by a taskloop participate in  that reduction by using the `in_reduction` clause on the `task` and `taskloop` constructs, respectively.  Note that the `nogroup` clause was added to the `taskloop` construct. This is allowed because what is expressed with the `in_reduction` clause is different from what is expressed with the `reduction` clause. In one case the generated tasks are specified to participate in a previously  declared reduction (`in_reduction` clause) whereas in the other case creation of a new reduction is specified and also that all tasks generated  by the taskloop will participate in it.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name:       taskloop_reduction.2.c
* type:       C
* version:    omp_5.0
*/
#include <stdio.h>

/*
 * Returns the sum of v[0] .. v[n-1]. A taskgroup defines the reduction;
 * one explicit task adds v[0] and the tasks of a taskloop add the
 * remaining elements, all joining through in_reduction.
 */
int array_sum(int n, int *v) {
    int i;
    int res = 0;

    #pragma omp taskgroup task_reduction(+: res)
    {
        if (n > 0) {
            #pragma omp task in_reduction(+: res)
            res = res + v[0];

            /* nogroup is allowed: in_reduction joins the reduction of the
               enclosing taskgroup instead of creating a new one. */
            #pragma omp taskloop in_reduction(+: res) nogroup
            for(i = 1; i < n; ++i)
                res += v[i];
        }
    }

    return res;
}

int main(int argc, char *argv[]) {
    int n = 10;
    int v[10] = {1,2,3,4,5,6,7,8,9,10};

    #pragma omp parallel
    #pragma omp single
    {
        int res = array_sum(n, v);
        printf("The result is %d\n", res);
    }
    return 0;
}



In [None]:

! name: taskloop_reduction.2.f90
! type: F-free
! version:    omp_5.0

! Returns the sum of the n elements of v. A taskgroup defines the
! reduction; one explicit task adds v(1) and the tasks of a taskloop add
! the remaining elements, all joining through in_reduction.
function array_sum(n, v) result(res)
    implicit none
    integer :: n, v(n), res
    integer :: i

    res = 0
    !$omp taskgroup task_reduction(+: res)
    if (n > 0) then
        !$omp task in_reduction(+: res)
        res = res + v(1)
        !$omp end task

        ! nogroup is allowed: in_reduction joins the reduction of the
        ! enclosing taskgroup instead of creating a new one.
        !$omp taskloop in_reduction(+: res) nogroup
        do i=2, n
            res = res + v(i)
        end do
        !$omp end taskloop
    endif
    !$omp end taskgroup

end function array_sum

program main
    implicit none
    integer :: n, v(10), res
    integer :: i

    integer, external :: array_sum

    n = 10
    do i=1, n
        v(i) = i
    end do

    !$omp parallel
    !$omp single
    res = array_sum(n, v)
    print *, "The result is", res
    !$omp end single
    !$omp end parallel
end program main



In the OpenMP 5.0 Specification, `reduction` clauses for the `taskloop` `simd` construct were also added.

The examples below compare reductions for the `taskloop` and the `taskloop` `simd` constructs. These examples illustrate the use of `reduction` clauses within  "stand-alone" `taskloop` constructs, and the use of `in_reduction` clauses for tasks of taskloops to participate with other reductions within the scope of a parallel region.

**taskloop reductions:**

In the  _taskloop reductions_  section of the example below,   _taskloop 1_  uses the `reduction` clause  in a `taskloop` construct for a sum reduction, accumulated in  _asum_ .  The behavior is as though a `taskgroup` construct encloses the  taskloop region with a `task_reduction` clause, and each taskloop task has an `in_reduction` clause with the specifications  of the `reduction` clause. At the end of the taskloop region  _asum_  contains the result of the reduction.

The next taskloop,  _taskloop 2_ , illustrates the use of the  `in_reduction` clause to participate in a previously defined reduction scope of a `parallel` construct.

The task reductions of  _task 2_  and  _taskloop 2_  are combined across the `taskloop` construct and the single `task` construct, as specified in the `reduction(task,` `+:asum)` clause of the `parallel` construct. At the end of the parallel region  _asum_  contains the combined result of all reductions.

**taskloop simd reductions:**

Reductions for the `taskloop` `simd` construct are shown in the second half of the code. Since each component construct, `taskloop` and `simd`,  can accept a reduction-type clause, the `taskloop` `simd` construct is a composite construct, and the specific application of the reduction clause is defined within the `taskloop` `simd` construct section of the OpenMP 5.0 Specification. The code below illustrates use cases for these reductions.

In the  _taskloop simd reduction_  section of the example below,  _taskloop simd 3_  uses the `reduction` clause  in a `taskloop` `simd` construct for a sum reduction within a loop. For this case a `reduction` clause is used, as one would use  for a `simd` construct. The SIMD reductions of each task are combined, and the results of these tasks are further  combined just as in the `taskloop` construct with the `reduction` clause for  _taskloop 1_ . At the end of the taskloop region  _asum_  contains the combined result of all reductions.

If a `taskloop` `simd` construct is to participate in a previously defined  reduction scope, the reduction participation should be specified with an `in_reduction` clause, as shown in the `parallel` region enclosing  _task 4_  and  _taskloop simd 4_  code sections.

Here the `taskloop` `simd` construct's  `in_reduction` clause specifies participation of the construct's tasks as  a task reduction within the scope of the parallel region.   That is, the results of each task of the `taskloop` construct component  contribute to the reduction in a broader level, just as in  _parallel reduction a_  code section above. Also, each `simd`-component construct occurs as if it has a `reduction` clause, and the SIMD results of each task are combined as though to form a single result for each task (that participates in the `in_reduction` clause). At the end of the parallel region  _asum_  contains the combined result of all reductions.

In [None]:
//%compiler: clang
//%cflags: -fopenmp

/*
* name: taskloop_simd_reduction.1c
* type: C
* version: omp_5.0
*/

#include <stdio.h>
#define N 100

int main(){
  int i, a[N], asum=0;

  for(i=0;i<N;i++) a[i]=i;

// taskloop reductions

  #pragma omp parallel master
  #pragma omp taskloop reduction(+:asum)                //taskloop 1
    for(i=0;i<N;i++){ asum += a[i]; }


  #pragma omp parallel reduction(task, +:asum)          // parallel reduction a
  {
     #pragma omp master
     #pragma omp task            in_reduction(+:asum)    //task 2
       for(i=0;i<N;i++){ asum += a[i]; }

     #pragma omp master taskloop in_reduction(+:asum)    //taskloop 2
       for(i=0;i<N;i++){ asum += a[i]; }
  }

// taskloop simd reductions

  #pragma omp parallel master
  #pragma omp taskloop simd reduction(+:asum)            //taskloop simd 3
    for(i=0;i<N;i++){ asum += a[i]; }


  #pragma omp parallel reduction(task, +:asum)          // parallel reduction b
  {
     #pragma omp master
     #pragma omp task                 in_reduction(+:asum) //task 4
       for(i=0;i<N;i++){ asum += a[i]; }

     #pragma omp master taskloop simd in_reduction(+:asum) //taskloop simd 4
       for(i=0;i<N;i++){ asum += a[i]; }

  }

  printf("asum=%d \n",asum); // output: asum=29700
}



In [None]:

! name: taskloop_simd_reduction.1f90
! type:       F-free
! version:    omp_5.0

program main

  use omp_lib
  integer, parameter ::  N=100
  integer            :: i, a(N), asum=0

  a = [( i, i=1,N )]    !! initialize

!! taskloop reductions

  !$omp parallel master
  !$omp taskloop reduction(+:asum)                     !! taskloop 1
    do i=1,N;  asum = asum + a(i);  enddo
  !$omp end taskloop
  !$omp end parallel master


  !$omp parallel reduction(task, +:asum)               !! parallel reduction a

     !$omp master
     !$omp task            in_reduction(+:asum)        !! task 2
       do i=1,N;  asum = asum + a(i);  enddo
     !$omp end task
     !$omp end master

     !$omp master taskloop in_reduction(+:asum)        !! taskloop 2
       do i=1,N;  asum = asum + a(i);  enddo
     !$omp end master taskloop

  !$omp end parallel

!! taskloop simd reductions

  !$omp parallel master
  !$omp taskloop simd reduction(+:asum)                !! taskloop simd 3
    do i=1,N;  asum = asum + a(i);  enddo
  !$omp end taskloop simd
  !$omp end parallel master


  !$omp parallel reduction(task, +:asum)               !! parallel reduction b

    !$omp master
    !$omp task                 in_reduction(+:asum)    !! task 4
       do i=1,N;  asum = asum + a(i);  enddo
    !$omp end task
    !$omp end master

    !$omp master taskloop simd in_reduction(+:asum)    !! taskloop simd 4
       do i=1,N;  asum = asum + a(i);  enddo
    !$omp end master taskloop simd

  !$omp end parallel

  print*,"asum=",asum   !! output: asum=30300

end program

