Skip to content

Commit

Permalink
Consistent timing of kernels, including MPI times, and printing std d…
Browse files Browse the repository at this point in the history
…eviation
  • Loading branch information
reguly committed Aug 21, 2013
1 parent b62c8d0 commit d1b8e18
Show file tree
Hide file tree
Showing 44 changed files with 801 additions and 2,108 deletions.
112 changes: 25 additions & 87 deletions apps/fortran/airfoil/airfoil_hdf5/dp/adt_calc_kernel.CUF
@@ -1,5 +1,5 @@
!
! auto-generated by op2.py on 2013-08-21 09:50
! auto-generated by op2.py on 2013-08-21 13:15
!

MODULE ADT_CALC_MODULE
Expand All @@ -17,9 +17,6 @@ USE CUDACONFIGURATIONPARAMS

! adt_calc variable declarations

REAL(kind=4) :: loopTimeHostadt_calc
REAL(kind=4) :: loopTimeKerneladt_calc
INTEGER(kind=4) :: numberCalledadt_calc

TYPE ( c_ptr ) :: planRet_adt_calc

Expand Down Expand Up @@ -180,6 +177,10 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
TYPE ( op_arg ) , DIMENSION(6) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
INTEGER(kind=4) :: n_upper
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
REAL(kind=8) :: startTime
REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming


Expand Down Expand Up @@ -227,14 +228,6 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
INTEGER(kind=4), SAVE :: calledTimes

INTEGER(kind=4) :: istat
REAL(kind=4) :: accumulatorHostTime
REAL(kind=4) :: accumulatorHExchTime
REAL(kind=4) :: accumulatorKernelTime
REAL(kind=8) :: KT_double
TYPE ( cudaEvent ) :: startTimeHost
TYPE ( cudaEvent ) :: endTimeHost
TYPE ( cudaEvent ) :: startTimeKernel
TYPE ( cudaEvent ) :: endTimeKernel

numberOfOpDats = 6

Expand All @@ -247,22 +240,14 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &

returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)

istat = cudaEventCreate(startTimeHost)
istat = cudaEventCreate(endTimeHost)
istat = cudaEventRecord(startTimeHost,0)
call date_and_time(values=timeArrayStart)
startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)

n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)

istat = cudaEventRecord(endTimeHost,0)
istat = cudaEventSynchronize(endTimeHost)
istat = cudaEventElapsedTime(accumulatorHExchTime,startTimeHost,endTimeHost)
loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHExchTime

istat = cudaEventCreate(startTimeKernel)
istat = cudaEventCreate(endTimeKernel)
numberCalledadt_calc = numberCalledadt_calc + 1

indirectionDescriptorArray(1) = 0
indirectionDescriptorArray(2) = 0
indirectionDescriptorArray(3) = 0
Expand Down Expand Up @@ -307,12 +292,6 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
pthrcolSize = set%setPtr%size
CALL c_f_pointer(actualPlan_adt_calc%thrcol,pthrcol,(/pthrcolSize/))

istat = cudaEventRecord(endTimeHost,0)
istat = cudaEventSynchronize(endTimeHost)
istat = cudaEventElapsedTime(accumulatorHostTime,startTimeHost,endTimeHost)

loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime
istat = cudaEventRecord(startTimeKernel,0)

blockOffset = 0

Expand Down Expand Up @@ -343,17 +322,17 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
END IF


istat = cudaEventRecord(endTimeKernel,0)
istat = cudaEventSynchronize(endTimeKernel)
istat = cudaEventElapsedTime(accumulatorKernelTime,startTimeKernel,endTimeKernel)
loopTimeKerneladt_calc = loopTimeKerneladt_calc + accumulatorKernelTime


CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)

KT_double = REAL((accumulatorKernelTime) / 1000.00)
istat = cudaDeviceSynchronize()
call date_and_time(values=timeArrayEnd)
endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)

returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& KT_double, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
& (endTime-startTime) / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
calledTimes = calledTimes + 1
END SUBROUTINE

Expand Down Expand Up @@ -430,12 +409,8 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
REAL(kind=8) :: startTimeHost
REAL(kind=8) :: endTimeHost
REAL(kind=8) :: startTimeKernel
REAL(kind=8) :: endTimeKernel
REAL(kind=8) :: accumulatorHostTime
REAL(kind=8) :: accumulatorKernelTime
REAL(kind=8) :: startTime
REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
LOGICAL :: firstTime_adt_calc = .TRUE.
type ( c_ptr ) :: planRet_adt_calc
Expand Down Expand Up @@ -467,16 +442,14 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &

returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)

n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
numberCalledadt_calc = numberCalledadt_calc+ 1

call date_and_time(values=timeArrayStart)
startTimeHost = 1.00000 * timeArrayStart(8) + &
startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)

n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)

#ifdef OP_PART_SIZE_1
partitionSize = OP_PART_SIZE_1
#else
Expand Down Expand Up @@ -524,22 +497,6 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &
CALL c_f_pointer(opArg6%data,opDat6Local,(/opDat6Cardinality/))



call date_and_time(values=timeArrayEnd)
endTimeHost = 1.00000 * timeArrayEnd(8) + &
& 1000 * timeArrayEnd(7) + &
& 60000 * timeArrayEnd(6) + &
& 3600000 * timeArrayEnd(5)

accumulatorHostTime = endTimeHost - startTimeHost
loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime

call date_and_time(values=timeArrayStart)
startTimeKernel = 1.00000 * timeArrayStart(8) + &
& 1000 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)

blockOffset = 0

DO i1 = 0, actualPlan_adt_calc%ncolors - 1, 1
Expand Down Expand Up @@ -569,34 +526,15 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF


call date_and_time(values=timeArrayEnd)
endTimeKernel = 1.00000 * timeArrayEnd(8) + &
& 1000 * timeArrayEnd(7) + &
& 60000 * timeArrayEnd(6) + &
& 3600000 * timeArrayEnd(5)

accumulatorKernelTime = endTimeKernel - startTimeKernel
loopTimeKerneladt_calc = loopTimeKerneladt_calc + accumulatorKernelTime

call date_and_time(values=timeArrayStart)
startTimeHost = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)

CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)

call date_and_time(values=timeArrayEnd)
endTimeHost = 1.00000 * timeArrayEnd(8) + &
endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)

accumulatorHostTime = endTimeHost - startTimeHost
loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime

returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& accumulatorKernelTime / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
& (endTime-startTime) / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
END SUBROUTINE
END MODULE
60 changes: 8 additions & 52 deletions apps/fortran/airfoil/airfoil_hdf5/dp/adt_calc_kernel.F90
@@ -1,5 +1,5 @@
!
! auto-generated by op2.py on 2013-08-21 09:50
! auto-generated by op2.py on 2013-08-21 13:15
!

MODULE ADT_CALC_MODULE
Expand All @@ -11,9 +11,6 @@ MODULE ADT_CALC_MODULE
#ifdef _OPENMP
USE OMP_LIB
#endif
REAL(kind=4) :: loopTimeHostadt_calc
REAL(kind=4) :: loopTimeKerneladt_calc
INTEGER(kind=4) :: numberCalledadt_calc


CONTAINS
Expand Down Expand Up @@ -92,12 +89,8 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
REAL(kind=8) :: startTimeHost
REAL(kind=8) :: endTimeHost
REAL(kind=8) :: startTimeKernel
REAL(kind=8) :: endTimeKernel
REAL(kind=8) :: accumulatorHostTime
REAL(kind=8) :: accumulatorKernelTime
REAL(kind=8) :: startTime
REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
LOGICAL :: firstTime_adt_calc = .TRUE.
type ( c_ptr ) :: planRet_adt_calc
Expand Down Expand Up @@ -129,16 +122,14 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &

returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)

n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
numberCalledadt_calc = numberCalledadt_calc+ 1

call date_and_time(values=timeArrayStart)
startTimeHost = 1.00000 * timeArrayStart(8) + &
startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)

n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)

#ifdef OP_PART_SIZE_1
partitionSize = OP_PART_SIZE_1
#else
Expand Down Expand Up @@ -186,22 +177,6 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
CALL c_f_pointer(opArg6%data,opDat6Local,(/opDat6Cardinality/))



call date_and_time(values=timeArrayEnd)
endTimeHost = 1.00000 * timeArrayEnd(8) + &
& 1000 * timeArrayEnd(7) + &
& 60000 * timeArrayEnd(6) + &
& 3600000 * timeArrayEnd(5)

accumulatorHostTime = endTimeHost - startTimeHost
loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime

call date_and_time(values=timeArrayStart)
startTimeKernel = 1.00000 * timeArrayStart(8) + &
& 1000 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)

blockOffset = 0

DO i1 = 0, actualPlan_adt_calc%ncolors-1, 1
Expand Down Expand Up @@ -231,34 +206,15 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF


call date_and_time(values=timeArrayEnd)
endTimeKernel = 1.00000 * timeArrayEnd(8) + &
& 1000 * timeArrayEnd(7) + &
& 60000 * timeArrayEnd(6) + &
& 3600000 * timeArrayEnd(5)

accumulatorKernelTime = endTimeKernel - startTimeKernel
loopTimeKerneladt_calc = loopTimeKerneladt_calc + accumulatorKernelTime

call date_and_time(values=timeArrayStart)
startTimeHost = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)

CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)

call date_and_time(values=timeArrayEnd)
endTimeHost = 1.00000 * timeArrayEnd(8) + &
endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)

accumulatorHostTime = endTimeHost - startTimeHost
loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime

returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& accumulatorKernelTime / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
& (endTime-startTime) / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
END SUBROUTINE
END MODULE
23 changes: 22 additions & 1 deletion apps/fortran/airfoil/airfoil_hdf5/dp/adt_calc_seqkernel.F90
@@ -1,5 +1,5 @@
!
! auto-generated by op2.py on 2013-08-21 09:50
! auto-generated by op2.py on 2013-08-21 13:15
!

MODULE ADT_CALC_MODULE
Expand Down Expand Up @@ -67,6 +67,11 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &

type ( op_arg ) , DIMENSION(6) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
REAL(kind=8) :: startTime
REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
INTEGER(kind=4) :: n_upper
type ( op_set_core ) , POINTER :: opSetCore

Expand All @@ -93,6 +98,14 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
opArgArray(5) = opArg5
opArgArray(6) = opArg6

returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
call date_and_time(values=timeArrayStart)
startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)

n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)

opSetCore => set%setPtr
Expand All @@ -118,5 +131,13 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &

CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)

call date_and_time(values=timeArrayEnd)
endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)

returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& (endTime-startTime) / 1000.00,0.00000,0.00000, 1)
END SUBROUTINE
END MODULE

0 comments on commit d1b8e18

Please sign in to comment.