Browse files

Consistent timing of kernels, including MPI times, and printing std deviation
  • Loading branch information...
1 parent b62c8d0 commit d1b8e1866061045ebae1790587eae84ee1f27751 @reguly reguly committed Aug 21, 2013
Showing with 801 additions and 2,108 deletions.
  1. +25 −87 apps/fortran/airfoil/airfoil_hdf5/dp/adt_calc_kernel.CUF
  2. +8 −52 apps/fortran/airfoil/airfoil_hdf5/dp/adt_calc_kernel.F90
  3. +22 −1 apps/fortran/airfoil/airfoil_hdf5/dp/adt_calc_seqkernel.F90
  4. +25 −87 apps/fortran/airfoil/airfoil_hdf5/dp/bres_calc_kernel.CUF
  5. +8 −52 apps/fortran/airfoil/airfoil_hdf5/dp/bres_calc_kernel.F90
  6. +22 −1 apps/fortran/airfoil/airfoil_hdf5/dp/bres_calc_seqkernel.F90
  7. +25 −87 apps/fortran/airfoil/airfoil_hdf5/dp/res_calc_kernel.CUF
  8. +8 −52 apps/fortran/airfoil/airfoil_hdf5/dp/res_calc_kernel.F90
  9. +22 −1 apps/fortran/airfoil/airfoil_hdf5/dp/res_calc_seqkernel.F90
  10. +25 −87 apps/fortran/airfoil/airfoil_hdf5/dp/save_soln_kernel.CUF
  11. +8 −52 apps/fortran/airfoil/airfoil_hdf5/dp/save_soln_kernel.F90
  12. +22 −1 apps/fortran/airfoil/airfoil_hdf5/dp/save_soln_seqkernel.F90
  13. +25 −92 apps/fortran/airfoil/airfoil_hdf5/dp/update_kernel.CUF
  14. +8 −52 apps/fortran/airfoil/airfoil_hdf5/dp/update_kernel.F90
  15. +22 −1 apps/fortran/airfoil/airfoil_hdf5/dp/update_seqkernel.F90
  16. +25 −87 apps/fortran/airfoil/airfoil_plain/dp/adt_calc_kernel.CUF
  17. +8 −52 apps/fortran/airfoil/airfoil_plain/dp/adt_calc_kernel.F90
  18. +25 −87 apps/fortran/airfoil/airfoil_plain/dp/bres_calc_kernel.CUF
  19. +8 −52 apps/fortran/airfoil/airfoil_plain/dp/bres_calc_kernel.F90
  20. +25 −87 apps/fortran/airfoil/airfoil_plain/dp/res_calc_kernel.CUF
  21. +8 −52 apps/fortran/airfoil/airfoil_plain/dp/res_calc_kernel.F90
  22. +25 −87 apps/fortran/airfoil/airfoil_plain/dp/save_soln_kernel.CUF
  23. +8 −52 apps/fortran/airfoil/airfoil_plain/dp/save_soln_kernel.F90
  24. +25 −92 apps/fortran/airfoil/airfoil_plain/dp/update_kernel.CUF
  25. +8 −52 apps/fortran/airfoil/airfoil_plain/dp/update_kernel.F90
  26. +1 −1 op2/c/include/op_lib_c.h
  27. +3 −0 op2/c/include/op_lib_core.h
  28. +5 −0 op2/c/src/core/op_dummy_singlenode.c
  29. +25 −12 op2/c/src/core/op_lib_core.c
  30. +43 −1 op2/c/src/mpi/op_mpi_core.c
  31. +3 −1 op2/c/src/mpi/op_mpi_decl.c
  32. +17 −0 op2/fortran/src/op2_for_declarations.F90
  33. +2 −2 translator/fortran/python/op2_fortran.py
  34. +20 −86 translator/fortran/python/op2_gen_cuda.py
  35. +135 −165 translator/fortran/python/op2_gen_cudaINC.py
  36. +13 −51 translator/fortran/python/op2_gen_cuda_old.py
  37. +19 −109 translator/fortran/python/op2_gen_cuda_permute.py
  38. +13 −1 translator/fortran/python/op2_gen_mpiseq.py
  39. +13 −1 translator/fortran/python/op2_gen_mpiseq2.py
  40. +13 −1 translator/fortran/python/op2_gen_mpiseq3.py
  41. +10 −72 translator/fortran/python/op2_gen_openmp.py
  42. +8 −64 translator/fortran/python/op2_gen_openmp2.py
  43. +8 −62 translator/fortran/python/op2_gen_openmp3.py
  44. +10 −74 translator/fortran/python/op2_gen_openmpINC.py
View
112 apps/fortran/airfoil/airfoil_hdf5/dp/adt_calc_kernel.CUF
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE ADT_CALC_MODULE
@@ -17,9 +17,6 @@ USE CUDACONFIGURATIONPARAMS
! adt_calcvariable declarations
-REAL(kind=4) :: loopTimeHostadt_calc
-REAL(kind=4) :: loopTimeKerneladt_calc
-INTEGER(kind=4) :: numberCalledadt_calc
TYPE ( c_ptr ) :: planRet_adt_calc
@@ -180,6 +177,10 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
TYPE ( op_arg ) , DIMENSION(6) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
INTEGER(kind=4) :: n_upper
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
@@ -227,14 +228,6 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
INTEGER(kind=4), SAVE :: calledTimes
INTEGER(kind=4) :: istat
- REAL(kind=4) :: accumulatorHostTime
- REAL(kind=4) :: accumulatorHExchTime
- REAL(kind=4) :: accumulatorKernelTime
- REAL(kind=8) :: KT_double
- TYPE ( cudaEvent ) :: startTimeHost
- TYPE ( cudaEvent ) :: endTimeHost
- TYPE ( cudaEvent ) :: startTimeKernel
- TYPE ( cudaEvent ) :: endTimeKernel
numberOfOpDats = 6
@@ -247,22 +240,14 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- istat = cudaEventCreate(startTimeHost)
- istat = cudaEventCreate(endTimeHost)
- istat = cudaEventRecord(startTimeHost,0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHExchTime,startTimeHost,endTimeHost)
- loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHExchTime
-
- istat = cudaEventCreate(startTimeKernel)
- istat = cudaEventCreate(endTimeKernel)
- numberCalledadt_calc = numberCalledadt_calc + 1
-
indirectionDescriptorArray(1) = 0
indirectionDescriptorArray(2) = 0
indirectionDescriptorArray(3) = 0
@@ -307,12 +292,6 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
pthrcolSize = set%setPtr%size
CALL c_f_pointer(actualPlan_adt_calc%thrcol,pthrcol,(/pthrcolSize/))
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHostTime,startTimeHost,endTimeHost)
-
- loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime
- istat = cudaEventRecord(startTimeKernel,0)
blockOffset = 0
@@ -343,17 +322,17 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
END IF
- istat = cudaEventRecord(endTimeKernel,0)
- istat = cudaEventSynchronize(endTimeKernel)
- istat = cudaEventElapsedTime(accumulatorKernelTime,startTimeKernel,endTimeKernel)
- loopTimeKerneladt_calc = loopTimeKerneladt_calc + accumulatorKernelTime
-
-
CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)
- KT_double = REAL((accumulatorKernelTime) / 1000.00)
+ istat = cudaDeviceSynchronize()
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
- & KT_double, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
+ & (endTime-startTime) / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
calledTimes = calledTimes + 1
END SUBROUTINE
@@ -430,12 +409,8 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
LOGICAL :: firstTime_adt_calc = .TRUE.
type ( c_ptr ) :: planRet_adt_calc
@@ -467,16 +442,14 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledadt_calc = numberCalledadt_calc+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef OP_PART_SIZE_1
partitionSize = OP_PART_SIZE_1
#else
@@ -524,22 +497,6 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &
CALL c_f_pointer(opArg6%data,opDat6Local,(/opDat6Cardinality/))
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
blockOffset = 0
DO i1 = 0, actualPlan_adt_calc%ncolors - 1, 1
@@ -569,34 +526,15 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF
-
- call date_and_time(values=timeArrayEnd)
- endTimeKernel = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorKernelTime = endTimeKernel - startTimeKernel
- loopTimeKerneladt_calc = loopTimeKerneladt_calc + accumulatorKernelTime
-
- call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
- & 1000.00 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
+ endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime
-
returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
- & accumulatorKernelTime / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
+ & (endTime-startTime) / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
END SUBROUTINE
END MODULE
View
60 apps/fortran/airfoil/airfoil_hdf5/dp/adt_calc_kernel.F90
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE ADT_CALC_MODULE
@@ -11,9 +11,6 @@ MODULE ADT_CALC_MODULE
#ifdef _OPENMP
USE OMP_LIB
#endif
-REAL(kind=4) :: loopTimeHostadt_calc
-REAL(kind=4) :: loopTimeKerneladt_calc
-INTEGER(kind=4) :: numberCalledadt_calc
CONTAINS
@@ -92,12 +89,8 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
LOGICAL :: firstTime_adt_calc = .TRUE.
type ( c_ptr ) :: planRet_adt_calc
@@ -129,16 +122,14 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledadt_calc = numberCalledadt_calc+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef OP_PART_SIZE_1
partitionSize = OP_PART_SIZE_1
#else
@@ -186,22 +177,6 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
CALL c_f_pointer(opArg6%data,opDat6Local,(/opDat6Cardinality/))
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
blockOffset = 0
DO i1 = 0, actualPlan_adt_calc%ncolors-1, 1
@@ -231,34 +206,15 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF
-
- call date_and_time(values=timeArrayEnd)
- endTimeKernel = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorKernelTime = endTimeKernel - startTimeKernel
- loopTimeKerneladt_calc = loopTimeKerneladt_calc + accumulatorKernelTime
-
- call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
- & 1000.00 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
+ endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime
-
returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
- & accumulatorKernelTime / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
+ & (endTime-startTime) / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
END SUBROUTINE
END MODULE
View
23 apps/fortran/airfoil/airfoil_hdf5/dp/adt_calc_seqkernel.F90
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE ADT_CALC_MODULE
@@ -67,6 +67,11 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
type ( op_arg ) , DIMENSION(6) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
+ INTEGER(kind=4) :: returnSetKernelTiming
INTEGER(kind=4) :: n_upper
type ( op_set_core ) , POINTER :: opSetCore
@@ -93,6 +98,14 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
opArgArray(5) = opArg5
opArgArray(6) = opArg6
+ returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
+ & 0.d0, 0.00000,0.00000, 0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
+
n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
opSetCore => set%setPtr
@@ -118,5 +131,13 @@ SUBROUTINE adt_calc_host( userSubroutine, set, &
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
+ returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
+ & (endTime-startTime) / 1000.00,0.00000,0.00000, 1)
END SUBROUTINE
END MODULE
View
112 apps/fortran/airfoil/airfoil_hdf5/dp/bres_calc_kernel.CUF
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE BRES_CALC_MODULE
@@ -17,9 +17,6 @@ USE CUDACONFIGURATIONPARAMS
! bres_calcvariable declarations
-REAL(kind=4) :: loopTimeHostbres_calc
-REAL(kind=4) :: loopTimeKernelbres_calc
-INTEGER(kind=4) :: numberCalledbres_calc
TYPE ( c_ptr ) :: planRet_bres_calc
@@ -203,6 +200,10 @@ attributes (host) SUBROUTINE bres_calc_host_gpu( userSubroutine, set, &
TYPE ( op_arg ) , DIMENSION(6) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
INTEGER(kind=4) :: n_upper
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
@@ -263,14 +264,6 @@ attributes (host) SUBROUTINE bres_calc_host_gpu( userSubroutine, set, &
INTEGER(kind=4), SAVE :: calledTimes
INTEGER(kind=4) :: istat
- REAL(kind=4) :: accumulatorHostTime
- REAL(kind=4) :: accumulatorHExchTime
- REAL(kind=4) :: accumulatorKernelTime
- REAL(kind=8) :: KT_double
- TYPE ( cudaEvent ) :: startTimeHost
- TYPE ( cudaEvent ) :: endTimeHost
- TYPE ( cudaEvent ) :: startTimeKernel
- TYPE ( cudaEvent ) :: endTimeKernel
numberOfOpDats = 6
@@ -283,22 +276,14 @@ attributes (host) SUBROUTINE bres_calc_host_gpu( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(3 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- istat = cudaEventCreate(startTimeHost)
- istat = cudaEventCreate(endTimeHost)
- istat = cudaEventRecord(startTimeHost,0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHExchTime,startTimeHost,endTimeHost)
- loopTimeHostbres_calc = loopTimeHostbres_calc + accumulatorHExchTime
-
- istat = cudaEventCreate(startTimeKernel)
- istat = cudaEventCreate(endTimeKernel)
- numberCalledbres_calc = numberCalledbres_calc + 1
-
indirectionDescriptorArray(1) = 0
indirectionDescriptorArray(2) = 0
indirectionDescriptorArray(3) = 1
@@ -353,12 +338,6 @@ attributes (host) SUBROUTINE bres_calc_host_gpu( userSubroutine, set, &
pthrcolSize = set%setPtr%size
CALL c_f_pointer(actualPlan_bres_calc%thrcol,pthrcol,(/pthrcolSize/))
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHostTime,startTimeHost,endTimeHost)
-
- loopTimeHostbres_calc = loopTimeHostbres_calc + accumulatorHostTime
- istat = cudaEventRecord(startTimeKernel,0)
blockOffset = 0
@@ -392,17 +371,17 @@ attributes (host) SUBROUTINE bres_calc_host_gpu( userSubroutine, set, &
END IF
- istat = cudaEventRecord(endTimeKernel,0)
- istat = cudaEventSynchronize(endTimeKernel)
- istat = cudaEventElapsedTime(accumulatorKernelTime,startTimeKernel,endTimeKernel)
- loopTimeKernelbres_calc = loopTimeKernelbres_calc + accumulatorKernelTime
-
-
CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)
- KT_double = REAL((accumulatorKernelTime) / 1000.00)
+ istat = cudaDeviceSynchronize()
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
returnSetKernelTiming = setKernelTime(3 , userSubroutine//C_NULL_CHAR, &
- & KT_double, actualPlan_bres_calc%transfer,actualPlan_bres_calc%transfer2, 1)
+ & (endTime-startTime) / 1000.00, actualPlan_bres_calc%transfer,actualPlan_bres_calc%transfer2, 1)
calledTimes = calledTimes + 1
END SUBROUTINE
@@ -498,12 +477,8 @@ SUBROUTINE bres_calc_host_cpu( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
LOGICAL :: firstTime_bres_calc = .TRUE.
type ( c_ptr ) :: planRet_bres_calc
@@ -535,16 +510,14 @@ SUBROUTINE bres_calc_host_cpu( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(3 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledbres_calc = numberCalledbres_calc+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef OP_PART_SIZE_1
partitionSize = OP_PART_SIZE_1
#else
@@ -602,22 +575,6 @@ SUBROUTINE bres_calc_host_cpu( userSubroutine, set, &
CALL c_f_pointer(opArg6%data,opDat6Local,(/opDat6Cardinality/))
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostbres_calc = loopTimeHostbres_calc + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
blockOffset = 0
DO i1 = 0, actualPlan_bres_calc%ncolors - 1, 1
@@ -651,34 +608,15 @@ SUBROUTINE bres_calc_host_cpu( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF
-
- call date_and_time(values=timeArrayEnd)
- endTimeKernel = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorKernelTime = endTimeKernel - startTimeKernel
- loopTimeKernelbres_calc = loopTimeKernelbres_calc + accumulatorKernelTime
-
- call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
- & 1000.00 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
+ endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostbres_calc = loopTimeHostbres_calc + accumulatorHostTime
-
returnSetKernelTiming = setKernelTime(3 , userSubroutine//C_NULL_CHAR, &
- & accumulatorKernelTime / 1000.00, actualPlan_bres_calc%transfer,actualPlan_bres_calc%transfer2, 1)
+ & (endTime-startTime) / 1000.00, actualPlan_bres_calc%transfer,actualPlan_bres_calc%transfer2, 1)
END SUBROUTINE
END MODULE
View
60 apps/fortran/airfoil/airfoil_hdf5/dp/bres_calc_kernel.F90
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE BRES_CALC_MODULE
@@ -11,9 +11,6 @@ MODULE BRES_CALC_MODULE
#ifdef _OPENMP
USE OMP_LIB
#endif
-REAL(kind=4) :: loopTimeHostbres_calc
-REAL(kind=4) :: loopTimeKernelbres_calc
-INTEGER(kind=4) :: numberCalledbres_calc
CONTAINS
@@ -111,12 +108,8 @@ SUBROUTINE bres_calc_host( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
LOGICAL :: firstTime_bres_calc = .TRUE.
type ( c_ptr ) :: planRet_bres_calc
@@ -148,16 +141,14 @@ SUBROUTINE bres_calc_host( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(3 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledbres_calc = numberCalledbres_calc+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef OP_PART_SIZE_1
partitionSize = OP_PART_SIZE_1
#else
@@ -215,22 +206,6 @@ SUBROUTINE bres_calc_host( userSubroutine, set, &
CALL c_f_pointer(opArg6%data,opDat6Local,(/opDat6Cardinality/))
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostbres_calc = loopTimeHostbres_calc + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
blockOffset = 0
DO i1 = 0, actualPlan_bres_calc%ncolors-1, 1
@@ -264,34 +239,15 @@ SUBROUTINE bres_calc_host( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF
-
- call date_and_time(values=timeArrayEnd)
- endTimeKernel = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorKernelTime = endTimeKernel - startTimeKernel
- loopTimeKernelbres_calc = loopTimeKernelbres_calc + accumulatorKernelTime
-
- call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
- & 1000.00 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
+ endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostbres_calc = loopTimeHostbres_calc + accumulatorHostTime
-
returnSetKernelTiming = setKernelTime(3 , userSubroutine//C_NULL_CHAR, &
- & accumulatorKernelTime / 1000.00, actualPlan_bres_calc%transfer,actualPlan_bres_calc%transfer2, 1)
+ & (endTime-startTime) / 1000.00, actualPlan_bres_calc%transfer,actualPlan_bres_calc%transfer2, 1)
END SUBROUTINE
END MODULE
View
23 apps/fortran/airfoil/airfoil_hdf5/dp/bres_calc_seqkernel.F90
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE BRES_CALC_MODULE
@@ -74,6 +74,11 @@ SUBROUTINE bres_calc_host( userSubroutine, set, &
type ( op_arg ) , DIMENSION(6) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
+ INTEGER(kind=4) :: returnSetKernelTiming
INTEGER(kind=4) :: n_upper
type ( op_set_core ) , POINTER :: opSetCore
@@ -112,6 +117,14 @@ SUBROUTINE bres_calc_host( userSubroutine, set, &
opArgArray(5) = opArg5
opArgArray(6) = opArg6
+ returnSetKernelTiming = setKernelTime(3 , userSubroutine//C_NULL_CHAR, &
+ & 0.d0, 0.00000,0.00000, 0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
+
n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
opSetCore => set%setPtr
@@ -151,5 +164,13 @@ SUBROUTINE bres_calc_host( userSubroutine, set, &
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
+ returnSetKernelTiming = setKernelTime(3 , userSubroutine//C_NULL_CHAR, &
+ & (endTime-startTime) / 1000.00,0.00000,0.00000, 1)
END SUBROUTINE
END MODULE
View
112 apps/fortran/airfoil/airfoil_hdf5/dp/res_calc_kernel.CUF
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE RES_CALC_MODULE
@@ -17,9 +17,6 @@ USE CUDACONFIGURATIONPARAMS
! res_calcvariable declarations
-REAL(kind=4) :: loopTimeHostres_calc
-REAL(kind=4) :: loopTimeKernelres_calc
-INTEGER(kind=4) :: numberCalledres_calc
TYPE ( c_ptr ) :: planRet_res_calc
@@ -225,6 +222,10 @@ attributes (host) SUBROUTINE res_calc_host_gpu( userSubroutine, set, &
TYPE ( op_arg ) , DIMENSION(8) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
INTEGER(kind=4) :: n_upper
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
@@ -283,14 +284,6 @@ attributes (host) SUBROUTINE res_calc_host_gpu( userSubroutine, set, &
INTEGER(kind=4), SAVE :: calledTimes
INTEGER(kind=4) :: istat
- REAL(kind=4) :: accumulatorHostTime
- REAL(kind=4) :: accumulatorHExchTime
- REAL(kind=4) :: accumulatorKernelTime
- REAL(kind=8) :: KT_double
- TYPE ( cudaEvent ) :: startTimeHost
- TYPE ( cudaEvent ) :: endTimeHost
- TYPE ( cudaEvent ) :: startTimeKernel
- TYPE ( cudaEvent ) :: endTimeKernel
numberOfOpDats = 8
@@ -305,22 +298,14 @@ attributes (host) SUBROUTINE res_calc_host_gpu( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(2 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- istat = cudaEventCreate(startTimeHost)
- istat = cudaEventCreate(endTimeHost)
- istat = cudaEventRecord(startTimeHost,0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHExchTime,startTimeHost,endTimeHost)
- loopTimeHostres_calc = loopTimeHostres_calc + accumulatorHExchTime
-
- istat = cudaEventCreate(startTimeKernel)
- istat = cudaEventCreate(endTimeKernel)
- numberCalledres_calc = numberCalledres_calc + 1
-
indirectionDescriptorArray(1) = 0
indirectionDescriptorArray(2) = 0
indirectionDescriptorArray(3) = 1
@@ -375,12 +360,6 @@ attributes (host) SUBROUTINE res_calc_host_gpu( userSubroutine, set, &
pthrcolSize = set%setPtr%size
CALL c_f_pointer(actualPlan_res_calc%thrcol,pthrcol,(/pthrcolSize/))
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHostTime,startTimeHost,endTimeHost)
-
- loopTimeHostres_calc = loopTimeHostres_calc + accumulatorHostTime
- istat = cudaEventRecord(startTimeKernel,0)
blockOffset = 0
@@ -413,17 +392,17 @@ attributes (host) SUBROUTINE res_calc_host_gpu( userSubroutine, set, &
END IF
- istat = cudaEventRecord(endTimeKernel,0)
- istat = cudaEventSynchronize(endTimeKernel)
- istat = cudaEventElapsedTime(accumulatorKernelTime,startTimeKernel,endTimeKernel)
- loopTimeKernelres_calc = loopTimeKernelres_calc + accumulatorKernelTime
-
-
CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)
- KT_double = REAL((accumulatorKernelTime) / 1000.00)
+ istat = cudaDeviceSynchronize()
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
returnSetKernelTiming = setKernelTime(2 , userSubroutine//C_NULL_CHAR, &
- & KT_double, actualPlan_res_calc%transfer,actualPlan_res_calc%transfer2, 1)
+ & (endTime-startTime) / 1000.00, actualPlan_res_calc%transfer,actualPlan_res_calc%transfer2, 1)
calledTimes = calledTimes + 1
END SUBROUTINE
@@ -521,12 +500,8 @@ SUBROUTINE res_calc_host_cpu( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
LOGICAL :: firstTime_res_calc = .TRUE.
type ( c_ptr ) :: planRet_res_calc
@@ -560,16 +535,14 @@ SUBROUTINE res_calc_host_cpu( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(2 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledres_calc = numberCalledres_calc+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef OP_PART_SIZE_1
partitionSize = OP_PART_SIZE_1
#else
@@ -627,22 +600,6 @@ SUBROUTINE res_calc_host_cpu( userSubroutine, set, &
CALL c_f_pointer(opArg7%map_data,opDat7Map,(/opSetCore%size*opDat7MapDim/))
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostres_calc = loopTimeHostres_calc + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
blockOffset = 0
DO i1 = 0, actualPlan_res_calc%ncolors - 1, 1
@@ -675,34 +632,15 @@ SUBROUTINE res_calc_host_cpu( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF
-
- call date_and_time(values=timeArrayEnd)
- endTimeKernel = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorKernelTime = endTimeKernel - startTimeKernel
- loopTimeKernelres_calc = loopTimeKernelres_calc + accumulatorKernelTime
-
- call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
- & 1000.00 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
+ endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostres_calc = loopTimeHostres_calc + accumulatorHostTime
-
returnSetKernelTiming = setKernelTime(2 , userSubroutine//C_NULL_CHAR, &
- & accumulatorKernelTime / 1000.00, actualPlan_res_calc%transfer,actualPlan_res_calc%transfer2, 1)
+ & (endTime-startTime) / 1000.00, actualPlan_res_calc%transfer,actualPlan_res_calc%transfer2, 1)
END SUBROUTINE
END MODULE
View
60 apps/fortran/airfoil/airfoil_hdf5/dp/res_calc_kernel.F90
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE RES_CALC_MODULE
@@ -11,9 +11,6 @@ MODULE RES_CALC_MODULE
#ifdef _OPENMP
USE OMP_LIB
#endif
-REAL(kind=4) :: loopTimeHostres_calc
-REAL(kind=4) :: loopTimeKernelres_calc
-INTEGER(kind=4) :: numberCalledres_calc
CONTAINS
@@ -113,12 +110,8 @@ SUBROUTINE res_calc_host( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
LOGICAL :: firstTime_res_calc = .TRUE.
type ( c_ptr ) :: planRet_res_calc
@@ -152,16 +145,14 @@ SUBROUTINE res_calc_host( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(2 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledres_calc = numberCalledres_calc+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef OP_PART_SIZE_1
partitionSize = OP_PART_SIZE_1
#else
@@ -219,22 +210,6 @@ SUBROUTINE res_calc_host( userSubroutine, set, &
CALL c_f_pointer(opArg7%map_data,opDat7Map,(/opSetCore%size*opDat7MapDim/))
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostres_calc = loopTimeHostres_calc + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
blockOffset = 0
DO i1 = 0, actualPlan_res_calc%ncolors-1, 1
@@ -267,34 +242,15 @@ SUBROUTINE res_calc_host( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF
-
- call date_and_time(values=timeArrayEnd)
- endTimeKernel = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorKernelTime = endTimeKernel - startTimeKernel
- loopTimeKernelres_calc = loopTimeKernelres_calc + accumulatorKernelTime
-
- call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
- & 1000.00 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
+ endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostres_calc = loopTimeHostres_calc + accumulatorHostTime
-
returnSetKernelTiming = setKernelTime(2 , userSubroutine//C_NULL_CHAR, &
- & accumulatorKernelTime / 1000.00, actualPlan_res_calc%transfer,actualPlan_res_calc%transfer2, 1)
+ & (endTime-startTime) / 1000.00, actualPlan_res_calc%transfer,actualPlan_res_calc%transfer2, 1)
END SUBROUTINE
END MODULE
View
23 apps/fortran/airfoil/airfoil_hdf5/dp/res_calc_seqkernel.F90
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE RES_CALC_MODULE
@@ -79,6 +79,11 @@ SUBROUTINE res_calc_host( userSubroutine, set, &
type ( op_arg ) , DIMENSION(8) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
+ INTEGER(kind=4) :: returnSetKernelTiming
INTEGER(kind=4) :: n_upper
type ( op_set_core ) , POINTER :: opSetCore
@@ -116,6 +121,14 @@ SUBROUTINE res_calc_host( userSubroutine, set, &
opArgArray(7) = opArg7
opArgArray(8) = opArg8
+ returnSetKernelTiming = setKernelTime(2 , userSubroutine//C_NULL_CHAR, &
+ & 0.d0, 0.00000,0.00000, 0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
+
n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
opSetCore => set%setPtr
@@ -152,5 +165,13 @@ SUBROUTINE res_calc_host( userSubroutine, set, &
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
+ returnSetKernelTiming = setKernelTime(2 , userSubroutine//C_NULL_CHAR, &
+ & (endTime-startTime) / 1000.00,0.00000,0.00000, 1)
END SUBROUTINE
END MODULE
View
112 apps/fortran/airfoil/airfoil_hdf5/dp/save_soln_kernel.CUF
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE SAVE_SOLN_MODULE
@@ -17,9 +17,6 @@ USE CUDACONFIGURATIONPARAMS
! save_solnvariable declarations
-REAL(kind=4) :: loopTimeHostsave_soln
-REAL(kind=4) :: loopTimeKernelsave_soln
-INTEGER(kind=4) :: numberCalledsave_soln
@@ -100,6 +97,10 @@ attributes (host) SUBROUTINE save_soln_host_gpu( userSubroutine, set, &
TYPE ( op_arg ) , DIMENSION(2) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
INTEGER(kind=4) :: n_upper
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
@@ -121,14 +122,6 @@ attributes (host) SUBROUTINE save_soln_host_gpu( userSubroutine, set, &
REAL(kind=4) :: dataTransfer
INTEGER(kind=4) :: istat
- REAL(kind=4) :: accumulatorHostTime
- REAL(kind=4) :: accumulatorHExchTime
- REAL(kind=4) :: accumulatorKernelTime
- REAL(kind=8) :: KT_double
- TYPE ( cudaEvent ) :: startTimeHost
- TYPE ( cudaEvent ) :: endTimeHost
- TYPE ( cudaEvent ) :: startTimeKernel
- TYPE ( cudaEvent ) :: endTimeKernel
numberOfOpDats = 2
@@ -137,22 +130,14 @@ attributes (host) SUBROUTINE save_soln_host_gpu( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(0 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- istat = cudaEventCreate(startTimeHost)
- istat = cudaEventCreate(endTimeHost)
- istat = cudaEventRecord(startTimeHost,0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHExchTime,startTimeHost,endTimeHost)
- loopTimeHostsave_soln = loopTimeHostsave_soln + accumulatorHExchTime
-
- istat = cudaEventCreate(startTimeKernel)
- istat = cudaEventCreate(endTimeKernel)
- numberCalledsave_soln = numberCalledsave_soln + 1
-
blocksPerGrid = 200
threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)
@@ -165,12 +150,6 @@ attributes (host) SUBROUTINE save_soln_host_gpu( userSubroutine, set, &
CALL c_f_pointer(opArg1%data_d,opDat1Devicesave_soln,(/opDat1Cardinality/))
CALL c_f_pointer(opArg2%data_d,opDat2Devicesave_soln,(/opDat2Cardinality/))
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHostTime,startTimeHost,endTimeHost)
-
- loopTimeHostsave_soln = loopTimeHostsave_soln + accumulatorHostTime
- istat = cudaEventRecord(startTimeKernel,0)
CALL op_cuda_save_soln <<<blocksPerGrid,threadsPerBlock,dynamicSharedMemorySize>>>( &
& opDat1Devicesave_soln, &
@@ -182,20 +161,20 @@ attributes (host) SUBROUTINE save_soln_host_gpu( userSubroutine, set, &
END IF
- istat = cudaEventRecord(endTimeKernel,0)
- istat = cudaEventSynchronize(endTimeKernel)
- istat = cudaEventElapsedTime(accumulatorKernelTime,startTimeKernel,endTimeKernel)
- loopTimeKernelsave_soln = loopTimeKernelsave_soln + accumulatorKernelTime
-
-
CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)
- KT_double = REAL((accumulatorKernelTime) / 1000.00)
+ istat = cudaDeviceSynchronize()
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
dataTransfer = 0.0
dataTransfer = dataTransfer + opArg1%size * getSetSizeFromOpArg(opArg1)
dataTransfer = dataTransfer + opArg2%size * getSetSizeFromOpArg(opArg2) * 2.d0
returnSetKernelTiming = setKernelTime(0 , userSubroutine//C_NULL_CHAR, &
- & KT_double, dataTransfer, 0.00000, 1)
+ & (endTime-startTime) / 1000.00, dataTransfer, 0.00000, 1)
calledTimes = calledTimes + 1
END SUBROUTINE
@@ -244,12 +223,8 @@ SUBROUTINE save_soln_host_cpu( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
INTEGER(kind=4) :: sliceStart
INTEGER(kind=4) :: sliceEnd
@@ -265,16 +240,14 @@ SUBROUTINE save_soln_host_cpu( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(0 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledsave_soln = numberCalledsave_soln+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef _OPENMP
numberOfThreads = omp_get_max_threads()
@@ -290,22 +263,6 @@ SUBROUTINE save_soln_host_cpu( userSubroutine, set, &
CALL c_f_pointer(opArg2%data,opDat2Local,(/opDat2Cardinality/))
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostsave_soln = loopTimeHostsave_soln + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
!$OMP PARALLEL DO private (sliceStart,sliceEnd,i1,threadID)
DO i1 = 0, numberOfThreads - 1, 1
sliceStart = opSetCore%size * i1 / numberOfThreads
@@ -322,37 +279,18 @@ SUBROUTINE save_soln_host_cpu( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF
-
- call date_and_time(values=timeArrayEnd)
- endTimeKernel = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorKernelTime = endTimeKernel - startTimeKernel
- loopTimeKernelsave_soln = loopTimeKernelsave_soln + accumulatorKernelTime
-
- call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
- & 1000.00 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
+ endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostsave_soln = loopTimeHostsave_soln + accumulatorHostTime
-
dataTransfer = 0.0
dataTransfer = dataTransfer + opArg1%size * getSetSizeFromOpArg(opArg1)
dataTransfer = dataTransfer + opArg2%size * getSetSizeFromOpArg(opArg2) * 2.d0
returnSetKernelTiming = setKernelTime(0 , userSubroutine//C_NULL_CHAR, &
- & accumulatorKernelTime / 1000.00, dataTransfer, 0.00000, 1)
+ & (endTime-startTime) / 1000.00, dataTransfer, 0.00000, 1)
END SUBROUTINE
END MODULE
View
60 apps/fortran/airfoil/airfoil_hdf5/dp/save_soln_kernel.F90
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE SAVE_SOLN_MODULE
@@ -11,9 +11,6 @@ MODULE SAVE_SOLN_MODULE
#ifdef _OPENMP
USE OMP_LIB
#endif
-REAL(kind=4) :: loopTimeHostsave_soln
-REAL(kind=4) :: loopTimeKernelsave_soln
-INTEGER(kind=4) :: numberCalledsave_soln
CONTAINS
@@ -64,12 +61,8 @@ SUBROUTINE save_soln_host( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
INTEGER(kind=4) :: sliceStart
INTEGER(kind=4) :: sliceEnd
@@ -85,16 +78,14 @@ SUBROUTINE save_soln_host( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(0 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledsave_soln = numberCalledsave_soln+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef _OPENMP
numberOfThreads = omp_get_max_threads()
@@ -110,22 +101,6 @@ SUBROUTINE save_soln_host( userSubroutine, set, &
CALL c_f_pointer(opArg2%data,opDat2Local,(/opDat2Cardinality/))
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostsave_soln = loopTimeHostsave_soln + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
!$OMP PARALLEL DO private (sliceStart,sliceEnd,i1,threadID)
DO i1 = 0, numberOfThreads-1, 1
sliceStart = opSetCore%size * i1 / numberOfThreads
@@ -142,37 +117,18 @@ SUBROUTINE save_soln_host( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF
-
- call date_and_time(values=timeArrayEnd)
- endTimeKernel = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorKernelTime = endTimeKernel - startTimeKernel
- loopTimeKernelsave_soln = loopTimeKernelsave_soln + accumulatorKernelTime
-
- call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
- & 1000.00 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
+ endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostsave_soln = loopTimeHostsave_soln + accumulatorHostTime
-
dataTransfer = 0.0
dataTransfer = dataTransfer + opArg1%size * getSetSizeFromOpArg(opArg1)
dataTransfer = dataTransfer + opArg2%size * getSetSizeFromOpArg(opArg2) * 2.d0
returnSetKernelTiming = setKernelTime(0 , userSubroutine//C_NULL_CHAR, &
- & accumulatorKernelTime / 1000.00, dataTransfer, 0.00000, 1)
+ & (endTime-startTime) / 1000.00, dataTransfer, 0.00000, 1)
END SUBROUTINE
END MODULE
View
23 apps/fortran/airfoil/airfoil_hdf5/dp/save_soln_seqkernel.F90
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE SAVE_SOLN_MODULE
@@ -44,6 +44,11 @@ SUBROUTINE save_soln_host( userSubroutine, set, &
type ( op_arg ) , DIMENSION(2) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
+ INTEGER(kind=4) :: returnSetKernelTiming
INTEGER(kind=4) :: n_upper
type ( op_set_core ) , POINTER :: opSetCore
@@ -61,6 +66,14 @@ SUBROUTINE save_soln_host( userSubroutine, set, &
opArgArray(1) = opArg1
opArgArray(2) = opArg2
+ returnSetKernelTiming = setKernelTime(0 , userSubroutine//C_NULL_CHAR, &
+ & 0.d0, 0.00000,0.00000, 0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
+
n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
opSetCore => set%setPtr
@@ -79,5 +92,13 @@ SUBROUTINE save_soln_host( userSubroutine, set, &
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
+ returnSetKernelTiming = setKernelTime(0 , userSubroutine//C_NULL_CHAR, &
+ & (endTime-startTime) / 1000.00,0.00000,0.00000, 1)
END SUBROUTINE
END MODULE
View
117 apps/fortran/airfoil/airfoil_hdf5/dp/update_kernel.CUF
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE UPDATE_MODULE
@@ -18,9 +18,6 @@ USE CUDACONFIGURATIONPARAMS
! updatevariable declarations
real(8), DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice5update
-REAL(kind=4) :: loopTimeHostupdate
-REAL(kind=4) :: loopTimeKernelupdate
-INTEGER(kind=4) :: numberCalledupdate
@@ -161,6 +158,10 @@ attributes (device) &
TYPE ( op_arg ) , DIMENSION(5) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
INTEGER(kind=4) :: n_upper
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
@@ -187,14 +188,6 @@ attributes (device) &
REAL(kind=4) :: dataTransfer
INTEGER(kind=4) :: istat
- REAL(kind=4) :: accumulatorHostTime
- REAL(kind=4) :: accumulatorHExchTime
- REAL(kind=4) :: accumulatorKernelTime
- REAL(kind=8) :: KT_double
- TYPE ( cudaEvent ) :: startTimeHost
- TYPE ( cudaEvent ) :: endTimeHost
- TYPE ( cudaEvent ) :: startTimeKernel
- TYPE ( cudaEvent ) :: endTimeKernel
real(8), DIMENSION(:), POINTER :: opDat5Host
real(8), DIMENSION(:), ALLOCATABLE :: reductionArrayHost5
INTEGER(kind=4) :: reductionCardinality5
@@ -209,22 +202,14 @@ attributes (device) &
returnSetKernelTiming = setKernelTime(4 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- istat = cudaEventCreate(startTimeHost)
- istat = cudaEventCreate(endTimeHost)
- istat = cudaEventRecord(startTimeHost,0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHExchTime,startTimeHost,endTimeHost)
- loopTimeHostupdate = loopTimeHostupdate + accumulatorHExchTime
-
- istat = cudaEventCreate(startTimeKernel)
- istat = cudaEventCreate(endTimeKernel)
- numberCalledupdate = numberCalledupdate + 1
-
blocksPerGrid = 200
threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)
@@ -254,12 +239,6 @@ attributes (device) &
END DO
reductionArrayDevice5update = reductionArrayHost5
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHostTime,startTimeHost,endTimeHost)
-
- loopTimeHostupdate = loopTimeHostupdate + accumulatorHostTime
- istat = cudaEventRecord(startTimeKernel,0)
CALL op_cuda_update <<<blocksPerGrid,threadsPerBlock,dynamicSharedMemorySize>>>( &
& opDat1Deviceupdate, &
@@ -274,15 +253,8 @@ attributes (device) &
END IF
- istat = cudaEventRecord(endTimeKernel,0)
- istat = cudaEventSynchronize(endTimeKernel)
- istat = cudaEventElapsedTime(accumulatorKernelTime,startTimeKernel,endTimeKernel)
- loopTimeKernelupdate = loopTimeKernelupdate + accumulatorKernelTime
-
-
CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)
- istat = cudaEventRecord(startTimeHost,0)
reductionArrayHost5 = reductionArrayDevice5update
DO i10 = 0, reductionCardinality5 - 1, 1
@@ -292,19 +264,21 @@ attributes (device) &
deallocate( reductionArrayHost5 )
CALL op_mpi_reduce_double(opArg5,opArg5%data)
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHostTime,startTimeHost,endTimeHost)
- loopTimeHostupdate = loopTimeHostupdate + accumulatorHostTime
- KT_double = REAL((accumulatorKernelTime) / 1000.00)
+ istat = cudaDeviceSynchronize()
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
dataTransfer = 0.0
dataTransfer = dataTransfer + opArg1%size * getSetSizeFromOpArg(opArg1)
dataTransfer = dataTransfer + opArg2%size * getSetSizeFromOpArg(opArg2) * 2.d0
dataTransfer = dataTransfer + opArg3%size * getSetSizeFromOpArg(opArg3) * 2.d0
dataTransfer = dataTransfer + opArg4%size * getSetSizeFromOpArg(opArg4)
dataTransfer = dataTransfer + opArg5%size * 2.d0
returnSetKernelTiming = setKernelTime(4 , userSubroutine//C_NULL_CHAR, &
- & KT_double, dataTransfer, 0.00000, 1)
+ & (endTime-startTime) / 1000.00, dataTransfer, 0.00000, 1)
calledTimes = calledTimes + 1
END SUBROUTINE
@@ -375,12 +349,8 @@ attributes (device) &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
INTEGER(kind=4) :: sliceStart
INTEGER(kind=4) :: sliceEnd
@@ -400,16 +370,14 @@ attributes (device) &
returnSetKernelTiming = setKernelTime(4 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledupdate = numberCalledupdate+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef _OPENMP
numberOfThreads = omp_get_max_threads()
@@ -436,22 +404,6 @@ attributes (device) &
END DO
END DO
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostupdate = loopTimeHostupdate + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
!$OMP PARALLEL DO private (sliceStart,sliceEnd,i1,threadID)
DO i1 = 0, numberOfThreads - 1, 1
sliceStart = opSetCore%size * i1 / numberOfThreads
@@ -471,22 +423,6 @@ attributes (device) &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF
-
- call date_and_time(values=timeArrayEnd)
- endTimeKernel = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorKernelTime = endTimeKernel - startTimeKernel
- loopTimeKernelupdate = loopTimeKernelupdate + accumulatorKernelTime
-
- call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
- & 1000.00 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
DO i1 = 1, numberOfThreads+1 - 1, 1
@@ -500,21 +436,18 @@ attributes (device) &
CALL op_mpi_reduce_double(opArg5,opArg5%data)
call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
+ endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostupdate = loopTimeHostupdate + accumulatorHostTime
-
dataTransfer = 0.0
dataTransfer = dataTransfer + opArg1%size * getSetSizeFromOpArg(opArg1)
dataTransfer = dataTransfer + opArg2%size * getSetSizeFromOpArg(opArg2) * 2.d0
dataTransfer = dataTransfer + opArg3%size * getSetSizeFromOpArg(opArg3) * 2.d0
dataTransfer = dataTransfer + opArg4%size * getSetSizeFromOpArg(opArg4)
dataTransfer = dataTransfer + opArg5%size * 2.d0
returnSetKernelTiming = setKernelTime(4 , userSubroutine//C_NULL_CHAR, &
- & accumulatorKernelTime / 1000.00, dataTransfer, 0.00000, 1)
+ & (endTime-startTime) / 1000.00, dataTransfer, 0.00000, 1)
END SUBROUTINE
END MODULE
View
60 apps/fortran/airfoil/airfoil_hdf5/dp/update_kernel.F90
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE UPDATE_MODULE
@@ -11,9 +11,6 @@ MODULE UPDATE_MODULE
#ifdef _OPENMP
USE OMP_LIB
#endif
-REAL(kind=4) :: loopTimeHostupdate
-REAL(kind=4) :: loopTimeKernelupdate
-INTEGER(kind=4) :: numberCalledupdate
CONTAINS
@@ -86,12 +83,8 @@ SUBROUTINE update_host( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
INTEGER(kind=4) :: sliceStart
INTEGER(kind=4) :: sliceEnd
@@ -111,16 +104,14 @@ SUBROUTINE update_host( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(4 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledupdate = numberCalledupdate+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef _OPENMP
numberOfThreads = omp_get_max_threads()
@@ -147,22 +138,6 @@ SUBROUTINE update_host( userSubroutine, set, &
END DO
END DO
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostupdate = loopTimeHostupdate + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
!$OMP PARALLEL DO private (sliceStart,sliceEnd,i1,threadID)
DO i1 = 0, numberOfThreads-1, 1
sliceStart = opSetCore%size * i1 / numberOfThreads
@@ -182,22 +157,6 @@ SUBROUTINE update_host( userSubroutine, set, &
CALL op_mpi_wait_all(numberOfOpDats,opArgArray)
END IF
-
- call date_and_time(values=timeArrayEnd)
- endTimeKernel = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorKernelTime = endTimeKernel - startTimeKernel
- loopTimeKernelupdate = loopTimeKernelupdate + accumulatorKernelTime
-
- call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
- & 1000.00 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)
DO i1 = 1, numberOfThreads+1-1, 1
@@ -211,21 +170,18 @@ SUBROUTINE update_host( userSubroutine, set, &
CALL op_mpi_reduce_double(opArg5,opArg5%data)
call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
+ endTime = 1.00000 * timeArrayEnd(8) + &
1000 * timeArrayEnd(7) + &
60000 * timeArrayEnd(6) + &
3600000 * timeArrayEnd(5)
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostupdate = loopTimeHostupdate + accumulatorHostTime
-
dataTransfer = 0.0
dataTransfer = dataTransfer + opArg1%size * getSetSizeFromOpArg(opArg1)
dataTransfer = dataTransfer + opArg2%size * getSetSizeFromOpArg(opArg2) * 2.d0
dataTransfer = dataTransfer + opArg3%size * getSetSizeFromOpArg(opArg3) * 2.d0
dataTransfer = dataTransfer + opArg4%size * getSetSizeFromOpArg(opArg4)
dataTransfer = dataTransfer + opArg5%size * 2.d0
returnSetKernelTiming = setKernelTime(4 , userSubroutine//C_NULL_CHAR, &
- & accumulatorKernelTime / 1000.00, dataTransfer, 0.00000, 1)
+ & (endTime-startTime) / 1000.00, dataTransfer, 0.00000, 1)
END SUBROUTINE
END MODULE
View
23 apps/fortran/airfoil/airfoil_hdf5/dp/update_seqkernel.F90
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE UPDATE_MODULE
@@ -59,6 +59,11 @@ SUBROUTINE update_host( userSubroutine, set, &
type ( op_arg ) , DIMENSION(5) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
+ INTEGER(kind=4) :: returnSetKernelTiming
INTEGER(kind=4) :: n_upper
type ( op_set_core ) , POINTER :: opSetCore
@@ -86,6 +91,14 @@ SUBROUTINE update_host( userSubroutine, set, &
opArgArray(4) = opArg4
opArgArray(5) = opArg5
+ returnSetKernelTiming = setKernelTime(4 , userSubroutine//C_NULL_CHAR, &
+ & 0.d0, 0.00000,0.00000, 0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
+
n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
opSetCore => set%setPtr
@@ -114,5 +127,13 @@ SUBROUTINE update_host( userSubroutine, set, &
CALL op_mpi_reduce_double(opArg5,opArg5%data)
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
+ returnSetKernelTiming = setKernelTime(4 , userSubroutine//C_NULL_CHAR, &
+ & (endTime-startTime) / 1000.00,0.00000,0.00000, 1)
END SUBROUTINE
END MODULE
View
112 apps/fortran/airfoil/airfoil_plain/dp/adt_calc_kernel.CUF
@@ -1,5 +1,5 @@
!
-! auto-generated by op2.py on 2013-08-21 09:50
+! auto-generated by op2.py on 2013-08-21 13:15
!
MODULE ADT_CALC_MODULE
@@ -17,9 +17,6 @@ USE CUDACONFIGURATIONPARAMS
! adt_calc variable declarations
-REAL(kind=4) :: loopTimeHostadt_calc
-REAL(kind=4) :: loopTimeKerneladt_calc
-INTEGER(kind=4) :: numberCalledadt_calc
TYPE ( c_ptr ) :: planRet_adt_calc
@@ -180,6 +177,10 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
TYPE ( op_arg ) , DIMENSION(6) :: opArgArray
INTEGER(kind=4) :: numberOfOpDats
INTEGER(kind=4) :: n_upper
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
+ INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
@@ -227,14 +228,6 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
INTEGER(kind=4), SAVE :: calledTimes
INTEGER(kind=4) :: istat
- REAL(kind=4) :: accumulatorHostTime
- REAL(kind=4) :: accumulatorHExchTime
- REAL(kind=4) :: accumulatorKernelTime
- REAL(kind=8) :: KT_double
- TYPE ( cudaEvent ) :: startTimeHost
- TYPE ( cudaEvent ) :: endTimeHost
- TYPE ( cudaEvent ) :: startTimeKernel
- TYPE ( cudaEvent ) :: endTimeKernel
numberOfOpDats = 6
@@ -247,22 +240,14 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- istat = cudaEventCreate(startTimeHost)
- istat = cudaEventCreate(endTimeHost)
- istat = cudaEventRecord(startTimeHost,0)
+ call date_and_time(values=timeArrayStart)
+ startTime = 1.00000 * timeArrayStart(8) + &
+ & 1000.00 * timeArrayStart(7) + &
+ & 60000 * timeArrayStart(6) + &
+ & 3600000 * timeArrayStart(5)
n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHExchTime,startTimeHost,endTimeHost)
- loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHExchTime
-
- istat = cudaEventCreate(startTimeKernel)
- istat = cudaEventCreate(endTimeKernel)
- numberCalledadt_calc = numberCalledadt_calc + 1
-
indirectionDescriptorArray(1) = 0
indirectionDescriptorArray(2) = 0
indirectionDescriptorArray(3) = 0
@@ -307,12 +292,6 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
pthrcolSize = set%setPtr%size
CALL c_f_pointer(actualPlan_adt_calc%thrcol,pthrcol,(/pthrcolSize/))
- istat = cudaEventRecord(endTimeHost,0)
- istat = cudaEventSynchronize(endTimeHost)
- istat = cudaEventElapsedTime(accumulatorHostTime,startTimeHost,endTimeHost)
-
- loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime
- istat = cudaEventRecord(startTimeKernel,0)
blockOffset = 0
@@ -343,17 +322,17 @@ attributes (host) SUBROUTINE adt_calc_host_gpu( userSubroutine, set, &
END IF
- istat = cudaEventRecord(endTimeKernel,0)
- istat = cudaEventSynchronize(endTimeKernel)
- istat = cudaEventElapsedTime(accumulatorKernelTime,startTimeKernel,endTimeKernel)
- loopTimeKerneladt_calc = loopTimeKerneladt_calc + accumulatorKernelTime
-
-
CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)
- KT_double = REAL((accumulatorKernelTime) / 1000.00)
+ istat = cudaDeviceSynchronize()
+ call date_and_time(values=timeArrayEnd)
+ endTime = 1.00000 * timeArrayEnd(8) + &
+ 1000 * timeArrayEnd(7) + &
+ 60000 * timeArrayEnd(6) + &
+ 3600000 * timeArrayEnd(5)
+
returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
- & KT_double, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
+ & (endTime-startTime) / 1000.00, actualPlan_adt_calc%transfer,actualPlan_adt_calc%transfer2, 1)
calledTimes = calledTimes + 1
END SUBROUTINE
@@ -430,12 +409,8 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &
INTEGER(kind=4) :: numberOfThreads
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart
INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd
- REAL(kind=8) :: startTimeHost
- REAL(kind=8) :: endTimeHost
- REAL(kind=8) :: startTimeKernel
- REAL(kind=8) :: endTimeKernel
- REAL(kind=8) :: accumulatorHostTime
- REAL(kind=8) :: accumulatorKernelTime
+ REAL(kind=8) :: startTime
+ REAL(kind=8) :: endTime
INTEGER(kind=4) :: returnSetKernelTiming
LOGICAL :: firstTime_adt_calc = .TRUE.
type ( c_ptr ) :: planRet_adt_calc
@@ -467,16 +442,14 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &
returnSetKernelTiming = setKernelTime(1 , userSubroutine//C_NULL_CHAR, &
& 0.d0, 0.00000,0.00000, 0)
-
- n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
- numberCalledadt_calc = numberCalledadt_calc+ 1
-
call date_and_time(values=timeArrayStart)
- startTimeHost = 1.00000 * timeArrayStart(8) + &
+ startTime = 1.00000 * timeArrayStart(8) + &
& 1000.00 * timeArrayStart(7) + &
& 60000 * timeArrayStart(6) + &
& 3600000 * timeArrayStart(5)
+ n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)
+
#ifdef OP_PART_SIZE_1
partitionSize = OP_PART_SIZE_1
#else
@@ -524,22 +497,6 @@ SUBROUTINE adt_calc_host_cpu( userSubroutine, set, &
CALL c_f_pointer(opArg6%data,opDat6Local,(/opDat6Cardinality/))
-
- call date_and_time(values=timeArrayEnd)
- endTimeHost = 1.00000 * timeArrayEnd(8) + &
- & 1000 * timeArrayEnd(7) + &
- & 60000 * timeArrayEnd(6) + &
- & 3600000 * timeArrayEnd(5)
-
- accumulatorHostTime = endTimeHost - startTimeHost
- loopTimeHostadt_calc = loopTimeHostadt_calc + accumulatorHostTime
-
- call date_and_time(values=timeArrayStart)
- startTimeKernel = 1.00000 * timeArrayStart(8) + &
- & 1000 * timeArrayStart(7) + &
- & 60000 * timeArrayStart(6) + &
- & 3600000 * timeArrayStart(5)
-
blockOffset = 0
DO