diff --git a/groups/CLX/MEM_DP.txt b/groups/CLX/MEM_DP.txt index 875ce6a74..41d20c391 100644 --- a/groups/CLX/MEM_DP.txt +++ b/groups/CLX/MEM_DP.txt @@ -42,9 +42,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 -Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + LONG Formulas: Power [W] = PWR_PKG_ENERGY/runtime @@ -59,7 +60,7 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) Vectorization ratio [%] = 
100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) -- Profiling group to measure memory bandwidth drawn by all cores of a socket. diff --git a/groups/CLX/MEM_SP.txt b/groups/CLX/MEM_SP.txt index e81cd88de..4d8c92283 100644 --- a/groups/CLX/MEM_SP.txt +++ b/groups/CLX/MEM_SP.txt @@ -42,7 +42,7 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 -Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) LONG @@ -59,7 +59,7 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +Operational 
intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) -- Profiling group to measure memory bandwidth drawn by all cores of a socket. diff --git a/groups/ICL/MEM_DP.txt b/groups/ICL/MEM_DP.txt index 0caa3bd9d..f0c3214f0 100644 --- a/groups/ICL/MEM_DP.txt +++ b/groups/ICL/MEM_DP.txt @@ -34,7 +34,7 @@ Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 -Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C1+MBOX0C2)*64.0) +Operational intensity [FLOP/Byte] (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C1+MBOX0C2)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) LONG @@ -52,8 +52,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/(DRAM_READS+DRAM_WRITES)*64.0) +Operational intensity [FLOP/Byte] = 
(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((DRAM_READS+DRAM_WRITES)*64.0) Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/ICL/MEM_SP.txt b/groups/ICL/MEM_SP.txt index 8c126ad2e..c4fd65cf8 100644 --- a/groups/ICL/MEM_SP.txt +++ b/groups/ICL/MEM_SP.txt @@ -34,7 +34,7 @@ Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 -Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C1+MBOX0C2)*64.0) +Operational intensity [FLOP/Byte] (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C1+MBOX0C2)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) LONG @@ -52,8 +52,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/(DRAM_READS+DRAM_WRITES)*64.0) +Operational intensity [FLOP/Byte] = 
(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/(DRAM_READS+DRAM_WRITES)*64.0) Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) + - Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/ICX/MEM_DP.txt b/groups/ICX/MEM_DP.txt index 50b24c4f5..c07ab6467 100644 --- a/groups/ICX/MEM_DP.txt +++ b/groups/ICX/MEM_DP.txt @@ -47,9 +47,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 -Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + LONG Formulas: Power [W] = PWR_PKG_ENERGY/runtime @@ -64,8 +65,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime 
Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. 
Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/ICX/MEM_SP.txt b/groups/ICX/MEM_SP.txt index 61af03012..6f7056040 100644 --- a/groups/ICX/MEM_SP.txt +++ b/groups/ICX/MEM_SP.txt @@ -47,9 +47,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 -Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2+PMC3)/(PMC0+PMC1+PMC2+PMC3) + LONG Formulas: Power [W] = PWR_PKG_ENERGY/runtime @@ -64,8 +65,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +Operational intensity [FLOP/Byte] = 
(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/arm64fx/MEM_DP.txt b/groups/arm64fx/MEM_DP.txt index 38084737d..86511373d 100644 --- a/groups/arm64fx/MEM_DP.txt +++ b/groups/arm64fx/MEM_DP.txt @@ -21,10 +21,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 Memory bandwidth [MBytes/s] 1.0E-06*((PMC2-(PMC4+PMC5))+PMC3)*256.0/time Memory data volume [GBytes] 1.0E-09*((PMC2-(PMC4+PMC5))+PMC3)*256.0 -Operational intensity (FP) PMC0/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) -Operational intensity (FP+SVE128) (((PMC1*128.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) -Operational intensity (FP+SVE256) (((PMC1*256.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) -Operational intensity (FP+SVE512) (((PMC1*512.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) +Operational intensity (FP) [FLOP/Byte] PMC0/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) +Operational intensity (FP+SVE128) [FLOP/Byte] (((PMC1*128.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) +Operational intensity (FP+SVE256) [FLOP/Byte] (((PMC1*256.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) +Operational intensity (FP+SVE512) [FLOP/Byte] (((PMC1*512.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) LONG @@ -39,10 +39,10 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*256.0/runtime Memory 
write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*256.0 Memory bandwidth [MBytes/s] = 1.0E-06*((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0/runtime Memory data volume [GBytes] = 1.0E-09*((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0 -Operational intensity (FP) = FP_DP_FIXED_OPS_SPEC/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) -Operational intensity (FP+SVE128) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128)/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) -Operational intensity (FP+SVE256) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128)/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) -Operational intensity (FP+SVE512) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128)/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) +Operational intensity (FP) [FLOP/Byte] = FP_DP_FIXED_OPS_SPEC/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) +Operational intensity (FP+SVE128) [FLOP/Byte] = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) +Operational intensity (FP+SVE256) [FLOP/Byte] = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) +Operational intensity (FP+SVE512) [FLOP/Byte] = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) - Profiling group to measure memory bandwidth and double-precision FP rate for scalar and SVE vector operations with different widths. 
The events for the SVE metrics assumes that all vector elements diff --git a/groups/arm64fx/MEM_SP.txt b/groups/arm64fx/MEM_SP.txt index 362689669..5808b19d8 100644 --- a/groups/arm64fx/MEM_SP.txt +++ b/groups/arm64fx/MEM_SP.txt @@ -21,10 +21,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(PMC3)*256.0/time Memory write data volume [GBytes] 1.0E-09*(PMC3)*256.0 Memory bandwidth [MBytes/s] 1.0E-06*((PMC2-(PMC4+PMC5))+PMC3)*256.0/time Memory data volume [GBytes] 1.0E-09*((PMC2-(PMC4+PMC5))+PMC3)*256.0 -Operational intensity (FP) PMC0/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) -Operational intensity (FP+SVE128) (((PMC1*128.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) -Operational intensity (FP+SVE256) (((PMC1*256.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) -Operational intensity (FP+SVE512) (((PMC1*512.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) +Operational intensity (FP) [FLOP/Byte] PMC0/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) +Operational intensity (FP+SVE128) [FLOP/Byte] (((PMC1*128.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) +Operational intensity (FP+SVE256) [FLOP/Byte] (((PMC1*256.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) +Operational intensity (FP+SVE512) [FLOP/Byte] (((PMC1*512.0)/128.0)+PMC0)/(((PMC2-(PMC4+PMC5))+PMC3)*256.0) LONG @@ -39,10 +39,10 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(L2D_CACHE_WB)*256.0/runtime Memory write data volume [GBytes] = 1.0E-09*(L2D_CACHE_WB)*256.0 Memory bandwidth [MBytes/s] = 1.0E-06*((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0/runtime Memory data volume [GBytes] = 1.0E-09*((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0 -Operational intensity (FP) = FP_DP_FIXED_OPS_SPEC/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) -Operational intensity (FP+SVE128) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128)/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) -Operational intensity (FP+SVE256) = 
(FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128)/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) -Operational intensity (FP+SVE512) = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128)/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) +Operational intensity (FP) [FLOP/Byte] = FP_DP_FIXED_OPS_SPEC/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) +Operational intensity (FP+SVE128) [FLOP/Byte] = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*128)/128))/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) +Operational intensity (FP+SVE256) [FLOP/Byte] = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*256)/128))/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) +Operational intensity (FP+SVE512) [FLOP/Byte] = (FP_DP_FIXED_OPS_SPEC+((FP_DP_SCALE_OPS_SPEC*512)/128))/(((L2D_CACHE_REFILL-(L2D_SWAP_DM+L2D_CACHE_MIBMCH_PRF))+L2D_CACHE_WB)*256.0) - Profiling group to measure memory bandwidth and single-precision FP rate for scalar and SVE vector operations with different widths. 
The events for the SVE metrics assumes that all vector elements diff --git a/groups/broadwellD/MEM_DP.txt b/groups/broadwellD/MEM_DP.txt index 96f9b2e32..e742b4401 100644 --- a/groups/broadwellD/MEM_DP.txt +++ b/groups/broadwellD/MEM_DP.txt @@ -45,9 +45,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 -Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + LONG Formulas: Power [W] = PWR_PKG_ENERGY/runtime @@ -62,8 +63,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) Vectorization ratio [%] 
= 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/broadwellD/MEM_SP.txt b/groups/broadwellD/MEM_SP.txt index 47084a1f7..4de064b08 100644 --- a/groups/broadwellD/MEM_SP.txt +++ b/groups/broadwellD/MEM_SP.txt @@ -45,7 +45,7 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 -Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) LONG @@ -62,8 +62,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 -Operational intensity = 
(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/broadwellEP/MEM_DP.txt b/groups/broadwellEP/MEM_DP.txt index ec02ebcd5..da9adb423 100644 --- a/groups/broadwellEP/MEM_DP.txt +++ b/groups/broadwellEP/MEM_DP.txt @@ -45,9 +45,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 -Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + LONG Formulas: Power [W] = PWR_PKG_ENERGY/runtime @@ -62,8 +63,9 @@ 
Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/broadwellEP/MEM_SP.txt b/groups/broadwellEP/MEM_SP.txt index 18b6ffb49..ac917d585 100644 --- a/groups/broadwellEP/MEM_SP.txt +++ b/groups/broadwellEP/MEM_SP.txt @@ -45,7 +45,7 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 -Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) +Operational 
intensity [FLOP/Byte] (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) LONG @@ -62,8 +62,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. 
Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/ivybridgeEP/MEM_DP.txt b/groups/ivybridgeEP/MEM_DP.txt index 65ae30fab..b2e35dd2a 100644 --- a/groups/ivybridgeEP/MEM_DP.txt +++ b/groups/ivybridgeEP/MEM_DP.txt @@ -45,9 +45,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 -Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + LONG Formulas: Power [W] = PWR_PKG_ENERGY/runtime @@ -62,8 +63,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 -Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +Operational intensity [FLOP/Byte] = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) Vectorization ratio [%] = 
100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/ivybridgeEP/MEM_SP.txt b/groups/ivybridgeEP/MEM_SP.txt index 3d5d27b9b..ec52ffedf 100644 --- a/groups/ivybridgeEP/MEM_SP.txt +++ b/groups/ivybridgeEP/MEM_SP.txt @@ -45,9 +45,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0 -Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + LONG Formulas: Power [W] = PWR_PKG_ENERGY/runtime @@ -62,8 +63,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 -Operational intensity = 
(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +Operational intensity [FLOP/Byte] = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) Vectorization ratio [%] = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/sandybridgeEP/MEM_DP.txt b/groups/sandybridgeEP/MEM_DP.txt index e953c3746..df0cf2552 100644 --- a/groups/sandybridgeEP/MEM_DP.txt +++ b/groups/sandybridgeEP/MEM_DP.txt @@ -37,9 +37,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 -Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + LONG Formulas: Power [W] = PWR_PKG_ENERGY/runtime @@ -54,8 +55,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 -Operational intensity = 
(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +Operational intensity [FLOP/Byte] = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) Vectorization ratio [%] = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/sandybridgeEP/MEM_SP.txt b/groups/sandybridgeEP/MEM_SP.txt index f893765b9..a97102d9a 100644 --- a/groups/sandybridgeEP/MEM_SP.txt +++ b/groups/sandybridgeEP/MEM_SP.txt @@ -37,9 +37,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0 -Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + LONG Formulas: Power [W] = PWR_PKG_ENERGY/runtime @@ -54,8 +55,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0 -Operational intensity = 
(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) +Operational intensity [FLOP/Byte] = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/((SUM(MBOXxC0)+SUM(MBOXxC1))*64.0) Vectorization ratio [%] = 100*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/(FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/skylake/MEM_DP.txt b/groups/skylake/MEM_DP.txt index a4580bf98..5522dfadd 100644 --- a/groups/skylake/MEM_DP.txt +++ b/groups/skylake/MEM_DP.txt @@ -31,7 +31,7 @@ Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 -Operational intensity (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C1+MBOX0C2)*64.0) +Operational intensity [FLOP/Byte] (PMC0*2.0+PMC1+PMC2*4.0)/((MBOX0C1+MBOX0C2)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) LONG @@ -48,8 +48,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/(DRAM_READS+DRAM_WRITES)*64.0) +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/((DRAM_READS+DRAM_WRITES)*64.0) Vectorization ratio [%] = 
100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/skylake/MEM_SP.txt b/groups/skylake/MEM_SP.txt index fc3ef44aa..18600d577 100644 --- a/groups/skylake/MEM_SP.txt +++ b/groups/skylake/MEM_SP.txt @@ -31,7 +31,7 @@ Memory evict bandwidth [MBytes/s] 1.0E-06*MBOX0C2*64.0/time Memory evict data volume [GBytes] 1.0E-09*MBOX0C2*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2)*64.0 -Operational intensity (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C1+MBOX0C2)*64.0) +Operational intensity [FLOP/Byte] (PMC0*4.0+PMC1+PMC2*8.0)/((MBOX0C1+MBOX0C2)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) LONG @@ -48,8 +48,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*DRAM_WRITES*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*DRAM_WRITES*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_READS+DRAM_WRITES)*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(DRAM_READS+DRAM_WRITES)*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/(DRAM_READS+DRAM_WRITES)*64.0) +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/((DRAM_READS+DRAM_WRITES)*64.0) Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) + - Profiling group to measure memory bandwidth drawn by all cores of a socket. 
Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/skylakeX/MEM_DP.txt b/groups/skylakeX/MEM_DP.txt index f70b12efd..df7ca2e4b 100644 --- a/groups/skylakeX/MEM_DP.txt +++ b/groups/skylakeX/MEM_DP.txt @@ -42,7 +42,7 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 -Operational intensity (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*2.0+PMC1+PMC2*4.0+PMC3*8.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) LONG @@ -59,8 +59,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 -Operational intensity = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4+FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE*8)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) Vectorization ratio [%] = 
100*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/(FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/skylakeX/MEM_SP.txt b/groups/skylakeX/MEM_SP.txt index f53de72b4..cd779301b 100644 --- a/groups/skylakeX/MEM_SP.txt +++ b/groups/skylakeX/MEM_SP.txt @@ -42,9 +42,10 @@ Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4 Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0/time Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0 -Operational intensity (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*4.0+PMC1+PMC2*8.0+PMC3*16.0)/((MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1)*64.0) Vectorization ratio [%] 100*(PMC0+PMC2)/(PMC0+PMC1+PMC2) + LONG Formulas: Power [W] = PWR_PKG_ENERGY/runtime @@ -59,8 +60,9 @@ Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_WR))*64.0/runtime Memory write data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_WR))*64.0 Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0 -Operational intensity = 
(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/(SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) +Operational intensity [FLOP/Byte] = (FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8+FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE*16)/((SUM(CAS_COUNT_RD)+SUM(CAS_COUNT_WR))*64.0) Vectorization ratio [%] = 100*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/(FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE) + -- Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on diff --git a/groups/westmere/MEM_DP.txt b/groups/westmere/MEM_DP.txt index 64161dd70..15e73d1e3 100644 --- a/groups/westmere/MEM_DP.txt +++ b/groups/westmere/MEM_DP.txt @@ -35,7 +35,7 @@ Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0 Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0 -Operational intensity (PMC0*2.0+PMC1)/((UPMC0+UPMC1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*2.0+PMC1)/((UPMC0+UPMC1)*64.0) LONG Formulas: @@ -56,7 +56,7 @@ Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITE Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0 Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0 -Operational intensity = 
(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0) +Operational intensity [FLOP/Byte] = (FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0) - Profiling group to measure memory bandwidth drawn by all cores of a socket. This group will be measured by one core per socket. The remote read BW tells diff --git a/groups/westmere/MEM_SP.txt b/groups/westmere/MEM_SP.txt index 812c7fa03..cdfd6ab5a 100644 --- a/groups/westmere/MEM_SP.txt +++ b/groups/westmere/MEM_SP.txt @@ -35,7 +35,7 @@ Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0 Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0 -Operational intensity (PMC0*4.0+PMC1)/((UPMC0+UPMC1)*64.0) +Operational intensity [FLOP/Byte] (PMC0*4.0+PMC1)/((UPMC0+UPMC1)*64.0) LONG Formulas: @@ -56,7 +56,7 @@ Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITE Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0 Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0 -Operational intensity = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0) +Operational intensity [FLOP/Byte] = (FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/((UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0) - Profiling group to measure memory bandwidth drawn by all cores of a socket. This group will be measured by one core per socket. 
The remote read BW tells diff --git a/groups/zen/MEM_DP.txt b/groups/zen/MEM_DP.txt index 773d76513..1c2afcac1 100644 --- a/groups/zen/MEM_DP.txt +++ b/groups/zen/MEM_DP.txt @@ -18,14 +18,14 @@ CPI PMC1/PMC0 DP [MFLOP/s] 1.0E-06*(PMC2)/time Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*64.0/time Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*64.0 -Operational intensity PMC2/((DFC0+DFC1)*64.0) +Operational intensity [FLOP/Byte] PMC2/((DFC0+DFC1)*64.0) LONG Formulas: DP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL)/time Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_CHANNEL_0+DRAM_CHANNEL_1)*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(DRAM_CHANNEL_0+DRAM_CHANNEL_1)*64.0 -Operational intensity = RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL/((DRAM_CHANNEL_0+DRAM_CHANNEL_1)*64.0) +Operational intensity [FLOP/Byte] = RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL/((DRAM_CHANNEL_0+DRAM_CHANNEL_1)*64.0) - Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on a diff --git a/groups/zen/MEM_SP.txt b/groups/zen/MEM_SP.txt index 17f2dbbc3..59400e06d 100644 --- a/groups/zen/MEM_SP.txt +++ b/groups/zen/MEM_SP.txt @@ -18,14 +18,14 @@ CPI PMC1/PMC0 SP [MFLOP/s] 1.0E-06*(PMC2)/time Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*64.0/time Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*64.0 -Operational intensity PMC2/((DFC0+DFC1)*64.0) +Operational intensity [FLOP/Byte] PMC2/((DFC0+DFC1)*64.0) LONG Formulas: DP [MFLOP/s] = 1.0E-06*(RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL)/time Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_CHANNEL_0+DRAM_CHANNEL_1)*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(DRAM_CHANNEL_0+DRAM_CHANNEL_1)*64.0 -Operational intensity = RETIRED_SSE_AVX_FLOPS_SINGLE_ALL/((DRAM_CHANNEL_0+DRAM_CHANNEL_1)*64.0) +Operational intensity [FLOP/Byte] = RETIRED_SSE_AVX_FLOPS_SINGLE_ALL/((DRAM_CHANNEL_0+DRAM_CHANNEL_1)*64.0) - Profiling group to measure memory bandwidth drawn by all cores of 
a socket. Since this group is based on Uncore events it is only possible to measure on a diff --git a/groups/zen2/MEM_DP.txt b/groups/zen2/MEM_DP.txt index 3c08688c6..5890ff4bb 100644 --- a/groups/zen2/MEM_DP.txt +++ b/groups/zen2/MEM_DP.txt @@ -18,12 +18,13 @@ CPI PMC1/PMC0 DP [MFLOP/s] 1.0E-06*(PMC2)/time Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*(4.0/(num_numadomains/num_sockets))*64.0/time Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*(4.0/(num_numadomains/num_sockets))*64.0 -Operational intensity PMC2/((DFC0+DFC1)*(4.0/(num_numadomains/num_sockets))*64.0) +Operational intensity [FLOP/Byte] PMC2/((DFC0+DFC1)*(4.0/(num_numadomains/num_sockets))*64.0) LONG Formulas: Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_CHANNEL_0+DRAM_CHANNEL_1)*(4.0/(num_numadomains/num_sockets))*64.0/runtime Memory data volume [GBytes] = 1.0E-09*(DRAM_CHANNEL_0+DRAM_CHANNEL_1)*(4.0/(num_numadomains/num_sockets))*64.0 +Operational intensity [FLOP/Byte] = RETIRED_SSE_AVX_FLOPS_DOUBLE_ALL/((DRAM_CHANNEL_0+DRAM_CHANNEL_1)*(4.0/(num_numadomains/num_sockets))*64.0) - Profiling group to measure memory bandwidth drawn by all cores of a socket. Since this group is based on Uncore events it is only possible to measure on a diff --git a/groups/zen2/MEM_SP.txt b/groups/zen2/MEM_SP.txt index 928a7e578..5d0e81a97 100644 --- a/groups/zen2/MEM_SP.txt +++ b/groups/zen2/MEM_SP.txt @@ -18,7 +18,7 @@ CPI PMC1/PMC0 SP [MFLOP/s] 1.0E-06*(PMC2)/time Memory bandwidth [MBytes/s] 1.0E-06*(DFC0+DFC1)*(4.0/(num_numadomains/num_sockets))*64.0/time Memory data volume [GBytes] 1.0E-09*(DFC0+DFC1)*(4.0/(num_numadomains/num_sockets))*64.0 -Operational intensity PMC2/((DFC0+DFC1)*(4.0/(num_numadomains/num_sockets))*64.0) +Operational intensity [FLOP/Byte] PMC2/((DFC0+DFC1)*(4.0/(num_numadomains/num_sockets))*64.0) LONG Formulas: