ROCm · hcman2 · Dec 6, 2022 · Nov 22, 2022 · Nov 25, 2022 · Nov 28, 2022
@@ -24,6 +24,7 @@
 
 from ..Component import Component, MAC
 from ..DataType import DataType
+import queue
 
 class MAC_F32_Plain(MAC):
     """
@@ -47,6 +48,10 @@ def __call__(self, writer, m, innerUnroll):
             else:
                 raise RuntimeError("FMA instruction specified but not supported on {}".format(kernel["ISA"]))
 
+        dualMacEnable = 0
+        if writer.asmCaps["v_dual_fmac_f32"] and kernel["WavefrontSize"] == 32:
+            dualMacEnable = 1
+
         if not writer.asmCaps[instruction]:
             raise RuntimeError("{} instruction specified but not supported on {}".format(instruction, kernel["ISA"]))
 
@@ -67,6 +72,8 @@ def __call__(self, writer, m, innerUnroll):
         priority = Component.Priority.find(writer)
         macIdx = 0
 
+        instQ = queue.Queue()
+
         for iui in range(0, innerUnroll):
             for idx1 in range(0, kernel["ThreadTile1"]):
                 for idx0 in range(0, kernel["ThreadTile0"]):
@@ -80,19 +87,67 @@ def __call__(self, writer, m, innerUnroll):
                     vars["aStr"] = "v[vgprValuA_X{m}_I{iui} + {a}]".format_map(vars)
                     vars["bStr"] = "v[vgprValuB_X{m}_I{iui} + {b}]".format_map(vars)
 
-                    if instruction == "v_fma_f32":
-                        kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(vars)
-                    else:
-                        kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(vars)
+                    if dualMacEnable == 1:
+                        instVars = {}
+                        instVars["endLine"] = writer.endLine
+                        instVars["cStr"] = vars["cStr"]
+                        instVars["aStr"] = vars["aStr"]
+                        instVars["bStr"] = vars["bStr"]
+                        instVars["a"] = vars["a"]
+                        instVars["b"] = vars["b"]
+                        instVars["instruction"] = instruction
+
+                        if instQ.empty():
+                            instQ.put(instVars)
+                        else:
+                            # peek at the oldest queued instruction (it is only removed below if it can be paired)
+                            prevVars = instQ.queue[0]
+
+                            if self.isLegal(instVars, prevVars):
+                                # make dual fmac
+                                kStr += "v_dual_fmac_f32 {cStr}, {aStr}, {bStr}".format_map(prevVars) + " :: v_dual_fmac_f32 {cStr}, {aStr}, {bStr}{endLine}".format_map(vars)
+                                kStr += priority(writer, 1, "Raise priority while processing macs")
+                                instQ.get()
+                            else:
+                                # push instruction
+                                instQ.put(instVars)
 
-                    kStr += priority(writer, 1, "Raise priority while processing macs")
+                    else:
+                        if instruction == "v_fma_f32":
+                            kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(vars)
+                        else:
+                            kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(vars)
 
                     if macIdx == kernel["PerformanceWaitLocation"]:
+                        kStr += self.popAllInstructions(instruction, instQ, priority, writer)
                         kStr += "s_waitcnt lgkmcnt({PerformanceWaitCount}) // extra wait for performance{endLine}".format_map(vars)
                     if macIdx == kernel["PerformanceSyncLocation"]:
+                        kStr += self.popAllInstructions(instruction, instQ, priority, writer)
                         kStr += "s_barrier // extra barrier for performance{endLine}".format_map(vars)
                     macIdx += 1
 
+        kStr += self.popAllInstructions(instruction, instQ, priority, writer)
         kStr += priority(writer, 0, "Reset priority after macs")
 
         return kStr
+
+    def popAllInstructions(self, inst, instructionQueue, priority, writer):
+        # pop all instructions
+        kStr = ""
+        while instructionQueue.qsize() > 0:
+            prevVars = instructionQueue.get()
+            if inst == "v_fma_f32":
+                kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(prevVars)
+            else:
+                kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(prevVars)
+            kStr += priority(writer, 1, "Raise priority while processing macs")
+        return kStr
+
+    def isLegal(self, instVars0, instVars1):
+        # VPOD has some restructions.
+        # For avoiding VGPR source-cache port limits, guarantee at least 1 duplicated SRC.
+        if instVars0["cStr"] == instVars1["cStr"]:
+            return False
+        if instVars0["a"] == instVars1["a"] or instVars0["b"] == instVars1["b"]:
+            return True
+        return False
@@ -0,0 +1,95 @@
+TestParameters:
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch
+
+# benchmark assembly kernels
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  NumElementsToValidate: -1
+  BoundsCheck: True
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # sgemm NN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      DestDataType: s
+      TransposeA: False
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 1, 11 ]
+          - [ 3, 5 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+          - [ 13, 3 ]
+        - WorkGroup:
+          - [ 32,  4,  1 ]
+          - [  8,  8,  1 ]
+          - [  4,  8,  4 ]
+        - DepthU: [-3]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - WavefrontSize: [32, 64]
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [False]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  8,  1 ]
+          - [  4,  4,  4 ]
+        - DepthU: [-1]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+
@@ -0,0 +1,94 @@
+TestParameters:
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch
+
+# benchmark assembly kernels
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  NumElementsToValidate: -1
+  BoundsCheck: True
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # sgemm NT
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      DestDataType: s
+      TransposeA: False
+      TransposeB: True
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 1, 11 ]
+          - [ 3, 5 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+          - [ 13, 3 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8, 16,  1 ]
+          - [  2,  8,  8 ]
+        - DepthU: [-3]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - WavefrontSize: [32, 64]
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [True]
+        - ThreadTile:
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [ 16,  8,  1 ]
+          - [ 16,  2,  8 ]
+        - DepthU: [-1]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
@@ -0,0 +1,92 @@
+TestParameters:
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch
+
+# benchmark assembly kernels
+GlobalParameters:
+  MinimumRequiredVersion: 4.2.0
+  CMakeBuildType: Release
+  PrintLevel: 1
+  ForceRedoBenchmarkProblems: True
+  ForceRedoLibraryLogic: True
+  ForceRedoLibraryClient: True
+  EnqueuesPerSync: 1
+  SyncsPerBenchmark: 1
+  NumElementsToValidate: -1
+  BoundsCheck: True
+  ValidationMaxToPrint: 4
+  ValidationPrintValids: False
+  ShortNames: False
+  MergeFiles: True
+  DataInitTypeAB: 3
+  DataInitTypeC: 3
+  KernelTime: True
+
+BenchmarkProblems:
+
+  - # sgemm TN
+    - # ProblemType
+      OperationType: GEMM
+      DataType: s
+      DestDataType: s
+      TransposeA: True
+      TransposeB: False
+      UseBeta: True
+      Batched: True
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:
+        - WavefrontSize: [32]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 1, 11 ]
+          - [ 3, 5 ]
+          - [ 4, 8 ]
+          - [ 8, 8 ]
+          - [ 13, 3 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [ 32,  4,  1 ]
+        - DepthU: [-4]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+    - # BenchmarkProblemSizeGroup - Assembly
+      InitialSolutionParameters:
+      BenchmarkCommonParameters:
+        - LoopTail: [True]
+        - EdgeType: ["ShiftPtr"]
+      ForkParameters:
+        - KernelLanguage: ["Assembly"]
+        - GlobalSplitU: [1, 3]
+        - PrefetchLocalRead: [True]
+        - PrefetchGlobalRead: [False]
+        - ThreadTile:
+          - [ 3, 3 ]
+          - [ 4, 4 ]
+          - [ 5, 5 ]
+          - [ 8, 8 ]
+        - WorkGroup:
+          - [ 16, 16,  1 ]
+          - [  8,  8,  1 ]
+        - DepthU: [-1]
+        - VectorWidth: [-1]
+      BenchmarkForkParameters:
+      JoinParameters:
+      BenchmarkJoinParameters:
+      BenchmarkFinalParameters:
+        - ProblemSizes:
+          - Range: [ [127,1,129], 0, [2], [63,1,65] ]
+
+