Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dual mac instruction for gfx11. #1625

Merged
merged 5 commits into from
Dec 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 15 additions & 14 deletions Tensile/Common.py

Large diffs are not rendered by default.

65 changes: 60 additions & 5 deletions Tensile/Components/MAC_F32.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from ..Component import Component, MAC
from ..DataType import DataType
import queue

class MAC_F32_Plain(MAC):
"""
Expand All @@ -47,6 +48,10 @@ def __call__(self, writer, m, innerUnroll):
else:
raise RuntimeError("FMA instruction specified but not supported on {}".format(kernel["ISA"]))

dualMacEnable = 0
if writer.asmCaps["v_dual_fmac_f32"] and kernel["WavefrontSize"] == 32:
nakajee marked this conversation as resolved.
Show resolved Hide resolved
dualMacEnable = 1

if not writer.asmCaps[instruction]:
raise RuntimeError("{} instruction specified but not supported on {}".format(instruction, kernel["ISA"]))

Expand All @@ -67,6 +72,8 @@ def __call__(self, writer, m, innerUnroll):
priority = Component.Priority.find(writer)
macIdx = 0

instQ = queue.Queue()

for iui in range(0, innerUnroll):
for idx1 in range(0, kernel["ThreadTile1"]):
hcman2 marked this conversation as resolved.
Show resolved Hide resolved
for idx0 in range(0, kernel["ThreadTile0"]):
Expand All @@ -80,19 +87,67 @@ def __call__(self, writer, m, innerUnroll):
vars["aStr"] = "v[vgprValuA_X{m}_I{iui} + {a}]".format_map(vars)
vars["bStr"] = "v[vgprValuB_X{m}_I{iui} + {b}]".format_map(vars)

if instruction == "v_fma_f32":
kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(vars)
else:
kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(vars)
if dualMacEnable == 1:
instVars = {}
nakajee marked this conversation as resolved.
Show resolved Hide resolved
instVars["endLine"] = writer.endLine
instVars["cStr"] = vars["cStr"]
instVars["aStr"] = vars["aStr"]
instVars["bStr"] = vars["bStr"]
instVars["a"] = vars["a"]
instVars["b"] = vars["b"]
instVars["instruction"] = instruction

if instQ.empty():
instQ.put(instVars)
else:
# pop instruction
prevVars = instQ.queue[0]

if self.isLegal(instVars, prevVars):
# make dual fmac
kStr += "v_dual_fmac_f32 {cStr}, {aStr}, {bStr}".format_map(prevVars) + " :: v_dual_fmac_f32 {cStr}, {aStr}, {bStr}{endLine}".format_map(vars)
kStr += priority(writer, 1, "Raise priority while processing macs")
instQ.get()
else:
# push instruction
instQ.put(instVars)

kStr += priority(writer, 1, "Raise priority while processing macs")
else:
if instruction == "v_fma_f32":
kStr += "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}".format_map(vars)
else:
kStr += "{instruction} {cStr}, {aStr}, {bStr}{endLine}".format_map(vars)

if macIdx == kernel["PerformanceWaitLocation"]:
kStr += self.popAllInstructions(instruction, instQ, priority, writer)
kStr += "s_waitcnt lgkmcnt({PerformanceWaitCount}) // extra wait for performance{endLine}".format_map(vars)
if macIdx == kernel["PerformanceSyncLocation"]:
kStr += self.popAllInstructions(instruction, instQ, priority, writer)
kStr += "s_barrier // extra barrier for performance{endLine}".format_map(vars)
macIdx += 1

kStr += self.popAllInstructions(instruction, instQ, priority, writer)
kStr += priority(writer, 0, "Reset priority after macs")

return kStr

def popAllInstructions(self, inst, instructionQueue, priority, writer):
    """
    Drain every pending MAC from the queue and emit each one as a single
    (non-dual) instruction.

    inst:             mnemonic selected by the caller; "v_fma_f32" selects the
                      four-operand form, anything else the three-operand form
                      whose mnemonic is read from the queued entry itself.
    instructionQueue: queue of dicts providing the "cStr", "aStr", "bStr" and
                      "endLine" keys (plus "instruction" for the non-FMA form).
    priority:         callable invoked as priority(writer, 1, comment) after
                      each emitted instruction; its result is appended as-is.
    writer:           passed through to priority untouched.

    Returns the concatenated text; an empty string if nothing was queued.
    The queue is empty when this returns.
    """
    # The operand layout depends only on inst, so choose it once up front.
    if inst == "v_fma_f32":
        template = "v_fma_f32 {cStr}, {aStr}, {bStr}, {cStr}{endLine}"
    else:
        template = "{instruction} {cStr}, {aStr}, {bStr}{endLine}"

    pieces = []
    while instructionQueue.qsize() > 0:
        pendingVars = instructionQueue.get()
        pieces.append(template.format_map(pendingVars))
        pieces.append(priority(writer, 1, "Raise priority while processing macs"))
    return "".join(pieces)

def isLegal(self, instVars0, instVars1):
    """
    Decide whether two queued MACs may be fused into one dual-issue (VOPD)
    instruction.

    Both arguments are dicts carrying the destination operand text under
    "cStr" and the A/B source indices under "a" and "b".

    Returns True only when the destinations differ and the two MACs share
    at least one source index. The original author's note gives the reason
    for the shared-source requirement as avoiding VGPR source-cache port
    limits.
    """
    # Two halves writing the same accumulator can never be paired.
    if instVars0["cStr"] == instVars1["cStr"]:
        return False
    # Require a duplicated source: same A index or, failing that, same B index.
    return any(instVars0[key] == instVars1[key] for key in ("a", "b"))
95 changes: 95 additions & 0 deletions Tensile/Tests/pre_checkin/wave32/sgemm_asm_nn_wv32.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
TestParameters:
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch

# benchmark assembly and source kernels
GlobalParameters:
MinimumRequiredVersion: 4.2.0
CMakeBuildType: Release
PrintLevel: 1
ForceRedoBenchmarkProblems: True
ForceRedoLibraryLogic: True
ForceRedoLibraryClient: True
EnqueuesPerSync: 1
SyncsPerBenchmark: 1
NumElementsToValidate: -1
BoundsCheck: True
ValidationMaxToPrint: 4
ValidationPrintValids: False
ShortNames: False
MergeFiles: True
DataInitTypeAB: 3
DataInitTypeC: 3
KernelTime: True

BenchmarkProblems:

- # sgemm NN
- # ProblemType
OperationType: GEMM
DataType: s
DestDataType: s
TransposeA: False
TransposeB: False
UseBeta: True
Batched: True

- # BenchmarkProblemSizeGroup - Assembly
InitialSolutionParameters:
BenchmarkCommonParameters:
- LoopTail: [True]
- EdgeType: ["ShiftPtr"]
- KernelLanguage: ["Assembly"]
ForkParameters:
- WavefrontSize: [32]
- GlobalSplitU: [1, 3]
- PrefetchLocalRead: [False]
- PrefetchGlobalRead: [True]
- ThreadTile:
- [ 1, 11 ]
- [ 3, 5 ]
- [ 4, 8 ]
- [ 8, 8 ]
- [ 13, 3 ]
- WorkGroup:
- [ 32, 4, 1 ]
- [ 8, 8, 1 ]
- [ 4, 8, 4 ]
- DepthU: [-3]
- VectorWidth: [-1]
BenchmarkForkParameters:
JoinParameters:
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [ [127,1,129], 0, [2], [63,1,65] ]

- # BenchmarkProblemSizeGroup - Assembly
InitialSolutionParameters:
BenchmarkCommonParameters:
- LoopTail: [True]
- EdgeType: ["ShiftPtr"]
ForkParameters:
- WavefrontSize: [32, 64]
- KernelLanguage: ["Assembly"]
- GlobalSplitU: [1, 3]
- PrefetchLocalRead: [False]
- PrefetchGlobalRead: [True]
- ThreadTile:
- [ 3, 3 ]
- [ 4, 4 ]
- [ 5, 5 ]
- [ 8, 8 ]
- WorkGroup:
- [ 16, 16, 1 ]
- [ 8, 8, 1 ]
- [ 4, 4, 4 ]
- DepthU: [-1]
- VectorWidth: [-1]
BenchmarkForkParameters:
JoinParameters:
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [ [127,1,129], 0, [2], [63,1,65] ]


94 changes: 94 additions & 0 deletions Tensile/Tests/pre_checkin/wave32/sgemm_asm_nt_wv32.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
TestParameters:
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch

# benchmark assembly and source kernels
GlobalParameters:
MinimumRequiredVersion: 4.2.0
CMakeBuildType: Release
PrintLevel: 1
ForceRedoBenchmarkProblems: True
ForceRedoLibraryLogic: True
ForceRedoLibraryClient: True
EnqueuesPerSync: 1
SyncsPerBenchmark: 1
NumElementsToValidate: -1
BoundsCheck: True
ValidationMaxToPrint: 4
ValidationPrintValids: False
ShortNames: False
MergeFiles: True
DataInitTypeAB: 3
DataInitTypeC: 3
KernelTime: True

BenchmarkProblems:

- # sgemm NT
- # ProblemType
OperationType: GEMM
DataType: s
DestDataType: s
TransposeA: False
TransposeB: True
UseBeta: True
Batched: True

- # BenchmarkProblemSizeGroup - Assembly
InitialSolutionParameters:
BenchmarkCommonParameters:
- LoopTail: [True]
- EdgeType: ["ShiftPtr"]
- KernelLanguage: ["Assembly"]
ForkParameters:
- WavefrontSize: [32]
- GlobalSplitU: [1, 3]
- PrefetchLocalRead: [True]
- PrefetchGlobalRead: [True]
- ThreadTile:
- [ 1, 11 ]
- [ 3, 5 ]
- [ 4, 8 ]
- [ 8, 8 ]
- [ 13, 3 ]
- WorkGroup:
- [ 16, 16, 1 ]
- [ 8, 16, 1 ]
- [ 2, 8, 8 ]
- DepthU: [-3]
- VectorWidth: [-1]
BenchmarkForkParameters:
JoinParameters:
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [ [127,1,129], 0, [2], [63,1,65] ]

- # BenchmarkProblemSizeGroup - Assembly
InitialSolutionParameters:
BenchmarkCommonParameters:
- LoopTail: [True]
- EdgeType: ["ShiftPtr"]
ForkParameters:
- WavefrontSize: [32, 64]
- KernelLanguage: ["Assembly"]
- GlobalSplitU: [1, 3]
- PrefetchLocalRead: [True]
- PrefetchGlobalRead: [True]
- ThreadTile:
- [ 3, 3 ]
- [ 4, 4 ]
- [ 5, 5 ]
- [ 8, 8 ]
- WorkGroup:
- [ 16, 16, 1 ]
- [ 16, 8, 1 ]
- [ 16, 2, 8 ]
- DepthU: [-1]
- VectorWidth: [-1]
BenchmarkForkParameters:
JoinParameters:
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [ [127,1,129], 0, [2], [63,1,65] ]

92 changes: 92 additions & 0 deletions Tensile/Tests/pre_checkin/wave32/sgemm_asm_tn_wv32.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
TestParameters:
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030] # not supported by arch

# benchmark assembly and source kernels
GlobalParameters:
MinimumRequiredVersion: 4.2.0
CMakeBuildType: Release
PrintLevel: 1
ForceRedoBenchmarkProblems: True
ForceRedoLibraryLogic: True
ForceRedoLibraryClient: True
EnqueuesPerSync: 1
SyncsPerBenchmark: 1
NumElementsToValidate: -1
BoundsCheck: True
ValidationMaxToPrint: 4
ValidationPrintValids: False
ShortNames: False
MergeFiles: True
DataInitTypeAB: 3
DataInitTypeC: 3
KernelTime: True

BenchmarkProblems:

- # sgemm TN
- # ProblemType
OperationType: GEMM
DataType: s
DestDataType: s
TransposeA: True
TransposeB: False
UseBeta: True
Batched: True

- # BenchmarkProblemSizeGroup - Assembly
InitialSolutionParameters:
BenchmarkCommonParameters:
- LoopTail: [True]
- EdgeType: ["ShiftPtr"]
- KernelLanguage: ["Assembly"]
ForkParameters:
- WavefrontSize: [32]
- GlobalSplitU: [1, 3]
- PrefetchLocalRead: [True]
- PrefetchGlobalRead: [False]
- ThreadTile:
- [ 1, 11 ]
- [ 3, 5 ]
- [ 4, 8 ]
- [ 8, 8 ]
- [ 13, 3 ]
- WorkGroup:
- [ 16, 16, 1 ]
- [ 32, 4, 1 ]
- DepthU: [-4]
- VectorWidth: [-1]
BenchmarkForkParameters:
JoinParameters:
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [ [127,1,129], 0, [2], [63,1,65] ]

- # BenchmarkProblemSizeGroup - Assembly
InitialSolutionParameters:
BenchmarkCommonParameters:
- LoopTail: [True]
- EdgeType: ["ShiftPtr"]
ForkParameters:
- KernelLanguage: ["Assembly"]
- GlobalSplitU: [1, 3]
- PrefetchLocalRead: [True]
- PrefetchGlobalRead: [False]
- ThreadTile:
- [ 3, 3 ]
- [ 4, 4 ]
- [ 5, 5 ]
- [ 8, 8 ]
- WorkGroup:
- [ 16, 16, 1 ]
- [ 8, 8, 1 ]
- DepthU: [-1]
- VectorWidth: [-1]
BenchmarkForkParameters:
JoinParameters:
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [ [127,1,129], 0, [2], [63,1,65] ]


Loading