In [1]:
import air.compiler.util

from air.mlir.dialects import func
from air.mlir.dialects import linalg
from air.mlir.ir import *
import air.mlir.passmanager

import sys

In [2]:
def matmul_on_tensors(m, n, k, dtype):
    """Build a module containing one ``matmul`` function on ranked tensors.

    The generated function takes three ranked-tensor arguments whose element
    type is ``dtype``: a left operand of shape ``(m, k)``, a right operand of
    shape ``(k, n)`` and an output operand of shape ``(m, n)``. Its body is a
    single ``linalg.matmul`` that uses the third argument as its ``outs``
    operand; the Python body returns nothing.

    In this notebook the function is called inside an active ``Context`` and
    ``Location``.

    Args:
        m: Row count of the left operand and of the output.
        n: Column count of the right operand and of the output.
        k: Shared inner dimension of the two input operands.
        dtype: Element type used for all three tensors.

    Returns:
        The newly created module.
    """
    module = Module.create()
    with InsertionPoint(module.body):
        # Name the three operand types so the decorator line stays readable.
        lhs_type = RankedTensorType.get((m, k), dtype)
        rhs_type = RankedTensorType.get((k, n), dtype)
        out_type = RankedTensorType.get((m, n), dtype)

        # The Python function name ("matmul") is the name the notebook later
        # passes to the runner, so it must not be changed.
        @func.FuncOp.from_py_func(lhs_type, rhs_type, out_type)
        def matmul(lhs, rhs, out):
            linalg.matmul(lhs, rhs, outs=[out])

    return module

In [6]:
with air.mlir.ir.Context(), Location.unknown():
    # Shorthand for turning a textual pipeline into a pass manager.
    parse_pipeline = air.mlir.passmanager.PassManager.parse

    # Start from a 512x512x512 bf16 matmul expressed on tensors.
    air_module = matmul_on_tensors(512, 512, 512, BF16Type.get())

    # Step 1: convert linalg on tensors to linalg on memrefs.
    pm = parse_pipeline(air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE)
    pm.run(air_module)

    # Step 2: tile the matmul and map it onto AIR constructs.
    tiling_passes = (
        "air-linalg-codegen{l2-tile-size=128,128,128 l2-promote=true l1-tile-size=32,32,32 l1-promote=true}",
        "canonicalize",
        "cse",
        "air-par-to-herd{depth=1}",
        "air-copy-to-dma",
        "air-par-to-launch{has-air-segment=true}",
        "canonicalize",
        "cse",
    )
    pipeline = "builtin.module({})".format(",".join(tiling_passes))
    pm = parse_pipeline(pipeline)
    pm.run(air_module)

    print("\nAIR Dialect Module\n")
    print(air_module)

    # Step 3: generate the dependency information the runner consumes.
    dependency_passes = (
        "air-dependency",
        "air-dependency-schedule-opt",
        "air-specialize-dma-broadcast",
        "air-dma-to-channel",
        "canonicalize",
        "cse",
        "air-dependency-canonicalize",
        "air-dependency-parse-graph{output-dir=dot_graphs/}",
        "canonicalize",
        "cse",
        "air-place-herds{num-rows=4 num-cols=4 row-anchor=0 col-anchor=0}",
        "air-label-scf-for-to-ping-pong",
        "air-ping-pong-transform",
    )
    pipeline = "builtin.module({})".format(",".join(dependency_passes))
    pm = parse_pipeline(pipeline)
    pm.run(air_module)

    print("\nAIR Dialect Module (async)\n")
    print(air_module)


AIR Dialect Module

#map = affine_map<()[s0] -> (s0 * 128)>
#map1 = affine_map<()[s0] -> (s0 * 32)>
module {
  func.func @matmul(%arg0: memref<512x512xbf16>, %arg1: memref<512x512xbf16>, %arg2: memref<512x512xbf16>) {
    %c4 = arith.constant 4 : index
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<512x512xbf16>
    memref.copy %arg2, %alloc : memref<512x512xbf16> to memref<512x512xbf16>
    air.launch (%arg3, %arg4) in (%arg5=%c4, %arg6=%c4) args(%arg7=%arg0, %arg8=%arg1, %arg9=%alloc) : memref<512x512xbf16>, memref<512x512xbf16>, memref<512x512xbf16> {
      air.segment  args(%arg10=%arg3, %arg11=%arg4, %arg12=%arg7, %arg13=%arg8, %arg14=%arg9) : index, index, memref<512x512xbf16>, memref<512x512xbf16>, memref<512x512xbf16> {
        %c1 = arith.constant 1 : index
        %c4_0 = arith.constant 4 : index
        %c0 = arith.constant 0 : index
        %c512 = arith.constant 512 : index
        %c128 = arith.constant 128 : index
        %0 = affine.apply #map()[%arg10]
  

In [7]:
def _kernel(name, rate_key, rates):
    """Return one ``kernels`` entry.

    ``rates`` maps a datatype name to its per-core-per-cycle rate. Each
    datatype becomes ``{rate_key: rate, "efficiency": 1}``, and the kernel's
    own name is stored after the datatype table.
    """
    return {
        "datatypes": {
            dtype_name: {rate_key: rate, "efficiency": 1}
            for dtype_name, rate in rates.items()
        },
        "name": name,
    }


def _ports(count):
    """Return an outbound/inbound port pair, each with ``count`` ports."""
    return {
        direction: {"count": count, "bytes_per_second": 4000000000}
        for direction in ("outbound", "inbound")
    }


def _memory(space, size):
    """Return a memory description for the named space and byte size."""
    return {"memory_space": space, "bytes": size}


# Device description handed to the runner in a later cell. Key order matches
# the hand-written literal this replaces, so any serialised form is unchanged.
arch = {
    "clock": 1000000000,
    "cores": 1,
    "datatypes": [
        {"bytes": width, "name": dtype_name}
        for dtype_name, width in (("i8", 1), ("bf16", 2), ("i32", 4))
    ],
    "devicename": "testdevice",
    "kernels": {
        "linalg.copy": _kernel(
            "linalg.copy",
            "ops_per_core_per_cycle",
            {"i8": 32, "bf16": 32, "i32": 16},
        ),
        "linalg.fill": _kernel(
            "linalg.fill",
            "ops_per_core_per_cycle",
            {"i8": 32, "bf16": 32, "i32": 16},
        ),
        "linalg.generic": _kernel(
            "linalg.generic",
            "ops_per_core_per_cycle",
            {"i8": 1, "bf16": 1, "i32": 1},
        ),
        # Unlike the other kernels, matmul is rated in MACs, not ops.
        "linalg.matmul": _kernel(
            "linalg.matmul",
            "macs_per_core_per_cycle",
            {"i8": 256, "bf16": 128, "i32": 32},
        ),
    },
    "dus": {
        "count": [4, 4],
        "memory": _memory("L2", 524288),
        "ports": _ports(6),
        "tiles": {
            "count": [1, 4],
            "memory": _memory("L1", 65536),
            "ports": _ports(2),
        },
    },
    "noc": _ports(4),
}


In [5]:
# Run the "matmul" function of the lowered module through the AIR runner,
# using the `arch` description defined in an earlier cell.
# NOTE(review): the second argument looks like the trace output file and the
# third ("core") like a simulation granularity; confirm against the Runner API.
trace_path = "trace.out"
runner = air.compiler.util.Runner(arch, trace_path, "core")
trace = runner.run(air_module, "matmul")

Latency: 831.857us


Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.

SPDX-License-Identifier: MIT