In [1]:
import air.compiler.util

from air.mlir.dialects import func
from air.mlir.dialects import linalg
from air.mlir.ir import *
import air.mlir.passmanager

import sys

In [2]:
def matmul_on_tensors(m, n, k, dtype):
    """Build an MLIR module containing one `matmul` function on ranked tensors.

    The generated function takes three tensor arguments and applies
    `linalg.matmul` to them, using the third argument as the `outs` operand.

    Args:
        m: Row count of the left-hand operand and of the output.
        n: Column count of the right-hand operand and of the output.
        k: Shared (contracted) dimension of the two input operands.
        dtype: MLIR element type used for all three tensors.

    Returns:
        The newly created module holding the `matmul` function.

    Note:
        The call site in this notebook runs inside an active MLIR
        `Context` and `Location`; presumably that is required here too.
    """
    module = Module.create()

    # Argument types, in the order they appear in the function signature.
    lhs_type = RankedTensorType.get((m, k), dtype)
    rhs_type = RankedTensorType.get((k, n), dtype)
    out_type = RankedTensorType.get((m, n), dtype)

    with InsertionPoint(module.body):

        @func.FuncOp.from_py_func(lhs_type, rhs_type, out_type)
        def matmul(lhs, rhs, out):
            linalg.matmul(lhs, rhs, outs=[out])

    return module

In [3]:
with air.mlir.ir.Context(), Location.unknown():
    # Start from a 512x512x512 bf16 matmul expressed as linalg on tensors.
    air_module = matmul_on_tensors(512, 512, 512, BF16Type.get())

    # Stage 1: convert linalg on tensors to linalg on memrefs.
    bufferize_pm = air.mlir.passmanager.PassManager.parse(
        air.compiler.util.LINALG_TENSOR_TO_MEMREF_PIPELINE
    )
    bufferize_pm.run(air_module)

    # Stage 2: tile the matmul and map it to the AIR dialect, then clean up.
    air_codegen_passes = [
        "air-linalg-codegen{l1-tile-size=32,32,32 l1-promote=true l2-tile-size=64,64,64 l2-promote=true}",
        "air-par-to-herd{depth=1}",
        "air-copy-to-dma",
        "canonicalize",
        "cse",
    ]
    codegen_pm = air.mlir.passmanager.PassManager.parse(",".join(air_codegen_passes))
    codegen_pm.run(air_module)

    # Stage 3: generate the dependency information the runner needs.
    dependency_pm = air.mlir.passmanager.PassManager.parse(
        "air-dependency,canonicalize,cse"
    )
    dependency_pm.run(air_module)

    # Show the final (async) form of the module.
    print("\nAIR Dialect Module (async)\n")
    print(air_module)


AIR Dialect Module (async)

#map = affine_map<()[s0] -> (s0 * 32)>
module {
  func.func @matmul(%arg0: memref<512x512xbf16>, %arg1: memref<512x512xbf16>, %arg2: memref<512x512xbf16>) {
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c64 = arith.constant 64 : index
    %c512 = arith.constant 512 : index
    %c0 = arith.constant 0 : index
    %asyncToken, %valOut = air.execute async  {
      %1 = memref.alloc() {alignment = 128 : i64} : memref<512x512xbf16>
      air.execute_terminator %1 : memref<512x512xbf16>
    } {id = 1 : i32} : (memref<512x512xbf16>)
    %asyncToken_0 = air.execute async [%asyncToken]  : (!air.async.token) {
      memref.copy %arg2, %valOut : memref<512x512xbf16> to memref<512x512xbf16>
      air.execute_terminator
    } {id = 2 : i32}
    %0 = scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%c512, %c512) step (%c64, %c64) init (%asyncToken_0) -> !air.async.token {
      %1 = scf.for %arg5 = %c0 to %c512 step %c64 iter_args(%arg6 = %asyncTo

In [4]:
# Architecture description handed to air.compiler.util.Runner in later cells.

# Every interface below uses the same bandwidth.
_LINK_BYTES_PER_SECOND = 100000000000

# (src, dst) endpoint pairs, in the order the interfaces are listed.
# NOTE(review): 0/1/2 look like memory-level indices -- confirm against the
# runner's architecture schema.
_LINK_ENDPOINTS = [(0, 1), (1, 0), (0, 2), (2, 0), (1, 2), (2, 1)]

# Kernels the model knows about; each is listed with efficiency 1.
_KERNEL_NAMES = ["linalg.copy", "linalg.fill", "linalg.matmul"]

arch = {
    "clock": 1000000000,
    "cores": 1,
    # NOTE(review): labelled "fp16" while the module above is built with bf16;
    # both are 2 bytes wide -- confirm the label is intentional.
    "datatype": {"bytes": 2, "name": "fp16"},
    "devicename": "testdevice",
    "interfaces": [
        {"bytes_per_second": _LINK_BYTES_PER_SECOND, "dst": dst, "src": src}
        for src, dst in _LINK_ENDPOINTS
    ],
    "kernels": {
        kernel: {"efficiency": 1, "name": kernel} for kernel in _KERNEL_NAMES
    },
    "ops_per_core_per_cycle": 512,
    # Baseline configuration: a single herd slot and a single dispatch queue.
    "num_herd_slots": 1,
    "num_dispatch_queues": 1,
}


In [5]:
# Run the `matmul` function of the lowered module through the runner using
# the architecture description in `arch`, keeping the returned trace.
entry_point = "matmul"
runner = air.compiler.util.Runner(arch)
trace = runner.run(air_module, entry_point)

Finished at time 408136


In [6]:
# Re-run the same module with more parallel resources: 4 herd slots and
# 8 dispatch queues instead of the values set in `arch`.
#
# Build a modified copy rather than assigning into `arch`: mutating the shared
# dict would make the earlier baseline cell non-reproducible, because
# re-running it afterwards would silently use these larger values.
# Only top-level keys are overridden, so a shallow copy is sufficient.
arch_parallel = dict(arch, num_herd_slots=4, num_dispatch_queues=8)
runner = air.compiler.util.Runner(arch_parallel)
trace = runner.run(air_module, "matmul")

Finished at time 50751


In [7]:
# Save the trace returned by the most recent runner.run() call to disk.
with open("trace.out", "w") as trace_file:
    trace_file.write(trace)

Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved.

SPDX-License-Identifier: MIT