## Prepare
获取硬件信息，如内存大小，缓存级别

In [None]:
# from mlir.ir import Context, Module
import ctypes
import numpy as np
import gc, sys, os, tempfile
# from mlir import ir
from mlir.ir import *
from mlir.dialects import builtin
from mlir.dialects import func
from mlir.dialects import linalg
from mlir.passmanager import *
# from mlir import runtime as rt
from mlir.runtime import *
# from mlir import execution_engine
from mlir.execution_engine import *
from mlir.dialects.linalg.opdsl import lang as dsl

import mlir.dialects.gpu
import mlir.dialects.gpu.passes

In [None]:
def toGPU(module):
  """Run the GPU lowering pass pipelines over `module` and return it.

  Three pass managers are run back to back on the same `module` object:

  1. linalg -> parallel loops -> GPU launch, kernel outlining, then the
     listed affine/scf/arith/memref/vector conversions and canonicalization;
  2. a pipeline nested on `gpu.module` (strip debug info, convert to NVVM,
     `gpu-to-cubin`);
  3. `gpu-to-llvm`.

  Args:
    module: the MLIR module to run the passes on.

  Returns:
    The same `module` object that was passed in.
  """
  # Stage 1: passes added one by one, in this exact order.
  stage_one_passes = (
      "func.func(convert-linalg-to-parallel-loops)",
      "convert-parallel-loops-to-gpu",
      "gpu-kernel-outlining",
      "func.func(lower-affine)",
      "func.func(convert-scf-to-cf)",
      "func.func(arith-expand)",
      "func.func(memref-expand)",
      "convert-vector-to-llvm",
      "finalize-memref-to-llvm",
      "func.func(canonicalize)",
  )
  pipeline = PassManager()
  for pass_spec in stage_one_passes:
    pipeline.add(pass_spec)
  pipeline.run(module)

  # Stage 2: pipeline anchored on the outlined gpu.module ops.
  pipeline = PassManager.parse(
      'builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))')
  pipeline.run(module)

  # Stage 3: single gpu-to-llvm pass.
  pipeline = PassManager()
  pipeline.add("gpu-to-llvm")
  pipeline.run(module)

  return module

def transform(module):
  """Run the CPU lowering pass pipeline over `module` and return it.

  A single pass manager anchored on `builtin.module` is populated with the
  passes below (linalg to loops, then the listed conversion passes, ending
  with `reconcile-unrealized-casts`) and run once.

  Args:
    module: the MLIR module to run the passes on.

  Returns:
    The same `module` object that was passed in.
  """
  # Order matters: passes are added and executed in the order listed here.
  lowering_passes = (
      "func.func(convert-linalg-to-loops)",
      "func.func(lower-affine)",
      "func.func(convert-math-to-llvm)",
      "func.func(convert-scf-to-cf)",
      "func.func(arith-expand)",
      "func.func(memref-expand)",
      "convert-vector-to-llvm",
      "finalize-memref-to-llvm",
      "convert-func-to-llvm",
      "reconcile-unrealized-casts",
  )
  pipeline = PassManager('builtin.module')
  for pass_spec in lowering_passes:
    pipeline.add(pass_spec)
  pipeline.run(module)
  return module

def run(f):
  """Call test function `f` and assert that no MLIR `Context` is left alive.

  A "TEST:" banner with the function's name is printed before the call.

  Args:
    f: a zero-argument callable.

  Returns:
    `f` itself, unchanged.

  Raises:
    AssertionError: if `Context._get_live_count()` is non-zero after the
      garbage collection pass.
  """
  print(f"\nTEST: {f.__name__}")
  f()
  # Collect first so that unreachable contexts are not counted as live.
  gc.collect()
  live_contexts = Context._get_live_count()
  assert live_contexts == 0
  return f

## 初始


In [11]:
# Matrix dimensions used by the tests below: the inputs are M x K and K x N,
# and the output is M x N.
M, K, N = 10000, 500, 1000

# MLIR source for a module with a single function `@matmul` that applies
# `linalg.matmul` to three dynamically shaped f32 memrefs (two inputs, one
# output). The function carries the `llvm.emit_c_interface` attribute.
matmul_expr = r"""
  module  {
    func.func @matmul(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>) attributes {llvm.emit_c_interface} {
      linalg.matmul 
        ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
       outs(%c:memref<?x?xf32>)
      return
      }
  }
"""

### 测试matmul

In [None]:
def testMatmul():
  """JIT-compile `matmul_expr` for the CPU, run it, and compare with NumPy.

  Steps:
    1. Parse `matmul_expr` inside a fresh MLIR `Context`.
    2. Create random float32 inputs of shape (M, K) and (K, N) and a zeroed
       (M, N) output buffer.
    3. Lower the module with `transform` and invoke `@matmul` through
       `ExecutionEngine`, passing pointer-to-pointer memref descriptors.
    4. Print the output buffer and whether it matches `np.matmul`.

  If the invocation raises `RuntimeError`, the error is printed and the
  function returns without running the comparison, because the output buffer
  would still hold its initial zeros and the printed result would be
  misleading.
  """
  with Context():
    module = Module.parse(matmul_expr)

    arg1 = np.random.randn(M, K).astype(np.float32)
    arg2 = np.random.randn(K, N).astype(np.float32)
    res = np.zeros((M, N), dtype=arg1.dtype)

    # `invoke` is given a pointer to a pointer to each ranked memref descriptor.
    arg1_memref_ptr = ctypes.pointer(
        ctypes.pointer(get_ranked_memref_descriptor(arg1)))
    arg2_memref_ptr = ctypes.pointer(
        ctypes.pointer(get_ranked_memref_descriptor(arg2)))
    res_memref_ptr = ctypes.pointer(
        ctypes.pointer(get_ranked_memref_descriptor(res)))

    execution_engine = ExecutionEngine(transform(module))
    try:
      execution_engine.invoke("matmul", arg1_memref_ptr, arg2_memref_ptr,
                              res_memref_ptr)
    except RuntimeError as e:
      print("Got RuntimeError: ", e)
      # Nothing was computed; skip the output dump and the comparison.
      return

    # View the result through its memref descriptor.
    npout = ranked_memref_to_numpy(res_memref_ptr[0])
    print(npout)

    # np.allclose defaults (rtol=1e-5, atol=1e-8) are too strict here: each
    # output is a float32 sum of K products, and entries close to zero get
    # almost no tolerance from the relative term. The compiled code and
    # np.matmul presumably accumulate in different orders, so use tolerances
    # sized for float32 round-off instead.
    print(np.allclose(np.matmul(arg1, arg2), res, rtol=1e-4, atol=1e-3))

testMatmul()

在 CPU 上运行耗时 8m46.4s。
运行环境受限，无法指定在GPU上运行
引入IREE

In [None]:
def testMatmulGPU():
  """Lower `matmul_expr` with `toGPU` and invoke `@matmul` via ExecutionEngine.

  Random float32 inputs of shape (M, K) and (K, N) and a zeroed (M, N) output
  buffer are passed as memref descriptors. The engine is created with
  `opt_level=3` and the three runtime shared libraries listed below (paths
  are relative to the working directory).

  A `RuntimeError` from the invocation is caught and printed. The result is
  not printed or compared against NumPy.
  """
  def as_memref_arg(array):
    # Pointer-to-pointer to the array's ranked memref descriptor, the form
    # handed to `invoke` below.
    return ctypes.pointer(ctypes.pointer(get_ranked_memref_descriptor(array)))

  with Context():
    gpu_module = Module.parse(matmul_expr)

    lhs = np.random.randn(M, K).astype(np.float32)
    rhs = np.random.randn(K, N).astype(np.float32)
    out = np.zeros((M, N), dtype=lhs.dtype)

    lhs_arg = as_memref_arg(lhs)
    rhs_arg = as_memref_arg(rhs)
    out_arg = as_memref_arg(out)

    runtime_libs = [
        "../../../../llvm-project/build/lib/libmlir_runner_utils.so",
        "../../../../llvm-project/build/lib/libmlir_c_runner_utils.so",
        "../../../../llvm-project/build/lib/libmlir_cuda_runtime.so",
    ]

    engine = ExecutionEngine(
        toGPU(gpu_module), opt_level=3, shared_libs=runtime_libs)
    try:
      engine.invoke("matmul", lhs_arg, rhs_arg, out_arg)
    except RuntimeError as err:
      print("Got RuntimeError: ", err)

testMatmulGPU()

引入IREE

In [13]:
import contextlib
import logging
import os
import io
import tempfile
import unittest

import iree.compiler.tools

from iree.compiler import ir
from iree.compiler import passmanager
from iree.compiler.transforms import ireec
# The compiler re-exports API access to a number of dialects. If one of these
# fails to import, it indicates a build issue.
from iree.compiler.dialects import arith
#from iree.compiler.dialects import chlo
#from iree.compiler.dialects import mhlo
from iree.compiler.dialects import iree_input
from iree.compiler.dialects import builtin
from iree.compiler.dialects import linalg
from iree.compiler.dialects import math
from iree.compiler.dialects import memref
from iree.compiler.dialects import pdl
from iree.compiler.dialects import shape
from iree.compiler.dialects import tensor
from iree.compiler.dialects import tosa
from iree.compiler.dialects import vector

In [18]:
def testExtraArgsStderr():
    """Compile `matmul_expr` with the IREE compiler tools and print the result.

    The compile call requests MLIR text output for the default testing
    backends and passes `--mlir-timing` as an extra argument. stderr is
    redirected into an in-memory buffer for the duration of the call; its
    contents are read but not printed.
    """
    tools = iree.compiler.tools
    # --mlir-timing is nothing special here: it is just a flag that makes the
    # compiler write something to stderr.
    with io.StringIO() as captured, contextlib.redirect_stderr(captured):
        compiled = tools.compile_str(
            matmul_expr,
            output_format=tools.OutputFormat.MLIR_TEXT,
            extra_args=["--mlir-timing"],
            target_backends=tools.DEFAULT_TESTING_BACKENDS)
        text = compiled.decode("utf-8")
        # Captured stderr output; kept for inspection, currently unused.
        stderr_text = captured.getvalue()
    print(text)

testExtraArgsStderr()

vm.module public @module attributes {ordinal_counts = #vm.ordinal_counts<import_funcs = 0, export_funcs = 1, internal_funcs = 1, global_bytes = 0, global_refs = 0, rodatas = 0, rwdatas = 0>} {
  vm.func private @matmul(%arg0: !vm.ref<memref<?x?xf32>> loc(unknown), %arg1: !vm.ref<memref<?x?xf32>> loc(unknown), %arg2: !vm.ref<memref<?x?xf32>> loc(unknown)) attributes {ordinal = 0 : i32} {
    vm.return loc("<stdin>":3:5)
  } loc("<stdin>":3:5)
  vm.export @matmul attributes {iree.abi.stub, ordinal = 0 : i32} loc("<stdin>":3:5)
} loc("<stdin>":2:3)



In [None]:
# export LLVM_INSTALL_DIR=/work/shared/common/llvm-project-gpu
# export LD_LIBRARY_PATH=$LLVM_INSTALL_DIR/build/lib:$LD_LIBRARY_PATH
# export LD_LIBRARY_PATH=/work/shared/common/usr/local/lib:/work/shared/common/usr/local/lib64:$LD_LIBRARY_PATH
# export PATH=/work/shared/common/llvm-project-gpu/build/bin/:$PATH

# mlir-opt matmul.mlir \
#     --convert-linalg-to-parallel-loops \
#     --test-gpu-greedy-parallel-loop-mapping \
#     --convert-parallel-loops-to-gpu \
#     --gpu-kernel-outlining \
#     --lower-affine \
#     --convert-scf-to-std \
#     --canonicalize \
#     --pass-pipeline="gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin)" --gpu-to-llvm > matmul.mlir.llvm
# mlir-translate matmul.mlir.llvm -mlir-to-llvmir > matmul.ll 
# opt matmul.ll -O3 -S | llc -O3 -o matmul.s 
# as -o matmul.o matmul.s 
# clang++ matmul.o -L$LLVM_INSTALL_DIR/build/lib -o exec -lcuda -lmlir_cuda_runtime -lmlir_runner_utils -lmlir_c_runner_utils
# ./exec

下降到 affine