Merge branch 'master' into component-model/parsing
hydai committed Nov 3, 2023
2 parents dc72f71 + f7931f2 commit 18cc6ee
Showing 27 changed files with 13,844 additions and 1,329 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -62,6 +62,7 @@ option(WASMEDGE_LINK_TOOLS_STATIC "Statically link the wasmedge and wasmedgec to
 option(WASMEDGE_ENABLE_UB_SANITIZER "Enable undefined behavior sanitizer." OFF)
 set(WASMEDGE_PLUGIN_WASI_NN_BACKEND "" CACHE STRING "Enable WasmEdge Wasi-NN plugin with backends.")
 option(WASMEDGE_PLUGIN_WASI_NN_GGML_LLAMA_BLAS "Enable LLAMA_BLAS in the WASI-NN GGML backend" ON)
+option(WASMEDGE_PLUGIN_WASI_NN_GGML_LLAMA_CUBLAS "Enable LLAMA_CUBLAS in the WASI-NN GGML backend" OFF)
 option(WASMEDGE_PLUGIN_WASI_NN_GGML_LLAMA_METAL "Enable LLAMA_METAL in the WASI-NN GGML backend" OFF)
 # Currently supported WASI-NN backend: "OpenVINO" on Linux x86_64
 option(WASMEDGE_PLUGIN_WASI_CRYPTO "Enable WasmEdge Wasi-crypto plugin." OFF)
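To build with the new option: a minimal configure sketch, assuming the usual WasmEdge CMake flow and the "GGML" value for the WASMEDGE_PLUGIN_WASI_NN_BACKEND cache variable (the CUBLAS option name comes from this diff; the rest of the invocation is illustrative):

    # Configure a CUDA-accelerated WASI-NN GGML plugin build (illustrative).
    cmake -Bbuild -GNinja \
      -DWASMEDGE_PLUGIN_WASI_NN_BACKEND="GGML" \
      -DWASMEDGE_PLUGIN_WASI_NN_GGML_LLAMA_CUBLAS=ON .
    cmake --build build

As cmake/Helper.cmake below shows, turning this option on also drops -Werror from WASMEDGE_CFLAGS, presumably so that warnings from the CUDA-enabled llama.cpp sources do not break the build.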
2 changes: 1 addition & 1 deletion LICENSE.spdx
@@ -39,7 +39,7 @@ PackageFileName: ./plugins/wasi_nn/thirdparty/ggml
 PackageHomePage: https://github.com/ggerganov/llama.cpp
 PackageOriginator: Georgi Gerganov
 PackageLicenseDeclared: MIT
-PackageDownloadLocation: git://github.com/ggerganov/llama.cpp.git@b1309
+PackageDownloadLocation: git://github.com/ggerganov/llama.cpp.git@b1383
 
 ## Relationships
 Relationship: SPDXRef-wasmedge CONTAINS SPDXRef-blake3
9 changes: 7 additions & 2 deletions cmake/Helper.cmake
@@ -35,10 +35,15 @@ else()
   list(APPEND WASMEDGE_CFLAGS
     -Wall
     -Wextra
-    -Werror
-    -Wno-error=pedantic
   )
 
+  if(NOT WASMEDGE_PLUGIN_WASI_NN_GGML_LLAMA_CUBLAS)
+    list(APPEND WASMEDGE_CFLAGS
+      -Werror
+      -Wno-error=pedantic
+    )
+  endif()
+
   if(WASMEDGE_ENABLE_UB_SANITIZER)
     list(APPEND WASMEDGE_CFLAGS -fsanitize=undefined)
   endif()
10 changes: 10 additions & 0 deletions plugins/wasi_nn/CMakeLists.txt
@@ -6,6 +6,16 @@
 set(LLAMA_ALL_WARNINGS OFF)
 set(LLAMA_METAL_NDEBUG ON)
 
+if(WASMEDGE_PLUGIN_WASI_NN_GGML_LLAMA_CUBLAS)
+  message(STATUS "WASI-NN GGML LLAMA backend: Enable LLAMA_CUBLAS")
+  set(LLAMA_CUBLAS ON)
+  # If CUBLAS is ON, then OpenBLAS should be OFF.
+  set(WASMEDGE_PLUGIN_WASI_NN_GGML_LLAMA_BLAS OFF)
+else()
+  message(STATUS "WASI-NN GGML LLAMA backend: Disable LLAMA_CUBLAS")
+  set(LLAMA_CUBLAS OFF)
+endif()
+
 if(WASMEDGE_PLUGIN_WASI_NN_GGML_LLAMA_BLAS)
   message(STATUS "WASI-NN GGML LLAMA backend: Enable LLAMA_BLAS")
   # Default use OpenBLAS
26 changes: 25 additions & 1 deletion plugins/wasi_nn/ggml.cpp
@@ -54,8 +54,9 @@ Expect<ErrNo> load(WasiNNEnvironment &Env, Span<const Span<uint8_t>> Builders,
   gpt_params Params;
   llama_backend_init(Params.numa);
   llama_model_params ModelParams = llama_model_default_params();
+  GraphRef.ModelFilePath = ModelFilePath;
   GraphRef.LlamaModel =
-      llama_load_model_from_file(ModelFilePath.c_str(), ModelParams);
+      llama_load_model_from_file(GraphRef.ModelFilePath.c_str(), ModelParams);
   if (GraphRef.LlamaModel == nullptr) {
     spdlog::error("[WASI-NN] GGML backend: Error: unable to init model."sv);
     Env.NNGraph.pop_back();
@@ -116,6 +117,7 @@ Expect<ErrNo> setInput(WasiNNEnvironment &Env, uint32_t ContextId,
           "[WASI-NN] GGML backend: Unable to retrieve the enable-log option."sv);
       return ErrNo::InvalidArgument;
     }
+    llama_log_set(nullptr, &CxtRef.EnableLog);
   }
   if (Doc.at_key("stream-stdout").error() == simdjson::SUCCESS) {
     auto Err = Doc["stream-stdout"].get<bool>().get(CxtRef.StreamStdout);
@@ -176,6 +178,28 @@ Expect<ErrNo> setInput(WasiNNEnvironment &Env, uint32_t ContextId,
   llama_context_params ContextParams = llama_context_default_params();
   ContextParams.n_ctx = CxtRef.CtxSize;
   ContextParams.n_batch = CxtRef.BatchSize;
+
+  // XXX: Due to the limitation of WASI-NN proposal,
+  // we have no way to pass the metadata before the setInput phase
+  // when we want to do some configurations in the load phase.
+  // That's why we have this hack.
+  {
+    llama_model_params ModelParams = llama_model_default_params();
+    // If the `n_gpu_layers` in `setInput` is different from the
+    // `n_gpu_layers` in `llama_model_params`, we will reload
+    // the model with the new configuration.
+    if (ModelParams.n_gpu_layers != static_cast<int32_t>(CxtRef.NGPULayers)) {
+      ModelParams.n_gpu_layers = CxtRef.NGPULayers;
+      GraphRef.LlamaModel = llama_load_model_from_file(
+          GraphRef.ModelFilePath.c_str(), ModelParams);
+      if (GraphRef.LlamaModel == nullptr) {
+        spdlog::error("[WASI-NN] GGML backend: Error: unable to init model."sv);
+        Env.NNGraph.pop_back();
+        return ErrNo::InvalidArgument;
+      }
+    }
+  }
+
   GraphRef.LlamaContext =
       llama_new_context_with_model(GraphRef.LlamaModel, ContextParams);
 
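As the XXX comment above explains, the backend reads its configuration from the JSON metadata passed during set_input. A hypothetical metadata document that would exercise the reload path might look like the sketch below; "enable-log" and "stream-stdout" appear verbatim in this diff, while the "ctx-size", "batch-size", and "n-gpu-layers" key spellings are assumptions inferred from the CxtRef.CtxSize, CxtRef.BatchSize, and CxtRef.NGPULayers fields:

    {
      "enable-log": false,
      "stream-stdout": true,
      "ctx-size": 512,
      "batch-size": 512,
      "n-gpu-layers": 35
    }

Since llama_model_default_params() presumably leaves n_gpu_layers at 0, any nonzero "n-gpu-layers" value differs from the default and triggers a reload of the model through llama_load_model_from_file with the new layer count.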
5 changes: 3 additions & 2 deletions plugins/wasi_nn/ggml.h
@@ -18,8 +18,9 @@ namespace WasmEdge::Host::WASINN::GGML {
 
 #ifdef WASMEDGE_PLUGIN_WASI_NN_BACKEND_GGML
 struct Graph {
-  llama_model *LlamaModel;
-  llama_context *LlamaContext;
+  llama_model *LlamaModel = nullptr;
+  llama_context *LlamaContext = nullptr;
+  std::string ModelFilePath;
 };
 
 struct Context {
44 changes: 30 additions & 14 deletions plugins/wasi_nn/thirdparty/ggml/CMakeLists.txt
@@ -24,6 +24,12 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer"
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
 # instruction set specific
+if (LLAMA_NATIVE)
+    set(INS_ENB OFF)
+else()
+    set(INS_ENB ON)
+endif()
+
 option(LLAMA_AVX "llama: enable AVX" ON)
 option(LLAMA_AVX2 "llama: enable AVX2" ON)
 option(LLAMA_AVX512 "llama: enable AVX512" OFF)
@@ -89,6 +95,20 @@ if (NOT MSVC)
     endif()
 endif()
 
+if (APPLE AND LLAMA_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        add_compile_definitions(GGML_USE_ACCELERATE)
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
 if (LLAMA_METAL)
     find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
     find_library(METAL_FRAMEWORK Metal REQUIRED)
@@ -335,8 +355,7 @@ endif()
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
-            -Werror=implicit-function-declaration)
+        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
         set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
         set(host_cxx_flags "")
 
@@ -368,7 +387,8 @@ if (LLAMA_ALL_WARNINGS)
     set(c_flags ${c_flags} ${warning_flags})
     set(cxx_flags ${cxx_flags} ${warning_flags})
     add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags} ${host_cxx_flags}>")
+                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+                        "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
 
 endif()
 
@@ -423,9 +443,6 @@ if (NOT MSVC)
     if (LLAMA_GPROF)
         add_compile_options(-pg)
     endif()
-    if (LLAMA_NATIVE)
-        add_compile_options(-march=native)
-    endif()
 endif()
 
 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
@@ -480,6 +497,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
         add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
     endif()
 else()
+    if (LLAMA_NATIVE)
+        add_compile_options(-march=native)
+    endif()
     if (LLAMA_F16C)
         add_compile_options(-mf16c)
 
@@ -576,8 +596,12 @@ wasmedge_add_library(ggml OBJECT
   ggml.h
   ggml-alloc.c
   ggml-alloc.h
+  ggml-backend.c
+  ggml-backend.h
   common.cpp
   common.h
+  sampling.cpp
+  sampling.h
   ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
   ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
   ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
@@ -622,14 +646,6 @@ if (BUILD_SHARED_LIBS)
     endif()
 endif()
 
-# global flags for ggml
-if (NOT WIN32)
-    target_compile_options(ggml
-        PRIVATE
-        -DGGML_USE_K_QUANTS
-    )
-endif()
-
 # disable warnings
 if (NOT WIN32)
     target_compile_options(ggml
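Read together, the LLAMA_NATIVE hunks in this file move -march=native out of the generic non-MSVC block and into the non-MSVC x86 branch, and introduce INS_ENB, presumably consumed elsewhere in the file as the default for the per-instruction-set options. A condensed, illustrative sketch of the resulting logic (not the vendored file verbatim):

    if (LLAMA_NATIVE)
        set(INS_ENB OFF)  # rely on -march=native to select CPU features
    else()
        set(INS_ENB ON)   # enable per-feature flags explicitly
    endif()

    # ... later, in the x86_64/i686/AMD64 branch for non-MSVC compilers:
    if (LLAMA_NATIVE)
        add_compile_options(-march=native)
    endif()

One visible effect is that -march=native is no longer applied on the ARM paths, which pick their flags through the CMAKE_SYSTEM_PROCESSOR checks instead.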
