Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions LLama.Examples/ExampleRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public class ExampleRunner
{ "Chat Session: Automatic conversation", TalkToYourself.Run },
{ "Chat Session: Chinese characters", ChatChineseGB2312.Run },
{ "Executor: Interactive mode chat", InteractiveModeExecute.Run },
{ "Executor: Llava Interactive mode chat", LlavaInteractiveModeExecute.Run },
{ "Executor: Mtmd Interactive mode chat", MtmdInteractiveModeExecute.Run },
{ "Executor: Instruct mode chat", InstructModeExecute.Run },
{ "Executor: Stateless mode chat", StatelessModeExecute.Run },
{ "Save and Load: chat session", SaveAndLoadSession.Run },
Expand All @@ -33,7 +33,7 @@ public class ExampleRunner
{ "Batched Executor: Save/Load", BatchedExecutorSaveAndLoad.Run },
{ "Batched Executor: Fork", BatchedExecutorFork.Run },
{ "Batched Executor: Rewind", BatchedExecutorRewind.Run },
{ "Batched Executor: LLava", BatchedExecutorLLava.Run },
{ "Batched Executor: Mtmd", BatchedExecutorMtmd.Run },
{ "Batched Executor: BoolQ Benchmark", BatchedExecutorBoolQ.Run },
{ "Batched Executor: Beam Search", BatchedExecutorBeamSearch.Run },
{ "Custom Sampling Pipeline", CustomSampler.Run },
Expand Down
91 changes: 0 additions & 91 deletions LLama.Examples/Examples/BatchedExecutorLLava.cs

This file was deleted.

126 changes: 126 additions & 0 deletions LLama.Examples/Examples/BatchedExecutorMtmd.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
using System;
using System.Collections.Generic;
using System.IO;
using LLama.Batched;
using LLama.Common;
using LLama.Exceptions;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// Demonstrates how to evaluate an image with MTMD helpers and continue generation by
/// manually scheduling batches, similar to what the batched executor does internally.
/// </summary>
public class BatchedExecutorMtmd
{
    /// <summary>
    /// Number of completion tokens to generate after sending the image prompt.
    /// </summary>
    public const int TokenCount = 10000;

    /// <summary>
    /// Loads the base LLM plus its multimodal (MTMD) sidecar weights, queues an image and a
    /// text prompt on a single conversation, then manually drives the decode/sample/prompt
    /// loop until end-of-generation or the <see cref="TokenCount"/> budget is exhausted.
    /// </summary>
    public static async Task Run()
    {
        // Load the base LLM and its clip/mtmd sidecar weights so the executor has everything it needs.
        var parameters = new ModelParams(UserSettings.GetModelPath());
        using var model = await LLamaWeights.LoadFromFileAsync(parameters);
        var mtmdParams = MtmdContextParams.Default(); // reuse llama.cpp defaults for helper settings
        mtmdParams.UseGpu = false;
        var marker = mtmdParams.MediaMarker ?? NativeApi.MtmdDefaultMarker() ?? "<media>";

        using var mtmd = await SafeMtmdWeights.LoadFromFileAsync(UserSettings.GetMMProjPath(), model, mtmdParams); // multimodal helper weights

        using var executor = new BatchedExecutor(model, parameters, mtmd); // drives batched token + chunk evaluation

        // Prepend the media marker so the helper knows where to inject the encoded image tokens.
        var defaultPrompt = "\nUSER: Provide a full description of the image.\nASSISTANT: ";
        var promptSuffix = AnsiConsole.Ask("Prompt (or ENTER for default):", defaultPrompt);
        var promptText = string.Concat(marker, promptSuffix);

        var imagePath = UserSettings.GetImagePath();
        AnsiConsole.Write(new CanvasImage(imagePath));

        var vocab = executor.Context.NativeHandle.ModelHandle.Vocab;

        // Simple low-temperature sampler keeps the demo deterministic-ish.
        // Disposed via `using` so the native sampler chain is released when Run exits.
        using var sampler = new DefaultSamplingPipeline
        {
            Temperature = 0.1f
        };

        // Stream decoded text to the console as soon as tokens arrive.
        var decoder = new StreamingTokenDecoder(executor.Context)
        {
            DecodeSpecialTokens = false
        };

        try
        {
            // Each conversation tracks its own KV cache sequence IDs; dispose it so those
            // sequences are freed when the example finishes.
            using var conversation = executor.Create();
            // enqueue the image so MtmdHelper sees it
            conversation.QueueMedia(imagePath);
            // schedule multimodal prompt
            conversation.Prompt(promptText, addBos: true, special: true);

            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine("Prompt queued with multimodal chunks. Generating response...\n");
            Console.ResetColor();

            var remaining = TokenCount;

            // Run one decode/sampling/prompt cycle – mirrors the batched executor inner loop.
            // Returns false when generation should stop (error, EOS or budget exhausted).
            async Task<bool> ProcessNextAsync()
            {
                var decodeResult = await executor.Infer();
                if (decodeResult == DecodeResult.NoKvSlot) // KV cache exhausted – surface to the user
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine("Insufficient KV cache space for multimodal evaluation.");
                    Console.ResetColor();
                    return false;
                }

                if (decodeResult != DecodeResult.Ok)
                    throw new RuntimeError($"Failed to evaluate batch: {decodeResult}.");

                if (!conversation.RequiresSampling) // another conversation may still be queued
                    return true;

                var token = conversation.Sample(sampler); // pull logits (or -1 for mtmd chunk) and sample
                if (token.IsEndOfGeneration(vocab))
                    return false;

                decoder.Add(token);
                var delta = decoder.Read();
                if (!string.IsNullOrEmpty(delta))
                    Console.Write(delta);

                sampler.Accept(token); // keep sampler state in sync
                conversation.Prompt(token); // feed the accepted token back into the batch
                remaining--;
                return remaining > 0;
            }

            while (remaining > 0 && await ProcessNextAsync()) // continue until EOS or budget is reached
            {
            }

            Console.WriteLine();
        }
        catch (IOException ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine($"Could not load media '{imagePath}': {ex.Message}");
            Console.ResetColor();
        }
        catch (RuntimeError ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine($"MTMD processing failed: {ex.Message}");
            Console.ResetColor();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
using LLama.Common;
using Spectre.Console;
Expand All @@ -6,27 +8,32 @@

namespace LLama.Examples.Examples
{
// This example shows how to chat with LLaVA model with both image and text as input.
// This example shows how to chat with Mtmd model with both image and text as input.
// It uses the interactive executor to inference.
public class LlavaInteractiveModeExecute
public class MtmdInteractiveModeExecute
{
public static async Task Run()
{
string multiModalProj = UserSettings.GetMMProjPath();
string modelPath = UserSettings.GetModelPath();
string modelImage = UserSettings.GetImagePath();
const int maxTokens = 1024;
const int maxTokens = 2048;

var prompt = $"{{{modelImage}}}\nUSER:\nProvide a full description of the image.\nASSISTANT:\n";

var parameters = new ModelParams(modelPath);

var mtmdParameters = MtmdContextParams.Default();
mtmdParameters.UseGpu = false;

using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var context = model.CreateContext(parameters);

// Llava Init
using var clipModel = await LLavaWeights.LoadFromFileAsync(multiModalProj);


// Mtmd Init
using var clipModel = await SafeMtmdWeights.LoadFromFileAsync(multiModalProj, model, mtmdParameters );

var mediaMarker = mtmdParameters.MediaMarker ?? NativeApi.MtmdDefaultMarker() ?? "<media>";

var ex = new InteractiveExecutor(context, clipModel);

Console.ForegroundColor = ConsoleColor.Yellow;
Expand All @@ -40,38 +47,61 @@ public static async Task Run()
Temperature = 0.1f
},

AntiPrompts = new List<string> { "\nUSER:" },
AntiPrompts = new List<string> { "\nASSISTANT:" },
MaxTokens = maxTokens

};

do
{

// Evaluate if we have images
// Evaluate if we have media
//
var imageMatches = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var imageCount = imageMatches.Count();
var hasImages = imageCount > 0;
var mediaMatches = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var mediaCount = mediaMatches.Count();
var hasMedia = mediaCount > 0;

if (hasImages)
if (hasMedia)
{
var imagePathsWithCurlyBraces = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var imagePaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value).ToList();
var mediaPathsWithCurlyBraces = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var mediaPaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value).ToList();

List<byte[]> imageBytes;
var embeds = new List<SafeMtmdEmbed>();
var imageList = new List<byte[]>();
var imageExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
".png",
".jpg",
".jpeg",
".bmp",
".gif",
".webp"
};

try
{
imageBytes = imagePaths.Select(File.ReadAllBytes).ToList();
foreach (var mediaPath in mediaPaths)
{
var extension = Path.GetExtension(mediaPath);
if (!string.IsNullOrEmpty(extension) && imageExtensions.Contains(extension))
{
// Keep the raw image data so the caller can reuse or inspect the images later.
imageList.Add(File.ReadAllBytes(mediaPath));
}

var embed = clipModel.LoadMedia(mediaPath);
embeds.Add(embed);
}
}
catch (IOException exception)
{
Console.ForegroundColor = ConsoleColor.Red;
Console.Write(
$"Could not load your {(imageCount == 1 ? "image" : "images")}:");
$"Could not load your {(mediaCount == 1 ? "media" : "medias")}:");
Console.Write($"{exception.Message}");
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("Please try again.");
clipModel.ClearMedia();
break;
}

Expand All @@ -81,19 +111,17 @@ public static async Task Run()
// https://github.com/ggerganov/llama.cpp/discussions/3620
ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 );

int index = 0;
foreach (var path in imagePathsWithCurlyBraces)
// Replace placeholders with media markers (one marker per image)
foreach (var path in mediaPathsWithCurlyBraces)
{
// First image replace to tag <image, the rest of the images delete the tag
prompt = prompt.Replace(path, index++ == 0 ? "<image>" : "");
prompt = prompt.Replace(path, mediaMarker, StringComparison.Ordinal);
}


Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine($"Here are the images, that are sent to the chat model in addition to your message.");
Console.WriteLine();

foreach (var consoleImage in imageBytes?.Select(bytes => new CanvasImage(bytes)) ?? Array.Empty<CanvasImage>())
foreach (var consoleImage in imageList.Select(image => new CanvasImage(image.ToArray())))
{
consoleImage.MaxWidth = 50;
AnsiConsole.Write(consoleImage);
Expand All @@ -108,10 +136,9 @@ public static async Task Run()

// Initialize Images in executor
//
foreach (var image in imagePaths)
{
ex.Images.Add(await File.ReadAllBytesAsync(image));
}
ex.Embeds.Clear();
foreach (var embed in embeds)
ex.Embeds.Add(embed);
}

Console.ForegroundColor = Color.White;
Expand Down
Loading