Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions LLama.Examples/ExampleRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public class ExampleRunner
{ "Chat Session: Automatic conversation", TalkToYourself.Run },
{ "Chat Session: Chinese characters", ChatChineseGB2312.Run },
{ "Executor: Interactive mode chat", InteractiveModeExecute.Run },
{ "Executor: Llava Interactive mode chat", LlavaInteractiveModeExecute.Run },
{ "Executor: Mtmd Interactive mode chat", MtmdInteractiveModeExecute.Run },
{ "Executor: Instruct mode chat", InstructModeExecute.Run },
{ "Executor: Stateless mode chat", StatelessModeExecute.Run },
{ "Save and Load: chat session", SaveAndLoadSession.Run },
Expand All @@ -33,7 +33,7 @@ public class ExampleRunner
{ "Batched Executor: Save/Load", BatchedExecutorSaveAndLoad.Run },
{ "Batched Executor: Fork", BatchedExecutorFork.Run },
{ "Batched Executor: Rewind", BatchedExecutorRewind.Run },
{ "Batched Executor: LLava", BatchedExecutorLLava.Run },
{ "Batched Executor: Mtmd", BatchedExecutorMtmd.Run },
{ "Batched Executor: BoolQ Benchmark", BatchedExecutorBoolQ.Run },
{ "Batched Executor: Beam Search", BatchedExecutorBeamSearch.Run },
{ "Custom Sampling Pipeline", CustomSampler.Run },
Expand Down
91 changes: 0 additions & 91 deletions LLama.Examples/Examples/BatchedExecutorLLava.cs

This file was deleted.

126 changes: 126 additions & 0 deletions LLama.Examples/Examples/BatchedExecutorMtmd.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
using System;
using System.Collections.Generic;
using System.IO;
using LLama.Batched;
using LLama.Common;
using LLama.Exceptions;
using LLama.Native;
using LLama.Sampling;
using Spectre.Console;

namespace LLama.Examples.Examples;

/// <summary>
/// Demonstrates how to evaluate an image with MTMD helpers and continue generation by
/// manually scheduling batches, similar to what the batched executor does internally.
/// </summary>
public class BatchedExecutorMtmd
{
    /// <summary>
    /// Number of completion tokens to generate after sending the image prompt.
    /// </summary>
    public const int TokenCount = 10000;

    /// <summary>
    /// Loads the base LLM plus its multimodal (MTMD) sidecar weights, queues an image and a
    /// text prompt on a single conversation, then manually drives the decode/sample/prompt
    /// loop until end-of-generation or the <see cref="TokenCount"/> budget is exhausted.
    /// </summary>
    public static async Task Run()
    {
        // Load the base LLM and its clip/mtmd sidecar weights so the executor has everything it needs.
        var parameters = new ModelParams(UserSettings.GetModelPath());
        using var model = await LLamaWeights.LoadFromFileAsync(parameters);
        var mtmdParams = MtmdContextParams.Default(); // reuse llama.cpp defaults for helper settings
        mtmdParams.UseGpu = false;
        var marker = mtmdParams.MediaMarker ?? NativeApi.MtmdDefaultMarker() ?? "<media>";

        using var mtmd = await SafeMtmdWeights.LoadFromFileAsync(UserSettings.GetMMProjPath(), model, mtmdParams); // multimodal helper weights

        using var executor = new BatchedExecutor(model, parameters, mtmd); // drives batched token + chunk evaluation

        // Prepend the media marker so the helper knows where to inject the encoded image tokens.
        var defaultPrompt = "\nUSER: Provide a full description of the image.\nASSISTANT: ";
        var promptSuffix = AnsiConsole.Ask("Prompt (or ENTER for default):", defaultPrompt);
        var promptText = string.Concat(marker, promptSuffix);

        var imagePath = UserSettings.GetImagePath();
        AnsiConsole.Write(new CanvasImage(imagePath));

        var vocab = executor.Context.NativeHandle.ModelHandle.Vocab;

        // Simple low-temperature sampler keeps the demo deterministic-ish.
        // Disposed via `using` so the native sampler chain is released when Run exits.
        using var sampler = new DefaultSamplingPipeline
        {
            Temperature = 0.1f
        };

        // Stream decoded text to the console as soon as tokens arrive.
        var decoder = new StreamingTokenDecoder(executor.Context)
        {
            DecodeSpecialTokens = false
        };

        try
        {
            // Each conversation tracks its own KV cache sequence IDs; dispose it so those
            // sequences are freed when the example finishes.
            using var conversation = executor.Create();
            // enqueue the image so MtmdHelper sees it
            conversation.QueueMedia(imagePath);
            // schedule multimodal prompt
            conversation.Prompt(promptText, addBos: true, special: true);

            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine("Prompt queued with multimodal chunks. Generating response...\n");
            Console.ResetColor();

            var remaining = TokenCount;

            // Run one decode/sampling/prompt cycle – mirrors the batched executor inner loop.
            // Returns false when generation should stop (error, EOS or budget exhausted).
            async Task<bool> ProcessNextAsync()
            {
                var decodeResult = await executor.Infer();
                if (decodeResult == DecodeResult.NoKvSlot) // KV cache exhausted – surface to the user
                {
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine("Insufficient KV cache space for multimodal evaluation.");
                    Console.ResetColor();
                    return false;
                }

                if (decodeResult != DecodeResult.Ok)
                    throw new RuntimeError($"Failed to evaluate batch: {decodeResult}.");

                if (!conversation.RequiresSampling) // another conversation may still be queued
                    return true;

                var token = conversation.Sample(sampler); // pull logits (or -1 for mtmd chunk) and sample
                if (token.IsEndOfGeneration(vocab))
                    return false;

                decoder.Add(token);
                var delta = decoder.Read();
                if (!string.IsNullOrEmpty(delta))
                    Console.Write(delta);

                sampler.Accept(token); // keep sampler state in sync
                conversation.Prompt(token); // feed the accepted token back into the batch
                remaining--;
                return remaining > 0;
            }

            while (remaining > 0 && await ProcessNextAsync()) // continue until EOS or budget is reached
            {
            }

            Console.WriteLine();
        }
        catch (IOException ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine($"Could not load media '{imagePath}': {ex.Message}");
            Console.ResetColor();
        }
        catch (RuntimeError ex)
        {
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine($"MTMD processing failed: {ex.Message}");
            Console.ResetColor();
        }
    }
}
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
using LLama.Common;
using Spectre.Console;
Expand All @@ -6,27 +8,32 @@

namespace LLama.Examples.Examples
{
// This example shows how to chat with LLaVA model with both image and text as input.
// This example shows how to chat with Mtmd model with both image and text as input.
// It uses the interactive executor to inference.
public class LlavaInteractiveModeExecute
public class MtmdInteractiveModeExecute
{
public static async Task Run()
{
string multiModalProj = UserSettings.GetMMProjPath();
string modelPath = UserSettings.GetModelPath();
string modelImage = UserSettings.GetImagePath();
const int maxTokens = 1024;
const int maxTokens = 2048;

var prompt = $"{{{modelImage}}}\nUSER:\nProvide a full description of the image.\nASSISTANT:\n";

var parameters = new ModelParams(modelPath);

var mtmdParameters = MtmdContextParams.Default();
mtmdParameters.UseGpu = false;

using var model = await LLamaWeights.LoadFromFileAsync(parameters);
using var context = model.CreateContext(parameters);

// Llava Init
using var clipModel = await LLavaWeights.LoadFromFileAsync(multiModalProj);


// Mtmd Init
using var clipModel = await SafeMtmdWeights.LoadFromFileAsync(multiModalProj, model, mtmdParameters );

var mediaMarker = mtmdParameters.MediaMarker ?? NativeApi.MtmdDefaultMarker() ?? "<media>";

var ex = new InteractiveExecutor(context, clipModel);

Console.ForegroundColor = ConsoleColor.Yellow;
Expand All @@ -40,38 +47,61 @@ public static async Task Run()
Temperature = 0.1f
},

AntiPrompts = new List<string> { "\nUSER:" },
AntiPrompts = new List<string> { "\nASSISTANT:" },
MaxTokens = maxTokens

};

do
{

// Evaluate if we have images
// Evaluate if we have media
//
var imageMatches = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var imageCount = imageMatches.Count();
var hasImages = imageCount > 0;
var mediaMatches = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var mediaCount = mediaMatches.Count();
var hasMedia = mediaCount > 0;

if (hasImages)
if (hasMedia)
{
var imagePathsWithCurlyBraces = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var imagePaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value).ToList();
var mediaPathsWithCurlyBraces = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Value);
var mediaPaths = Regex.Matches(prompt, "{([^}]*)}").Select(m => m.Groups[1].Value).ToList();

List<byte[]> imageBytes;
var embeds = new List<SafeMtmdEmbed>();
var imageList = new List<byte[]>();
var imageExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
".png",
".jpg",
".jpeg",
".bmp",
".gif",
".webp"
};

try
{
imageBytes = imagePaths.Select(File.ReadAllBytes).ToList();
foreach (var mediaPath in mediaPaths)
{
var extension = Path.GetExtension(mediaPath);
if (!string.IsNullOrEmpty(extension) && imageExtensions.Contains(extension))
{
// Keep the raw image data so the caller can reuse or inspect the images later.
imageList.Add(File.ReadAllBytes(mediaPath));
}

var embed = clipModel.LoadMedia(mediaPath);
embeds.Add(embed);
}
}
catch (IOException exception)
{
Console.ForegroundColor = ConsoleColor.Red;
Console.Write(
$"Could not load your {(imageCount == 1 ? "image" : "images")}:");
$"Could not load your {(mediaCount == 1 ? "media" : "medias")}:");
Console.Write($"{exception.Message}");
Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine("Please try again.");
clipModel.ClearMedia();
break;
}

Expand All @@ -81,19 +111,17 @@ public static async Task Run()
// https://github.com/ggerganov/llama.cpp/discussions/3620
ex.Context.NativeHandle.MemorySequenceRemove( LLamaSeqId.Zero, -1, -1 );

int index = 0;
foreach (var path in imagePathsWithCurlyBraces)
// Replace placeholders with media markers (one marker per image)
foreach (var path in mediaPathsWithCurlyBraces)
{
// First image replace to tag <image, the rest of the images delete the tag
prompt = prompt.Replace(path, index++ == 0 ? "<image>" : "");
prompt = prompt.Replace(path, mediaMarker, StringComparison.Ordinal);
}


Console.ForegroundColor = ConsoleColor.Yellow;
Console.WriteLine($"Here are the images, that are sent to the chat model in addition to your message.");
Console.WriteLine();

foreach (var consoleImage in imageBytes?.Select(bytes => new CanvasImage(bytes)) ?? Array.Empty<CanvasImage>())
foreach (var consoleImage in imageList.Select(image => new CanvasImage(image.ToArray())))
{
consoleImage.MaxWidth = 50;
AnsiConsole.Write(consoleImage);
Expand All @@ -108,10 +136,9 @@ public static async Task Run()

// Initialize Images in executor
//
foreach (var image in imagePaths)
{
ex.Images.Add(await File.ReadAllBytesAsync(image));
}
ex.Embeds.Clear();
foreach (var embed in embeds)
ex.Embeds.Add(embed);
}

Console.ForegroundColor = Color.White;
Expand Down
Loading