diff --git a/BotSharp.sln b/BotSharp.sln index e992d26ad..102137084 100644 --- a/BotSharp.sln +++ b/BotSharp.sln @@ -149,6 +149,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.ExcelHandle EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.ImageHandler", "src\Plugins\BotSharp.Plugin.ImageHandler\BotSharp.Plugin.ImageHandler.csproj", "{242F2D93-FCCE-4982-8075-F3052ECCA92C}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BotSharp.Plugin.FuzzySharp", "src\Plugins\BotSharp.Plugin.FuzzySharp\BotSharp.Plugin.FuzzySharp.csproj", "{E7C243B9-E751-B3B4-8F16-95C76CA90D31}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -629,6 +631,14 @@ Global {242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|Any CPU.Build.0 = Release|Any CPU {242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|x64.ActiveCfg = Release|Any CPU {242F2D93-FCCE-4982-8075-F3052ECCA92C}.Release|x64.Build.0 = Release|Any CPU + {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|Any CPU.Build.0 = Debug|Any CPU + {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|x64.ActiveCfg = Debug|Any CPU + {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Debug|x64.Build.0 = Debug|Any CPU + {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|Any CPU.ActiveCfg = Release|Any CPU + {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|Any CPU.Build.0 = Release|Any CPU + {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|x64.ActiveCfg = Release|Any CPU + {E7C243B9-E751-B3B4-8F16-95C76CA90D31}.Release|x64.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -701,6 +711,7 @@ Global {0428DEAA-E4FE-4259-A6D8-6EDD1A9D0702} = {51AFE054-AE99-497D-A593-69BAEFB5106F} {FC63C875-E880-D8BB-B8B5-978AB7B62983} = {51AFE054-AE99-497D-A593-69BAEFB5106F} {242F2D93-FCCE-4982-8075-F3052ECCA92C} = 
{51AFE054-AE99-497D-A593-69BAEFB5106F} + {E7C243B9-E751-B3B4-8F16-95C76CA90D31} = {51AFE054-AE99-497D-A593-69BAEFB5106F} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {A9969D89-C98B-40A5-A12B-FC87E55B3A19} diff --git a/Directory.Packages.props b/Directory.Packages.props index 83b31d0e2..c0899132a 100644 --- a/Directory.Packages.props +++ b/Directory.Packages.props @@ -5,6 +5,8 @@ true + + @@ -18,6 +20,7 @@ + diff --git a/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseCollection.cs b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseCollection.cs new file mode 100644 index 000000000..71823515d --- /dev/null +++ b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseCollection.cs @@ -0,0 +1,7 @@ +namespace BotSharp.Abstraction.Knowledges; + +public interface IPhraseCollection +{ + Task>> LoadVocabularyAsync(); + Task> LoadDomainTermMappingAsync(); +} diff --git a/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseService.cs b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseService.cs new file mode 100644 index 000000000..6165f67da --- /dev/null +++ b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPhraseService.cs @@ -0,0 +1,12 @@ +namespace BotSharp.Abstraction.Knowledges; + +public interface IPhraseService +{ + /// + /// Search similar phrases in the collection + /// + /// + /// + /// + Task> SearchPhrasesAsync(string collection, string term); +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/BotSharp.Plugin.FuzzySharp.csproj b/src/Plugins/BotSharp.Plugin.FuzzySharp/BotSharp.Plugin.FuzzySharp.csproj new file mode 100644 index 000000000..ec9dfde32 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/BotSharp.Plugin.FuzzySharp.csproj @@ -0,0 +1,25 @@ + + + + $(TargetFramework) + enable + $(LangVersion) + $(BotSharpVersion) + $(GeneratePackageOnBuild) + $(GenerateDocumentationFile) + $(SolutionDir)packages + + + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/MatchReason.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/MatchReason.cs new file mode 100644 index 000000000..369d3fb9c --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/MatchReason.cs @@ -0,0 +1,21 @@ + +namespace BotSharp.Plugin.FuzzySharp.Constants +{ + public static class MatchReason + { + /// + /// Token matched a domain term mapping (e.g., HVAC -> Air Conditioning/Heating) + /// + public const string DomainTermMapping = "domain_term_mapping"; + + /// + /// Token exactly matched a vocabulary entry + /// + public const string ExactMatch = "exact_match"; + + /// + /// Token was flagged as a potential typo and a correction was suggested + /// + public const string TypoCorrection = "typo_correction"; + } +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/TextConstants.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/TextConstants.cs new file mode 100644 index 000000000..8f160ae5f --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Constants/TextConstants.cs @@ -0,0 +1,30 @@ + +namespace BotSharp.Plugin.FuzzySharp.Constants +{ + public static class TextConstants + { + /// + /// Characters that need to be separated during tokenization (by adding spaces before and after) + /// Includes: parentheses, brackets, braces, punctuation marks, special symbols, etc. + /// This ensures "(IH)" is split into "(", "IH", ")" + /// + public static readonly char[] SeparatorChars = + { + // Parentheses and brackets + '(', ')', '[', ']', '{', '}', + // Punctuation marks + ',', '.', ';', ':', '!', '?', + // Special symbols + '=', '@', '#', '$', '%', '^', '&', '*', '+', '-', '\\', '|', '<', '>', '~', '`' + }; + + /// + /// Whitespace characters used as token separators during tokenization. + /// Includes: space, tab, newline, and carriage return. 
+ /// + public static readonly char[] TokenSeparators = + { + ' ', '\t', '\n', '\r' + }; + } +} \ No newline at end of file diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Controllers/FuzzySharpController.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Controllers/FuzzySharpController.cs new file mode 100644 index 000000000..dc18c73d7 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Controllers/FuzzySharpController.cs @@ -0,0 +1,61 @@ +using BotSharp.Abstraction.FuzzSharp; +using BotSharp.Abstraction.FuzzSharp.Arguments; +using BotSharp.Abstraction.FuzzSharp.Models; +using Microsoft.AspNetCore.Http; +using Microsoft.AspNetCore.Mvc; +using Microsoft.Extensions.Logging; + +namespace BotSharp.Plugin.FuzzySharp.Controllers +{ + [ApiController] + public class FuzzySharpController : ControllerBase + { + private readonly ITextAnalysisService _textAnalysisService; + private readonly ILogger _logger; + + public FuzzySharpController( + ITextAnalysisService textAnalysisService, + ILogger logger) + { + _textAnalysisService = textAnalysisService; + _logger = logger; + } + + /// + /// Analyze text for typos and entities using domain-specific vocabulary. 
+ /// + /// Returns: + /// - `original`: Original input text + /// - `tokens`: Tokenized text (only included if `include_tokens=true`) + /// - `flagged`: List of flagged items (each with `match_type`): + /// - `domain_term_mapping` - Business abbreviations (confidence=1.0) + /// - `exact_match` - Exact vocabulary matches (confidence=1.0) + /// - `typo_correction` - Spelling corrections (confidence less than 1.0) + /// - `processing_time_ms`: Processing time in milliseconds + /// + /// Text analysis request + /// Text analysis response + [HttpPost("fuzzy-sharp/analyze-text")] + [ProducesResponseType(typeof(TextAnalysisResponse), StatusCodes.Status200OK)] + [ProducesResponseType(StatusCodes.Status400BadRequest)] + [ProducesResponseType(StatusCodes.Status500InternalServerError)] + public async Task AnalyzeText([FromBody] TextAnalysisRequest request) + { + try + { + if (string.IsNullOrWhiteSpace(request.Text)) + { + return BadRequest(new { error = "Text is required" }); + } + + var result = await _textAnalysisService.AnalyzeTextAsync(request); + return Ok(result); + } + catch (Exception ex) + { + _logger.LogError(ex, "Error analyzing text"); + return StatusCode(500, new { error = $"Error analyzing text: {ex.Message}" }); + } + } + } +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Arguments/TextAnalysisRequest.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Arguments/TextAnalysisRequest.cs new file mode 100644 index 000000000..79fbd9894 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Arguments/TextAnalysisRequest.cs @@ -0,0 +1,53 @@ +using System.ComponentModel.DataAnnotations; +using System.Text.Json.Serialization; + +namespace BotSharp.Abstraction.FuzzSharp.Arguments; + +public class TextAnalysisRequest +{ + /// + /// Text to analyze + /// + [Required] + [JsonPropertyName("text")] + public string Text { get; set; } = string.Empty; + + /// + /// Folder path containing CSV files (will read all .csv files from the folder or its 
'output' subfolder) + /// + [JsonPropertyName("vocabulary_folder_name")] + public string? VocabularyFolderName { get; set; } + + /// + /// Domain term mapping CSV file + /// + [JsonPropertyName("domain_term_mapping_file")] + public string? DomainTermMappingFile { get; set; } + + /// + /// Min score for suggestions (0.0-1.0) + /// + [JsonPropertyName("cutoff")] + [Range(0.0, 1.0)] + public double Cutoff { get; set; } = 0.82; + + /// + /// Max candidates per domain (1-20) + /// + [JsonPropertyName("topk")] + [Range(1, 20)] + public int TopK { get; set; } = 5; + + /// + /// Max n-gram size (1-10) + /// + [JsonPropertyName("max_ngram")] + [Range(1, 10)] + public int MaxNgram { get; set; } = 5; + + /// + /// Include tokens field in response (default: False) + /// + [JsonPropertyName("include_tokens")] + public bool IncludeTokens { get; set; } = false; +} \ No newline at end of file diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/INgramProcessor.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/INgramProcessor.cs new file mode 100644 index 000000000..c2d91b0e4 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/INgramProcessor.cs @@ -0,0 +1,27 @@ +using BotSharp.Abstraction.FuzzSharp.Models; + +namespace BotSharp.Abstraction.FuzzSharp +{ + public interface INgramProcessor + { + /// + /// Process tokens and generate all possible n-gram match results + /// + /// List of tokens to process + /// Vocabulary (domain type -> vocabulary set) + /// Domain term mapping + /// Lookup table (lowercase vocabulary -> (canonical form, domain type list)) + /// Maximum n-gram length + /// Minimum confidence threshold for fuzzy matching + /// Maximum number of matches to return + /// List of flagged items + List ProcessNgrams( + List tokens, + Dictionary> vocabulary, + Dictionary domainTermMapping, + Dictionary DomainTypes)> lookup, + int maxNgram, + double cutoff, + int topK); + } +} diff --git 
a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/IResultProcessor.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/IResultProcessor.cs new file mode 100644 index 000000000..b406f9348 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/IResultProcessor.cs @@ -0,0 +1,18 @@ +using BotSharp.Abstraction.FuzzSharp.Models; + +namespace BotSharp.Abstraction.FuzzSharp +{ + /// + /// Result processor interface + /// Responsible for processing match results, including deduplication and sorting + /// + public interface IResultProcessor + { + /// + /// Process a list of flagged items, removing overlapping duplicates and sorting + /// + /// List of flagged items to process + /// Processed list of flagged items (deduplicated and sorted) + List ProcessResults(List flagged); + } +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/ITextAnalysisService.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/ITextAnalysisService.cs new file mode 100644 index 000000000..4add4f62b --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/ITextAnalysisService.cs @@ -0,0 +1,13 @@ +using BotSharp.Abstraction.FuzzSharp.Arguments; +using BotSharp.Abstraction.FuzzSharp.Models; + +namespace BotSharp.Abstraction.FuzzSharp +{ + public interface ITextAnalysisService + { + /// + /// Analyze text for typos and entities using domain-specific vocabulary + /// + Task AnalyzeTextAsync(TextAnalysisRequest request); + } +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/ITokenMatcher.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/ITokenMatcher.cs new file mode 100644 index 000000000..5e0b04ac5 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/ITokenMatcher.cs @@ -0,0 +1,40 @@ +namespace BotSharp.Abstraction.FuzzSharp +{ + public interface ITokenMatcher + { + /// + /// Try to match a content span and return a match result + /// + /// The matching context containing all necessary information + /// Match result if found, 
null otherwise + MatchResult? TryMatch(MatchContext context); + + /// + /// Priority of this matcher (higher priority matchers are tried first) + /// + int Priority { get; } + } + + /// + /// Context information for token matching + /// + public record MatchContext( + string ContentSpan, + string ContentLow, + int StartIndex, + int NgramLength, + Dictionary> Vocabulary, + Dictionary DomainTermMapping, + Dictionary DomainTypes)> Lookup, + double Cutoff, + int TopK); + + /// + /// Result of a token match + /// + public record MatchResult( + string CanonicalForm, + List DomainTypes, + string MatchType, + double Confidence); +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/FlaggedItem.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/FlaggedItem.cs new file mode 100644 index 000000000..8dc547d48 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/FlaggedItem.cs @@ -0,0 +1,51 @@ +using System.Text.Json.Serialization; + +namespace BotSharp.Abstraction.FuzzSharp.Models +{ + public class FlaggedItem + { + /// + /// Token index in the original text + /// + [JsonPropertyName("index")] + public int Index { get; set; } + + /// + /// Original token text + /// + [JsonPropertyName("token")] + public string Token { get; set; } = string.Empty; + + /// + /// Domain types where this token was found (e.g., ['client_Profile.Name', 'data_ServiceType.Name']) + /// + [JsonPropertyName("domain_types")] + public List DomainTypes { get; set; } = new(); + + /// + /// Type of match: 'domain_term_mapping' (business abbreviations, confidence=1.0) | + /// 'exact_match' (vocabulary matches, confidence=1.0) | + /// 'typo_correction' (spelling corrections, confidence less than 1.0) + /// + [JsonPropertyName("match_type")] + public string MatchType { get; set; } = string.Empty; + + /// + /// Canonical form or suggested correction + /// + [JsonPropertyName("canonical_form")] + public string CanonicalForm { get; set; } = string.Empty; + + /// + 
/// Confidence score (0.0-1.0, where 1.0 is exact match) + /// + [JsonPropertyName("confidence")] + public double Confidence { get; set; } + + /// + /// N-gram length (number of tokens in this match). Internal field, not included in JSON output. + /// + [JsonIgnore] + public int NgramLength { get; set; } + } +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/TextAnalysisResponse.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/TextAnalysisResponse.cs new file mode 100644 index 000000000..131a53b49 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzSharp/Models/TextAnalysisResponse.cs @@ -0,0 +1,31 @@ +using System.Text.Json.Serialization; + +namespace BotSharp.Abstraction.FuzzSharp.Models; + +public class TextAnalysisResponse +{ + /// + /// Original text + /// + [JsonPropertyName("original")] + public string Original { get; set; } = string.Empty; + + /// + /// Tokenized text (only included if include_tokens=true) + /// + [JsonPropertyName("tokens")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public List? 
Tokens { get; set; } + + /// + /// Flagged items (filter by 'match_type' field: 'domain_term_mapping', 'exact_match', or 'typo_correction') + /// + [JsonPropertyName("flagged")] + public List Flagged { get; set; } = new(); + + /// + /// Processing time in milliseconds + /// + [JsonPropertyName("processing_time_ms")] + public double ProcessingTimeMs { get; set; } +} \ No newline at end of file diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzySharpPlugin.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzySharpPlugin.cs new file mode 100644 index 000000000..412ddfa9c --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/FuzzySharpPlugin.cs @@ -0,0 +1,29 @@ +using BotSharp.Abstraction.FuzzSharp; +using BotSharp.Abstraction.Knowledges; +using BotSharp.Abstraction.Plugins; +using BotSharp.Plugin.FuzzySharp.Services; +using BotSharp.Plugin.FuzzySharp.Services.Matching; +using BotSharp.Plugin.FuzzySharp.Services.Processors; +using Microsoft.Extensions.Configuration; +using Microsoft.Extensions.DependencyInjection; + +namespace BotSharp.Plugin.FuzzySharp; + +public class FuzzySharpPlugin : IBotSharpPlugin +{ + public string Id => "379e6f7b-c58c-458b-b8cd-0374e5830711"; + public string Name => "Fuzzy Sharp"; + public string Description => "Analyze text for typos and entities using domain-specific vocabulary."; + public string IconUrl => "https://cdn-icons-png.flaticon.com/512/9592/9592995.png"; + + public void RegisterDI(IServiceCollection services, IConfiguration config) + { + services.AddScoped(); + services.AddScoped(); + services.AddScoped(); + services.AddScoped(); + services.AddScoped(); + services.AddScoped(); + services.AddScoped(); + } +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/CsvPhraseCollectionLoader.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/CsvPhraseCollectionLoader.cs new file mode 100644 index 000000000..eb6d4243b --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/CsvPhraseCollectionLoader.cs @@ -0,0 
+1,188 @@ +using BotSharp.Abstraction.FuzzSharp; +using BotSharp.Abstraction.Knowledges; +using BotSharp.Core.Infrastructures; +using CsvHelper; +using CsvHelper.Configuration; +using Microsoft.Extensions.Logging; +using System.Globalization; +using System.IO; + +namespace BotSharp.Plugin.FuzzySharp.Services; + +public class CsvPhraseCollectionLoader : IPhraseCollection +{ + private readonly ILogger _logger; + + public CsvPhraseCollectionLoader(ILogger logger) + { + _logger = logger; + } + + [SharpCache(60)] + public async Task>> LoadVocabularyAsync() + { + string foldername = ""; + var vocabulary = new Dictionary>(); + + if (string.IsNullOrEmpty(foldername)) + { + return vocabulary; + } + + // Load CSV files from the folder + var csvFileDict = await LoadCsvFilesFromFolderAsync(foldername); + if (csvFileDict.Count == 0) + { + return vocabulary; + } + + // Load each CSV file + foreach (var (domainType, filePath) in csvFileDict) + { + try + { + var terms = await LoadCsvFileAsync(filePath); + vocabulary[domainType] = terms; + _logger.LogInformation($"Loaded {terms.Count} terms for domain type '{domainType}' from {filePath}"); + } + catch (Exception ex) + { + _logger.LogError(ex, $"Error loading CSV file for domain type '{domainType}': {filePath}"); + } + } + + return vocabulary; + } + + [SharpCache(60)] + public async Task> LoadDomainTermMappingAsync() + { + string filename = ""; + var result = new Dictionary(); + if (string.IsNullOrWhiteSpace(filename)) + { + return result; + } + + var searchFolder = Path.Combine(AppContext.BaseDirectory, "data", "plugins", "fuzzySharp"); + var filePath = Path.Combine(searchFolder, filename); + + if (string.IsNullOrEmpty(filePath) || !File.Exists(filePath)) + { + return result; + } + + try + { + using var reader = new StreamReader(filePath); + using var csv = new CsvReader(reader, CreateCsvConfig()); + + await csv.ReadAsync(); + csv.ReadHeader(); + + if (!HasRequiredColumns(csv)) + { + _logger.LogWarning("Domain term mapping file 
missing required columns: {FilePath}", filePath); + return result; + } + + while (await csv.ReadAsync()) + { + var term = csv.GetField("term") ?? string.Empty; + var dbPath = csv.GetField("dbPath") ?? string.Empty; + var canonicalForm = csv.GetField("canonical_form") ?? string.Empty; + + if (term.Length == 0 || dbPath.Length == 0 || canonicalForm.Length == 0) + { + _logger.LogWarning( + "Missing column(s) in CSV at row {Row}: term={Term}, dbPath={DbPath}, canonical_form={CanonicalForm}", + csv.Parser.RawRow, + term ?? "", + dbPath ?? "", + canonicalForm ?? ""); + continue; + } + + var key = term.ToLowerInvariant(); + result[key] = (dbPath, canonicalForm); + } + + _logger.LogInformation("Loaded domain term mapping from {FilePath}: {Count} terms", filePath, result.Count); + } + catch (Exception ex) + { + _logger.LogError(ex, "Error loading domain term mapping file: {FilePath}", filePath); + } + + return result; + } + + private async Task> LoadCsvFileAsync(string filePath) + { + var terms = new HashSet(StringComparer.OrdinalIgnoreCase); + + if (!File.Exists(filePath)) + { + _logger.LogWarning($"CSV file does not exist: {filePath}"); + return terms; + } + + using var reader = new StreamReader(filePath); + using var csv = new CsvReader(reader, new CsvConfiguration(CultureInfo.InvariantCulture) + { + HasHeaderRecord = false // No header in the CSV files + }); + + while (await csv.ReadAsync()) + { + // Read the first column (assuming it contains the terms) + var term = csv.GetField(0); + if (!string.IsNullOrWhiteSpace(term)) + { + terms.Add(term.Trim()); + } + } + + _logger.LogInformation($"Loaded {terms.Count} terms from {Path.GetFileName(filePath)}"); + return terms; + } + + private async Task> LoadCsvFilesFromFolderAsync(string folderName) + { + var csvFileDict = new Dictionary(); + var searchFolder = Path.Combine(AppContext.BaseDirectory, "data", "plugins", "fuzzySharp", folderName); + if (!Directory.Exists(searchFolder)) + { + _logger.LogWarning($"Folder does not 
exist: {searchFolder}"); + return csvFileDict; + } + + var csvFiles = Directory.GetFiles(searchFolder, "*.csv"); + foreach (var file in csvFiles) + { + var fileName = Path.GetFileNameWithoutExtension(file); + csvFileDict[fileName] = file; + } + + _logger.LogInformation($"Loaded {csvFileDict.Count} CSV files from {searchFolder}"); + return await Task.FromResult(csvFileDict); + } + + private static CsvConfiguration CreateCsvConfig() + { + return new CsvConfiguration(CultureInfo.InvariantCulture) + { + HasHeaderRecord = true, + DetectColumnCountChanges = true, + MissingFieldFound = null + }; + } + + private static bool HasRequiredColumns(CsvReader csv) + { + return csv.HeaderRecord is { Length: > 0 } headers + && headers.Contains("term") + && headers.Contains("dbPath") + && headers.Contains("canonical_form"); + } +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/DomainTermMatcher.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/DomainTermMatcher.cs new file mode 100644 index 000000000..e8813013d --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/DomainTermMatcher.cs @@ -0,0 +1,24 @@ +using BotSharp.Abstraction.FuzzSharp; +using BotSharp.Plugin.FuzzySharp.Constants; + +namespace BotSharp.Plugin.FuzzySharp.Services.Matching +{ + public class DomainTermMatcher : ITokenMatcher + { + public int Priority => 3; // Highest priority + + public MatchResult? 
TryMatch(MatchContext context) + { + if (context.DomainTermMapping.TryGetValue(context.ContentLow, out var match)) + { + return new MatchResult( + CanonicalForm: match.CanonicalForm, + DomainTypes: new List { match.DbPath }, + MatchType: MatchReason.DomainTermMapping, + Confidence: 1.0); + } + + return null; + } + } +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/ExactMatcher.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/ExactMatcher.cs new file mode 100644 index 000000000..f404f47b8 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/ExactMatcher.cs @@ -0,0 +1,24 @@ +using BotSharp.Abstraction.FuzzSharp; +using BotSharp.Plugin.FuzzySharp.Constants; + +namespace BotSharp.Plugin.FuzzySharp.Services.Matching +{ + public class ExactMatcher : ITokenMatcher + { + public int Priority => 2; // Second highest priority + + public MatchResult? TryMatch(MatchContext context) + { + if (context.Lookup.TryGetValue(context.ContentLow, out var match)) + { + return new MatchResult( + CanonicalForm: match.CanonicalForm, + DomainTypes: match.DomainTypes, + MatchType: MatchReason.ExactMatch, + Confidence: 1.0); + } + + return null; + } + } +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/FuzzyMatcher.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/FuzzyMatcher.cs new file mode 100644 index 000000000..c6b3ba477 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Matching/FuzzyMatcher.cs @@ -0,0 +1,82 @@ +using BotSharp.Abstraction.FuzzSharp; +using System.Text.RegularExpressions; +using FuzzySharp; +using FuzzySharp.SimilarityRatio; +using FuzzySharp.SimilarityRatio.Scorer.StrategySensitive; +using BotSharp.Plugin.FuzzySharp.Constants; + +namespace BotSharp.Plugin.FuzzySharp.Services.Matching +{ + public class FuzzyMatcher : ITokenMatcher + { + public int Priority => 1; // Lowest priority + + public MatchResult? 
TryMatch(MatchContext context) + { + var match = CheckTypoCorrection(context.ContentSpan, context.Lookup, context.Cutoff); + if (match == null) + { + return null; + } + + var (canonicalForm, domainTypes, confidence) = match.Value; + return new MatchResult( + CanonicalForm: canonicalForm, + DomainTypes: domainTypes, + MatchType: MatchReason.TypoCorrection, + Confidence: confidence); + } + + /// + /// Check typo correction using fuzzy matching + /// + private (string CanonicalForm, List DomainTypes, double Confidence)? CheckTypoCorrection( + string contentSpan, + Dictionary DomainTypes)> lookup, + double cutoff) + { + // Convert cutoff to 0-100 scale for FuzzySharp + var scoreCutoff = (int)(cutoff * 100); + + // Get all candidates from lookup + var candidates = lookup.Keys.ToList(); + + // Find best match using ExtractOne + var scorer = ScorerCache.Get(); + var result = Process.ExtractOne( + contentSpan, + candidates, + candidate => Normalize(candidate), // Preprocessor function + scorer, + scoreCutoff // Score cutoff + ); + + if (result == null) + { + return null; + } + + // Get the canonical form and domain types from lookup + var match = lookup[result.Value]; + return (match.CanonicalForm, match.DomainTypes, Math.Round(result.Score / 100.0, 3)); + } + + /// + /// Normalize text for fuzzy matching comparison + /// - Replaces all non-word characters (except apostrophes) with spaces + /// - Converts to lowercase + /// - Collapses multiple spaces into single space + /// - Trims leading/trailing whitespace + /// Example: "Test-Value (123)" → "test value 123" + /// + /// Text to normalize + /// Normalized text suitable for fuzzy matching + private string Normalize(string text) + { + // Replace non-word characters (except apostrophes) with spaces + var normalized = Regex.Replace(text, @"[^\w']+", " ", RegexOptions.IgnoreCase); + // Convert to lowercase, collapse multiple spaces, and trim + return Regex.Replace(normalized.ToLowerInvariant(), @"\s+", " ").Trim(); + } + } 
+} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/NgramProcessor.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/NgramProcessor.cs new file mode 100644 index 000000000..d28829a16 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/NgramProcessor.cs @@ -0,0 +1,134 @@ +using BotSharp.Abstraction.FuzzSharp; +using BotSharp.Abstraction.FuzzSharp.Models; +using BotSharp.Plugin.FuzzySharp.Constants; +using BotSharp.Plugin.FuzzySharp.Utils; + +namespace BotSharp.Plugin.FuzzySharp.Services.Processors +{ + public class NgramProcessor : INgramProcessor + { + private readonly List _matchers; + + public NgramProcessor(IEnumerable matchers) + { + // Sort matchers by priority (highest first) + _matchers = matchers.OrderByDescending(m => m.Priority).ToList(); + } + + public List ProcessNgrams( + List tokens, + Dictionary> vocabulary, + Dictionary domainTermMapping, + Dictionary DomainTypes)> lookup, + int maxNgram, + double cutoff, + int topK) + { + var flagged = new List(); + + // Process n-grams from largest to smallest + for (int n = maxNgram; n >= 1; n--) + { + for (int i = 0; i <= tokens.Count - n; i++) + { + var item = ProcessSingleNgram( + tokens, + i, + n, + vocabulary, + domainTermMapping, + lookup, + cutoff, + topK); + + if (item != null) + { + flagged.Add(item); + } + } + } + + return flagged; + } + + /// + /// Process a single n-gram at the specified position + /// + private FlaggedItem? 
ProcessSingleNgram( + List tokens, + int startIdx, + int n, + Dictionary> vocabulary, + Dictionary domainTermMapping, + Dictionary DomainTypes)> lookup, + double cutoff, + int topK) + { + // Extract content span + var (contentSpan, spanTokens, contentIndices) = ExtractContentSpan(tokens, startIdx, n); + if (string.IsNullOrWhiteSpace(contentSpan)) + { + return null; + } + + var contentLow = contentSpan.ToLowerInvariant(); + + // Try matching in priority order using matchers + var context = new MatchContext( + contentSpan, + contentLow, + startIdx, + n, + vocabulary, + domainTermMapping, + lookup, + cutoff, + topK); + + foreach (var matcher in _matchers) + { + var matchResult = matcher.TryMatch(context); + if (matchResult != null) + { + return CreateFlaggedItem(matchResult, startIdx, contentSpan, n); + } + } + + return null; + } + + /// + /// Create a FlaggedItem from a MatchResult + /// + private FlaggedItem CreateFlaggedItem( + MatchResult matchResult, + int startIndex, + string contentSpan, + int ngramLength) + { + return new FlaggedItem + { + Index = startIndex, + Token = contentSpan, + DomainTypes = matchResult.DomainTypes, + MatchType = matchResult.MatchType, + CanonicalForm = matchResult.CanonicalForm, + Confidence = matchResult.Confidence, + NgramLength = ngramLength + }; + } + + /// + /// Extract content span + /// + private (string ContentSpan, List Tokens, List ContentIndices) ExtractContentSpan( + List tokens, + int startIdx, + int n) + { + var span = tokens.Skip(startIdx).Take(n).ToList(); + var indices = Enumerable.Range(startIdx, n).ToList(); + return (string.Join(" ", span), span, indices); + } + } +} diff --git a/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/ResultProcessor.cs b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/ResultProcessor.cs new file mode 100644 index 000000000..2238b6153 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/ResultProcessor.cs @@ -0,0 +1,103 @@ +using 
// NOTE(review): reconstructed from a mangled diff hunk covering two new files
// (ResultProcessor.cs and TextAnalysisService.cs). Generic type arguments were
// stripped by HTML-escaping and are inferred from call sites — TODO confirm,
// in particular that the domain term mapping is Dictionary<string, string>.

// ---- src/Plugins/BotSharp.Plugin.FuzzySharp/Services/Processors/ResultProcessor.cs ----
using BotSharp.Abstraction.FuzzSharp;
using BotSharp.Abstraction.FuzzSharp.Models;
using BotSharp.Plugin.FuzzySharp.Constants;

namespace BotSharp.Plugin.FuzzySharp.Services.Processors
{
    public class ResultProcessor : IResultProcessor
    {
        /// <summary>
        /// Deduplicate overlapping detections, then sort by confidence (descending)
        /// and match type (ascending). Matches Python's _sort_and_format_results.
        /// </summary>
        public List<FlaggedItem> ProcessResults(List<FlaggedItem> flagged)
        {
            // Remove overlapping duplicates first so the sort sees one item per span.
            var deduped = RemoveOverlappingDuplicates(flagged);

            // Ordinal comparer makes the tie-break culture-independent; match types
            // are ASCII constants, so the observable order does not change.
            return deduped
                .OrderByDescending(f => f.Confidence)
                .ThenBy(f => f.MatchType, StringComparer.Ordinal)
                .ToList();
        }

        /// <summary>
        /// Remove overlapping detections with the same canonical form.
        /// When multiple detections overlap and share canonical_form, keep the best one:
        /// 1. Prefer domain_term_mapping over exact_match over typo_correction (matcher priority)
        /// 2. Highest confidence
        /// 3. Shortest n-gram length
        /// </summary>
        private List<FlaggedItem> RemoveOverlappingDuplicates(List<FlaggedItem> flagged)
        {
            var deduped = new List<FlaggedItem>();
            var skipIndices = new HashSet<int>();

            for (int i = 0; i < flagged.Count; i++)
            {
                if (skipIndices.Contains(i))
                {
                    continue;
                }

                var item = flagged[i];
                var itemRange = (item.Index, item.Index + item.NgramLength);

                // Gather every later, still-unclaimed detection that overlaps this one
                // and resolves to the same canonical_form (regardless of match_type).
                var overlappingGroup = new List<FlaggedItem> { item };
                for (int j = i + 1; j < flagged.Count; j++)
                {
                    if (skipIndices.Contains(j))
                    {
                        continue;
                    }

                    var other = flagged[j];
                    if (item.CanonicalForm == other.CanonicalForm)
                    {
                        var otherRange = (other.Index, other.Index + other.NgramLength);
                        if (RangesOverlap(itemRange, otherRange))
                        {
                            overlappingGroup.Add(other);
                            skipIndices.Add(j);
                        }
                    }
                }

                // Keep the best item from the overlapping group:
                // match-type priority, then confidence, then shortest n-gram.
                var bestItem = overlappingGroup
                    .OrderByDescending(x => GetMatchTypePriority(x.MatchType))
                    .ThenByDescending(x => x.Confidence)
                    .ThenBy(x => x.NgramLength)
                    .First();
                deduped.Add(bestItem);
            }

            return deduped;
        }

        /// <summary>
        /// Get priority value for a match type (higher is better).
        /// Matches the priority order in matchers: domain > exact > fuzzy.
        /// </summary>
        private int GetMatchTypePriority(string matchType)
        {
            return matchType switch
            {
                MatchReason.DomainTermMapping => 3, // Highest priority
                MatchReason.ExactMatch => 2,        // Second priority
                MatchReason.TypoCorrection => 1,    // Lowest priority
                _ => 0                              // Unknown types get lowest priority
            };
        }

        /// <summary>
        /// True when the two half-open token ranges [start, end) intersect.
        /// </summary>
        private bool RangesOverlap((int start, int end) range1, (int start, int end) range2)
        {
            return range1.start < range2.end && range2.start < range1.end;
        }
    }
}

// ---- src/Plugins/BotSharp.Plugin.FuzzySharp/Services/TextAnalysisService.cs ----
using BotSharp.Abstraction.FuzzSharp;
using BotSharp.Abstraction.FuzzSharp.Arguments;
using BotSharp.Abstraction.FuzzSharp.Models;
using BotSharp.Abstraction.Knowledges;
using BotSharp.Plugin.FuzzySharp.Utils;
using Microsoft.Extensions.Logging;
using System.Diagnostics;

namespace BotSharp.Plugin.FuzzySharp.Services;

public class TextAnalysisService : ITextAnalysisService
{
    private readonly ILogger<TextAnalysisService> _logger;
    private readonly IEnumerable<IPhraseCollection> _phraseLoaderServices;
    private readonly INgramProcessor _ngramProcessor;
    private readonly IResultProcessor _resultProcessor;

    public TextAnalysisService(
        ILogger<TextAnalysisService> logger,
        IEnumerable<IPhraseCollection> phraseLoaderServices,
        INgramProcessor ngramProcessor,
        IResultProcessor resultProcessor)
    {
        _logger = logger;
        _phraseLoaderServices = phraseLoaderServices;
        _ngramProcessor = ngramProcessor;
        _resultProcessor = resultProcessor;
    }

    /// <summary>
    /// Analyze text for typos and entities using domain-specific vocabulary.
    /// </summary>
    public async Task<TextAnalysisResponse> AnalyzeTextAsync(TextAnalysisRequest request)
    {
        var stopwatch = Stopwatch.StartNew();
        try
        {
            // Tokenize the text
            var tokens = TextTokenizer.Tokenize(request.Text);

            // Load and merge vocabulary and domain term mapping from all providers.
            var vocabulary = await LoadAllVocabularyAsync();
            var domainTermMapping = await LoadAllDomainTermMappingAsync();

            // Analyze text
            var flagged = AnalyzeTokens(tokens, vocabulary, domainTermMapping, request);

            stopwatch.Stop();

            var response = new TextAnalysisResponse
            {
                Original = request.Text,
                Flagged = flagged,
                ProcessingTimeMs = Math.Round(stopwatch.Elapsed.TotalMilliseconds, 2)
            };

            if (request.IncludeTokens)
            {
                response.Tokens = tokens;
            }

            // Structured logging template instead of string interpolation (CA2254).
            _logger.LogInformation(
                "Text analysis completed in {ProcessingTimeMs}ms | Text length: {TextLength} chars | Flagged items: {FlaggedCount}",
                response.ProcessingTimeMs,
                request.Text.Length,
                flagged.Count);

            return response;
        }
        catch (Exception ex)
        {
            stopwatch.Stop();
            _logger.LogError(ex, "Error analyzing text after {ElapsedMs}ms", stopwatch.Elapsed.TotalMilliseconds);
            throw;
        }
    }

    /// <summary>
    /// Union the per-provider vocabularies: domain type -> set of known terms.
    /// </summary>
    public async Task<Dictionary<string, HashSet<string>>> LoadAllVocabularyAsync()
    {
        var results = await Task.WhenAll(_phraseLoaderServices.Select(c => c.LoadVocabularyAsync()));
        var merged = new Dictionary<string, HashSet<string>>();

        foreach (var dict in results)
        {
            foreach (var kvp in dict)
            {
                if (!merged.TryGetValue(kvp.Key, out var set))
                    merged[kvp.Key] = new HashSet<string>(kvp.Value);
                else
                    set.UnionWith(kvp.Value);
            }
        }

        return merged;
    }

    /// <summary>
    /// Merge the per-provider domain term mappings.
    /// </summary>
    public async Task<Dictionary<string, string>> LoadAllDomainTermMappingAsync()
    {
        var results = await Task.WhenAll(_phraseLoaderServices.Select(c => c.LoadDomainTermMappingAsync()));
        var merged = new Dictionary<string, string>();

        foreach (var dict in results)
        {
            foreach (var kvp in dict)
                merged[kvp.Key] = kvp.Value; // later entries override earlier ones
        }

        return merged;
    }

    /// <summary>
    /// Analyze tokens for typos and entities.
    /// </summary>
    private List<FlaggedItem> AnalyzeTokens(
        List<string> tokens,
        Dictionary<string, HashSet<string>> vocabulary,
        Dictionary<string, string> domainTermMapping,
        TextAnalysisRequest request)
    {
        // Build lookup table for O(1) exact match lookups (matching Python's build_lookup)
        var lookup = BuildLookup(vocabulary);

        // Process n-grams and find matches
        var flagged = _ngramProcessor.ProcessNgrams(
            tokens,
            vocabulary,
            domainTermMapping,
            lookup,
            request.MaxNgram,
            request.Cutoff,
            request.TopK);

        // Process results: deduplicate and sort
        return _resultProcessor.ProcessResults(flagged);
    }

    /// <summary>
    /// Build a lookup dictionary mapping lowercase terms to their canonical form and
    /// domain types. Performance optimization: built once up front instead of scanning
    /// every domain per query. Matches Python's build_lookup() function.
    /// </summary>
    private Dictionary<string, (string CanonicalForm, List<string> DomainTypes)> BuildLookup(
        Dictionary<string, HashSet<string>> vocabulary)
    {
        var lookup = new Dictionary<string, (string CanonicalForm, List<string> DomainTypes)>();

        foreach (var (domainType, terms) in vocabulary)
        {
            foreach (var term in terms)
            {
                var key = term.ToLowerInvariant();
                if (lookup.TryGetValue(key, out var existing))
                {
                    // Term already exists - record the extra domain type once.
                    if (!existing.DomainTypes.Contains(domainType))
                    {
                        existing.DomainTypes.Add(domainType);
                    }
                }
                else
                {
                    // New term - canonical form is the first spelling encountered.
                    lookup[key] = (term, new List<string> { domainType });
                }
            }
        }

        return lookup;
    }
}

// ---- src/Plugins/BotSharp.Plugin.FuzzySharp/Using.cs (head of hunk; continues below) ----
global using System;
global using System.Collections.Generic;
global using System.Linq;
global using System.Net.Http;
// NOTE(review): reconstructed from a mangled diff hunk — generic type arguments and
// XML doc tags were stripped by HTML-escaping and have been restored below.

// ---- src/Plugins/BotSharp.Plugin.FuzzySharp/Using.cs (tail of hunk) ----
global using System.Net.Mime;
global using System.Text;
global using System.Text.Json;
global using System.Threading;
global using System.Threading.Tasks;

// ---- src/Plugins/BotSharp.Plugin.FuzzySharp/Utils/TextTokenizer.cs ----
using BotSharp.Plugin.FuzzySharp.Constants;

namespace BotSharp.Plugin.FuzzySharp.Utils
{
    /// <summary>
    /// Simple whitespace tokenizer that first isolates separator punctuation
    /// (as declared in TextConstants.SeparatorChars) with surrounding spaces.
    /// </summary>
    public static class TextTokenizer
    {
        /// <summary>
        /// Preprocess text: add spaces before and after characters that need to be
        /// separated, so the subsequent whitespace split isolates them correctly.
        /// Example: "(IH)" -> " ( IH ) " -> ["(", "IH", ")"]
        /// </summary>
        /// <param name="text">Text to preprocess</param>
        /// <returns>Preprocessed text</returns>
        public static string PreprocessText(string text)
        {
            if (string.IsNullOrWhiteSpace(text))
            {
                return text;
            }

            // Worst case each separator char gains two spaces; 2x length is a cheap bound.
            var result = new StringBuilder(text.Length * 2);

            foreach (var ch in text)
            {
                // If it's a character that needs to be separated, pad it with spaces.
                if (TextConstants.SeparatorChars.Contains(ch))
                {
                    result.Append(' ');
                    result.Append(ch);
                    result.Append(' ');
                }
                else
                {
                    result.Append(ch);
                }
            }

            return result.ToString();
        }

        /// <summary>
        /// Simple whitespace tokenization.
        /// Should be called after preprocessing text with <see cref="PreprocessText"/>.
        /// </summary>
        /// <param name="text">Text to tokenize</param>
        /// <returns>List of tokens</returns>
        public static List<string> SimpleTokenize(string text)
        {
            return text.Split(TextConstants.TokenSeparators, StringSplitOptions.RemoveEmptyEntries).ToList();
        }

        /// <summary>
        /// Complete tokenization flow: preprocessing + tokenization.
        /// This is the recommended entry point.
        /// </summary>
        /// <param name="text">Text to tokenize</param>
        /// <returns>List of tokens</returns>
        public static List<string> Tokenize(string text)
        {
            var preprocessed = PreprocessText(text);
            return SimpleTokenize(preprocessed);
        }
    }
}
a/src/WebStarter/WebStarter.csproj b/src/WebStarter/WebStarter.csproj index 5a7c6eb7b..082ac578e 100644 --- a/src/WebStarter/WebStarter.csproj +++ b/src/WebStarter/WebStarter.csproj @@ -37,6 +37,7 @@ + diff --git a/src/WebStarter/appsettings.json b/src/WebStarter/appsettings.json index 57dd1c50d..2ca33f390 100644 --- a/src/WebStarter/appsettings.json +++ b/src/WebStarter/appsettings.json @@ -846,7 +846,8 @@ "BotSharp.Plugin.ExcelHandler", "BotSharp.Plugin.SqlDriver", "BotSharp.Plugin.TencentCos", - "BotSharp.Plugin.PythonInterpreter" + "BotSharp.Plugin.PythonInterpreter", + "BotSharp.Plugin.FuzzySharp" ] } }