diff --git a/TensorStack.Audio.Windows/AudioInput.cs b/TensorStack.Audio.Windows/AudioInput.cs
index 98c4ed6..776347f 100644
--- a/TensorStack.Audio.Windows/AudioInput.cs
+++ b/TensorStack.Audio.Windows/AudioInput.cs
@@ -12,7 +12,7 @@ namespace TensorStack.Audio
///
public class AudioInput : AudioInputBase
{
- private readonly string _sourceFile;
+ private string _sourceFile;
///
/// Initializes a new instance of the class.
@@ -21,6 +21,13 @@ public class AudioInput : AudioInputBase
public AudioInput(string filename, string audioCodec = "pcm_s16le", int sampleRate = 16000, int channels = 1)
: this(filename, AudioManager.LoadTensor(filename, audioCodec, sampleRate, channels)) { }
+ /// <summary>
+ /// Initializes a new instance of the <see cref="AudioInput"/> class from an in-memory tensor; this constructor does not set a source file.
+ /// </summary>
+ /// <param name="audioTensor">The audio tensor.</param>
+ public AudioInput(AudioTensor audioTensor)
+ : base(audioTensor) { }
+
///
/// Initializes a new instance of the class.
///
@@ -44,6 +51,9 @@ protected AudioInput(string filename, AudioTensor audioTensor)
/// The filename.
public override void Save(string filename)
{
+ if (string.IsNullOrEmpty(_sourceFile)) // no source file recorded yet (e.g. created from a tensor only)
+ _sourceFile = filename; // adopt the first save target; assigned before the save call below runs
+
AudioManager.SaveAudio(filename, this);
}
@@ -55,6 +65,9 @@ public override void Save(string filename)
/// The cancellation token.
public override async Task SaveAsync(string filename, CancellationToken cancellationToken = default)
{
+ if (string.IsNullOrEmpty(_sourceFile)) // no source file recorded yet (e.g. created from a tensor only)
+ _sourceFile = filename; // adopt the first save target; assigned before the awaited save below runs
+
await AudioManager.SaveAudioAync(filename, this, cancellationToken);
}
diff --git a/TensorStack.TextGeneration/Pipelines/Supertonic/README.md b/TensorStack.TextGeneration/Pipelines/Supertonic/README.md
new file mode 100644
index 0000000..8344fbb
--- /dev/null
+++ b/TensorStack.TextGeneration/Pipelines/Supertonic/README.md
@@ -0,0 +1,19 @@
+# Supertonic TTS
+https://github.com/supertone-inc/supertonic
+
+
+```csharp
+// [model] https://huggingface.co/TensorStack/Supertonic-onnx
+
+var provider = Provider.GetProvider(GraphOptimizationLevel.ORT_ENABLE_ALL);
+var modelPath = "M:\\Models\\Supertonic-onnx";
+var pipeline = SupertonicPipeline.Create(modelPath, provider);
+var options = new SupertonicOptions
+{
+ TextInput = "On a quiet morning in the old town, a clockmaker named Ellis unlocked his tiny shop",
+ VoiceStyle = "Female1"
+};
+
+var generateResult = await pipeline.RunAsync(options);
+AudioManager.SaveAudio("Output.wav", generateResult);
+```
diff --git a/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicConfig.cs b/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicConfig.cs
new file mode 100644
index 0000000..a3c668b
--- /dev/null
+++ b/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicConfig.cs
@@ -0,0 +1,20 @@
+using TensorStack.Common;
+
+namespace TensorStack.TextGeneration.Pipelines.Supertonic
+{
+ public record SupertonicConfig
+ {
+ public int SampleRate { get; init; } = 44100; // multiplied by durations in the pipeline to get sample counts
+ public int BaseChunkSize { get; init; } = 512; // chunk size = BaseChunkSize * ChunkCompressFactor (see PrepareLatents)
+ public int LatentDim { get; init; } = 24; // latent channel count = LatentDim * ChunkCompressFactor
+ public int ChunkCompressFactor { get; init; } = 6; // multiplier applied to both BaseChunkSize and LatentDim
+ public int TextEmbedSize { get; init; } = 256; // second dimension of the text encoder output
+ public int ScaleFactor { get; init; } = 3072; // decoder output length = ScaleFactor * latent length
+ public string IndexerPath { get; init; } // JSON file read by SupertonicProcessor.LoadIndexer
+ public string VoiceStylePath { get; init; } // directory whose *.json files are loaded as voice styles
+ public ModelConfig PredictorConfig { get; init; } // duration predictor model
+ public ModelConfig EncoderConfig { get; init; } // text encoder model
+ public ModelConfig EstimatorConfig { get; init; } // vector estimator model
+ public ModelConfig DecoderConfig { get; init; } // decoder model
+ }
+}
diff --git a/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicOptions.cs b/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicOptions.cs
new file mode 100644
index 0000000..df447c2
--- /dev/null
+++ b/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicOptions.cs
@@ -0,0 +1,13 @@
+using TensorStack.Common.Pipeline;
+
+namespace TensorStack.TextGeneration.Pipelines.Supertonic
+{
+ public record SupertonicOptions : IRunOptions
+ {
+ public string TextInput { get; set; } // text to synthesize; split into chunks by the processor
+ public string VoiceStyle { get; set; } // voice style name; null, empty or unknown names fall back to the first loaded style
+ public int Steps { get; set; } = 5; // number of estimator iterations per chunk
+ public float Speed { get; set; } = 1f; // the predicted duration is divided by this value
+ public float SilenceDuration { get; set; } = 0.3f; // gap between chunks; multiplied by SampleRate to get silence samples
+ }
+}
diff --git a/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicPipeline.cs b/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicPipeline.cs
new file mode 100644
index 0000000..909c44c
--- /dev/null
+++ b/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicPipeline.cs
@@ -0,0 +1,304 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Runtime.InteropServices;
+using System.Threading;
+using System.Threading.Tasks;
+using TensorStack.Common;
+using TensorStack.Common.Pipeline;
+using TensorStack.Common.Tensor;
+using TensorStack.TextGeneration.Common;
+
+namespace TensorStack.TextGeneration.Pipelines.Supertonic
+{
+ /// <summary>
+ /// Supertonic TTS Pipeline.
+ /// </summary>
+ public class SupertonicPipeline : IPipeline
+ {
+ private readonly Random _random;
+
+ /// <summary>
+ /// Initializes a new instance of the <see cref="SupertonicPipeline"/> class, creating the processor and one <see cref="ModelSession"/> per model config.
+ /// </summary>
+ /// <param name="configuration">The configuration.</param>
+ public SupertonicPipeline(SupertonicConfig configuration)
+ {
+ _random = new Random(); // unseeded; used by PrepareLatents to create the initial latents
+ Configuration = configuration;
+ Processor = new SupertonicProcessor(configuration.IndexerPath, configuration.VoiceStylePath); // reads the indexer and voice style files; throws if either yields nothing
+ Prediction = new ModelSession(configuration.PredictorConfig);
+ Encoder = new ModelSession(configuration.EncoderConfig);
+ Estimator = new ModelSession(configuration.EstimatorConfig);
+ Decoder = new ModelSession(configuration.DecoderConfig);
+ }
+
+ public SupertonicConfig Configuration { get; init; } // configuration passed to the constructor
+ public SupertonicProcessor Processor { get; init; } // text chunking/indexing and voice style lookup
+ public ModelSession Prediction { get; init; } // session built from PredictorConfig (duration prediction)
+ public ModelSession Encoder { get; init; } // session built from EncoderConfig (text encoding)
+ public ModelSession Estimator { get; init; } // session built from EstimatorConfig (latent estimation)
+ public ModelSession Decoder { get; init; } // session built from DecoderConfig (latents to audio)
+ public IEnumerable VoiceStyles => Processor.VoiceStyles; // names of the voice styles loaded by the processor
+
+
+ /// <summary>
+ /// Loads the pipeline. This is a no-op that returns an already-completed task.
+ /// </summary>
+ /// <param name="cancellationToken">The cancellation token (not used).</param>
+ public Task LoadAsync(CancellationToken cancellationToken = default)
+ {
+ // Nothing is loaded here: each model session is loaded by the inference step that first needs it
+ return Task.CompletedTask;
+ }
+
+
+ /// <summary>
+ /// Unloads the pipeline by unloading all four model sessions and awaiting them together.
+ /// </summary>
+ /// <param name="cancellationToken">The cancellation token (not forwarded to the sessions).</param>
+ public async Task UnloadAsync(CancellationToken cancellationToken = default)
+ {
+ await Task.WhenAll
+ (
+ Prediction.UnloadAsync(),
+ Encoder.UnloadAsync(),
+ Estimator.UnloadAsync(),
+ Decoder.UnloadAsync()
+ );
+ }
+
+
+ /// <summary>
+ /// Synthesizes audio for <c>options.TextInput</c>: runs inference once per text chunk and joins the chunks with silence.
+ /// </summary>
+ /// <param name="options">The options.</param>
+ /// <param name="progressCallback">The progress callback (currently never invoked).</param>
+ /// <param name="cancellationToken">The cancellation token, forwarded to every inference step.</param>
+ public async Task RunAsync(SupertonicOptions options, IProgress progressCallback = null, CancellationToken cancellationToken = default)
+ {
+ var totalDuration = 0.0f;
+ var audioBuffer = new List();
+ var voiceStyle = Processor.GetVoiceStyle(options.VoiceStyle);
+ var silenceLength = (int)(options.SilenceDuration * Configuration.SampleRate);
+ var silenceBuffer = new float[silenceLength];
+
+ // Process text, one chunk at a time
+ foreach (var textIds in Processor.GetTextIds(options.TextInput))
+ {
+ var result = await RunInferenceAsync(textIds, voiceStyle, options.Steps, options.Speed, cancellationToken); // token now flows through so the run can be cancelled
+ if (audioBuffer.Count == 0)
+ {
+ audioBuffer.AddRange(result.Audio.Memory.Span);
+ totalDuration = result.Duration;
+ }
+ else
+ {
+ audioBuffer.AddRange(silenceBuffer); // gap between consecutive chunks
+ audioBuffer.AddRange(result.Audio.Memory.Span);
+ totalDuration += result.Duration + options.SilenceDuration;
+ }
+ }
+
+ var audioSpan = CollectionsMarshal.AsSpan(audioBuffer);
+ var audioLength = (int)(Configuration.SampleRate * totalDuration); // output length comes from the accumulated duration, not the buffer size
+ var audioTensor = new Tensor([1, audioLength]);
+ audioSpan[..Math.Min(audioLength, audioSpan.Length)].CopyTo(audioTensor.Memory.Span); // copy at most audioLength samples; extra buffered samples are dropped
+ return audioTensor.AsAudioTensor(Configuration.SampleRate);
+ }
+
+
+ /// <summary>
+ /// Runs the full model chain for one text chunk: duration prediction, text encoding, iterative latent estimation, then decoding.
+ /// </summary>
+ /// <param name="textIds">The text ids.</param>
+ /// <param name="style">The style.</param>
+ /// <param name="totalStep">The total step.</param>
+ /// <param name="speed">The speed.</param>
+ /// <param name="cancellationToken">The cancellation token.</param>
+ private async Task RunInferenceAsync(Tensor textIds, VoiceStyle style, int totalStep, float speed = 1.05f, CancellationToken cancellationToken = default)
+ {
+ var predictionResult = await PredictAsync(textIds, style.Dropout, cancellationToken);
+ var duration = predictionResult.Memory.Span[0] / speed; // first predictor output value, scaled down by speed
+ var encoderResult = await EncodeAsync(textIds, style.Global, cancellationToken);
+ var latents = PrepareLatents(duration); // initial latents sized from the duration
+ for (int step = 0; step < totalStep; step++)
+ {
+ latents = await EstimateAsync(latents, encoderResult, style.Global, step, totalStep, cancellationToken); // each pass feeds its output into the next
+ }
+ var decoderResult = await DecodeAsync(latents, cancellationToken);
+ return new InferenceResult(decoderResult, duration);
+ }
+
+
+ /// <summary>
+ /// Run duration prediction model
+ /// </summary>
+ /// <param name="textIds">The text ids.</param>
+ /// <param name="styleDropout">The style dropout.</param>
+ /// <param name="cancellationToken">The cancellation token.</param>
+ private async Task> PredictAsync(Tensor textIds, Tensor styleDropout, CancellationToken cancellationToken = default)
+ {
+ var metadata = await Prediction.LoadAsync(); // loads the session here, on demand; the token is not passed to LoadAsync
+ var textMask = new Tensor([1, 1, textIds.Dimensions[1]], 1f); // mask sized to the text length; presumably filled with 1f - confirm Tensor ctor
+ using (var parameters = new ModelParameters(metadata, cancellationToken))
+ {
+ parameters.AddInput(textIds);
+ parameters.AddInput(styleDropout);
+ parameters.AddInput(textMask);
+ parameters.AddOutput([1]); // single-element output; the caller reads element 0 as the duration
+ using (var result = await Prediction.RunInferenceAsync(parameters))
+ {
+ return result[0].ToTensor();
+ }
+ }
+ }
+
+ /// <summary>
+ /// Run text encoder model
+ /// </summary>
+ /// <param name="textIds">The text ids.</param>
+ /// <param name="styleGlobal">The style global.</param>
+ /// <param name="cancellationToken">The cancellation token.</param>
+ private async Task> EncodeAsync(Tensor textIds, Tensor styleGlobal, CancellationToken cancellationToken = default)
+ {
+ var metadata = await Encoder.LoadAsync(); // loads the session here, on demand; the token is not passed to LoadAsync
+ var textMask = new Tensor([1, 1, textIds.Dimensions[1]], 1f); // mask sized to the text length; presumably filled with 1f - confirm Tensor ctor
+ using (var parameters = new ModelParameters(metadata, cancellationToken))
+ {
+ parameters.AddInput(textIds);
+ parameters.AddInput(styleGlobal);
+ parameters.AddInput(textMask);
+ parameters.AddOutput([1, Configuration.TextEmbedSize, textIds.Dimensions[1]]); // output shape: [1, TextEmbedSize, text length]
+ using (var result = await Encoder.RunInferenceAsync(parameters))
+ {
+ return result[0].ToTensor();
+ }
+ }
+ }
+
+ /// <summary>
+ /// Run vector estimate model
+ /// </summary>
+ /// <param name="latents">The latents.</param>
+ /// <param name="textEmbeds">The text embeds.</param>
+ /// <param name="styleGlobal">The style global.</param>
+ /// <param name="step">The step.</param>
+ /// <param name="steps">The steps.</param>
+ /// <param name="cancellationToken">The cancellation token.</param>
+ private async Task> EstimateAsync(Tensor latents, Tensor textEmbeds, Tensor styleGlobal, int step, int steps, CancellationToken cancellationToken = default)
+ {
+ var metadata = await Estimator.LoadAsync(); // loads the session here, on demand; the token is not passed to LoadAsync
+ var textMask = new Tensor([1, 1, textEmbeds.Dimensions[2]], 1f); // sized to the last dimension of the text embeddings
+ var latentMask = new Tensor([1, 1, latents.Dimensions[2]], 1f); // sized to the last dimension of the latents
+ using (var parameters = new ModelParameters(metadata, cancellationToken))
+ {
+ parameters.AddInput(latents);
+ parameters.AddInput(textEmbeds);
+ parameters.AddInput(styleGlobal);
+ parameters.AddInput(latentMask);
+ parameters.AddInput(textMask);
+ parameters.AddScalarInput(step); // current iteration index
+ parameters.AddScalarInput(steps); // total number of iterations
+ parameters.AddOutput(latents.Dimensions); // output has the same shape as the input latents
+ using (var vectorEstResult = await Estimator.RunInferenceAsync(parameters))
+ {
+ return vectorEstResult[0].ToTensor();
+ }
+ }
+ }
+
+
+ /// <summary>
+ /// Run decoder model
+ /// </summary>
+ /// <param name="latents">The latents.</param>
+ /// <param name="cancellationToken">The cancellation token.</param>
+ private async Task> DecodeAsync(Tensor latents, CancellationToken cancellationToken = default)
+ {
+ var metadata = await Decoder.LoadAsync(); // loads the session here, on demand; the token is not passed to LoadAsync
+ var bufferSize = Configuration.ScaleFactor * latents.Dimensions[2]; // output length: ScaleFactor values per latent position
+ using (var parameters = new ModelParameters(metadata, cancellationToken))
+ {
+ parameters.AddInput(latents);
+ parameters.AddOutput([1, bufferSize]);
+ using (var result = await Decoder.RunInferenceAsync(parameters))
+ {
+ return result[0].ToTensor();
+ }
+ }
+ }
+
+
+ /// <summary>
+ /// Prepares the latents: a tensor from <c>_random</c> shaped [1, LatentDim * ChunkCompressFactor, latent length].
+ /// </summary>
+ /// <param name="duration">The duration.</param>
+ private Tensor PrepareLatents(float duration)
+ {
+ var audioLength = duration * Configuration.SampleRate; // float sample count for the requested duration
+ var chunkSize = Configuration.BaseChunkSize * Configuration.ChunkCompressFactor;
+ var latentLen = (int)((audioLength + chunkSize - 1) / chunkSize); // float division, then truncated by the int cast
+ var latentDim = Configuration.LatentDim * Configuration.ChunkCompressFactor;
+ var latents = _random.NextTensor([1, latentDim, latentLen]); // presumably random noise - confirm against the NextTensor extension
+ return latents;
+ }
+
+
+ /// <summary>
+ /// Disposes the four model sessions (Prediction, Encoder, Estimator, Decoder).
+ /// </summary>
+ public void Dispose()
+ {
+ Prediction.Dispose();
+ Encoder.Dispose();
+ Estimator.Dispose();
+ Decoder.Dispose();
+ }
+
+
+ /// <summary>
+ /// Creates the SupertonicPipeline from a model directory, using fixed file names for the indexer, voice styles and the four ONNX models.
+ /// </summary>
+ /// <param name="modelPath">The model path.</param>
+ /// <param name="provider">The provider.</param>
+ /// <returns>SupertonicPipeline.</returns>
+ public static SupertonicPipeline Create(string modelPath, ExecutionProvider provider)
+ {
+ var config = new SupertonicConfig
+ {
+ LatentDim = 24, // the six numeric values below repeat the SupertonicConfig defaults
+ SampleRate = 44100,
+ ScaleFactor = 3072,
+ BaseChunkSize = 512,
+ TextEmbedSize = 256,
+ ChunkCompressFactor = 6,
+ VoiceStylePath = Path.Combine(modelPath, "voice_styles"),
+ IndexerPath = Path.Combine(modelPath, "unicode_indexer.json"),
+ PredictorConfig = new ModelConfig
+ {
+ ExecutionProvider = provider,
+ Path = Path.Combine(modelPath, "duration_predictor.onnx")
+ },
+ EncoderConfig = new ModelConfig
+ {
+ ExecutionProvider = provider,
+ Path = Path.Combine(modelPath, "text_encoder.onnx")
+ },
+ EstimatorConfig = new ModelConfig
+ {
+ ExecutionProvider = provider,
+ Path = Path.Combine(modelPath, "vector_estimator.onnx")
+ },
+ DecoderConfig = new ModelConfig
+ {
+ ExecutionProvider = provider,
+ Path = Path.Combine(modelPath, "vocoder.onnx"),
+ }
+ };
+ return new SupertonicPipeline(config);
+ }
+
+ private record InferenceResult(Tensor Audio, float Duration); // decoder output for one chunk plus its speed-adjusted predicted duration
+ }
+}
diff --git a/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicProcessor.cs b/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicProcessor.cs
new file mode 100644
index 0000000..feeb5c7
--- /dev/null
+++ b/TensorStack.TextGeneration/Pipelines/Supertonic/SupertonicProcessor.cs
@@ -0,0 +1,403 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+using System.Text.RegularExpressions;
+using TensorStack.Common.Tensor;
+
+namespace TensorStack.TextGeneration.Pipelines.Supertonic
+{
+ /// <summary>
+ /// Handles input text and voice styles for Supertonic.
+ /// </summary>
+ public class SupertonicProcessor
+ {
+ private readonly IReadOnlyDictionary _indexer;
+ private readonly IReadOnlyDictionary _voiceStyles;
+
+ /// <summary>
+ /// Initializes a new instance of the <see cref="SupertonicProcessor"/> class, loading the indexer and voice styles up front.
+ /// </summary>
+ /// <param name="indexerPath">The indexer path.</param>
+ /// <param name="voiceStylePath">The voice style path.</param>
+ /// <exception cref="System.Exception">No valid indexer file found</exception>
+ /// <exception cref="System.Exception">No valid voice files found</exception>
+ public SupertonicProcessor(string indexerPath, string voiceStylePath)
+ {
+ _indexer = LoadIndexer(indexerPath);
+ _voiceStyles = LoadVoiceStyles(voiceStylePath);
+ if (_indexer.Count == 0) // the indexer file deserialized to an empty array
+ throw new Exception("No valid indexer file found");
+ if (_voiceStyles.Count == 0) // no *.json style file could be loaded
+ throw new Exception("No valid voice files found");
+ }
+
+ /// <summary>
+ /// Gets the voice styles (the names of all loaded styles).
+ /// </summary>
+ public IEnumerable VoiceStyles => _voiceStyles.Keys;
+
+
+ /// <summary>
+ /// Gets the TextIds in processable chunks.
+ /// </summary>
+ /// <param name="textInput">The text input.</param>
+ /// <returns>One tensor of text ids per chunk returned by ChunkText, in order.</returns>
+ public List> GetTextIds(string textInput)
+ {
+ var textInputChunks = new List>();
+ foreach (var textInputChunk in ChunkText(textInput))
+ {
+ textInputChunks.Add(GetTextIdsInternal(textInputChunk)); // each chunk is preprocessed and indexed on its own
+ }
+ return textInputChunks;
+ }
+
+
+ /// <summary>
+ /// Gets the voice style by name, falling back to the first loaded style when the name is null, empty or unknown.
+ /// </summary>
+ /// <param name="styleName">Name of the style.</param>
+ /// <returns>VoiceStyle.</returns>
+ public VoiceStyle GetVoiceStyle(string styleName)
+ {
+ if (!string.IsNullOrEmpty(styleName) && _voiceStyles.TryGetValue(styleName, out var voiceStyle)) // single lookup instead of ContainsKey + indexer
+ return voiceStyle;
+
+ return _voiceStyles.Values.First(); // the constructor guarantees at least one style exists
+ }
+
+
+ /// <summary>
+ /// Gets the text ids for one chunk: preprocesses the text, then maps each character value through the indexer.
+ /// </summary>
+ /// <param name="textInput">The text input.</param>
+ private Tensor GetTextIdsInternal(string textInput)
+ {
+ var processedText = PreprocessText(textInput);
+ var unicodeVals = TextToUnicodeValues(processedText); // one value per char, so same length as processedText
+ var textIds = new Tensor([1, processedText.Length]);
+ for (int j = 0; j < unicodeVals.Length; j++)
+ {
+ if (_indexer.TryGetValue(unicodeVals[j], out long val))
+ {
+ textIds[0, j] = val;
+ } // values missing from the indexer are not written; presumably they stay at the tensor's initial value - confirm
+ }
+ return textIds;
+ }
+
+
+ ///
+ /// Split text input into processable chunks.
+ ///
+ /// The text.
+ /// The maximum length.
+ private static List ChunkText(string text, int maxLen = 300)
+ {
+ var chunks = new List();
+
+ // Split by paragraph (two or more newlines)
+ var paragraphRegex = new Regex(@"\n\s*\n+");
+ var paragraphs = paragraphRegex.Split(text.Trim())
+ .Select(p => p.Trim())
+ .Where(p => !string.IsNullOrEmpty(p))
+ .ToList();
+
+ // Split by sentence boundaries, excluding abbreviations
+ var sentenceRegex = new Regex(@"(?
+ /// <summary>Converts each char (UTF-16 code unit) of the text to its integer value.
+ /// </summary>
+ /// <param name="text">The text.</param>
+ private static int[] TextToUnicodeValues(string text)
+ {
+ return [.. text.Select(c => (int)c)]; // one entry per char, so a surrogate pair yields two values
+ }
+
+
+ /// <summary>
+ /// Removes the emojis: returns the text without any code point for which IsEmoji is true.
+ /// </summary>
+ /// <param name="text">The text.</param>
+ private static string RemoveEmojis(string text)
+ {
+ var sb = new StringBuilder(text.Length);
+
+ for (int i = 0; i < text.Length; i++)
+ {
+ int codePoint;
+
+ // Surrogate pair? Combine both chars into one code point
+ if (char.IsHighSurrogate(text[i]) && i + 1 < text.Length && char.IsLowSurrogate(text[i + 1]))
+ {
+ codePoint = char.ConvertToUtf32(text[i], text[i + 1]);
+ i++; // skip low surrogate
+ }
+ else
+ {
+ codePoint = text[i]; // single char, including any unpaired surrogate
+ }
+
+ if (IsEmoji(codePoint))
+ continue; // skip
+
+ // re-append character (as a surrogate pair when above the BMP)
+ if (codePoint > 0xFFFF)
+ sb.Append(char.ConvertFromUtf32(codePoint));
+ else
+ sb.Append((char)codePoint);
+ }
+
+ return sb.ToString();
+ }
+
+
+ /// <summary>
+ /// Determines whether the specified code point is emoji, i.e. falls inside one of the fixed ranges below.
+ /// </summary>
+ /// <param name="codePoint">The code point.</param>
+ private static bool IsEmoji(int codePoint)
+ {
+ // Range check only; the ranges also contain non-emoji symbols
+ return
+ (codePoint >= 0x1F000 && codePoint <= 0x1FAFF) || // primary emoji planes
+ (codePoint >= 0x2600 && codePoint <= 0x27BF) || // dingbats & misc symbols
+ (codePoint >= 0xFE00 && codePoint <= 0xFE0F) || // variation selectors
+ (codePoint >= 0x1F1E6 && codePoint <= 0x1F1FF); // flags (regional indicators); already inside the first range
+ }
+
+
+ /// <summary>
+ /// Preprocesses the text: normalizes it, strips emojis and selected symbols, tidies punctuation spacing and ensures closing punctuation.
+ /// </summary>
+ /// <param name="text">The text.</param>
+ /// <returns>System.String.</returns>
+ private static string PreprocessText(string text)
+ {
+ // TODO: Need advanced normalizer for better performance
+ text = text.Normalize(NormalizationForm.FormKD);
+
+ // FIXME: this should be fixed for non-English languages
+
+ // Remove emojis (wide Unicode range)
+ // C# doesn't support \u{...} syntax in regex, so we use character filtering instead
+ text = RemoveEmojis(text);
+
+ // Replace various dashes and symbols
+ var replacements = new Dictionary
+ {
+ {"–", "-"}, // en dash
+ {"‑", "-"}, // non-breaking hyphen
+ {"—", "-"}, // em dash
+ {"¯", " "}, // macron
+ {"_", " "}, // underscore
+ {"\u201C", "\""}, // left double quote
+ {"\u201D", "\""}, // right double quote
+ {"\u2018", "'"}, // left single quote
+ {"\u2019", "'"}, // right single quote
+ {"´", "'"}, // acute accent
+ {"`", "'"}, // grave accent
+ {"[", " "}, // left bracket
+ {"]", " "}, // right bracket
+ {"|", " "}, // vertical bar
+ {"/", " "}, // slash
+ {"#", " "}, // hash
+ {"→", " "}, // right arrow
+ {"←", " "}, // left arrow
+ };
+
+ foreach (var kvp in replacements)
+ {
+ text = text.Replace(kvp.Key, kvp.Value);
+ }
+
+ // Remove combining diacritics // FIXME: this should be fixed for non-English languages
+ text = Regex.Replace(text, @"[\u0302\u0303\u0304\u0305\u0306\u0307\u0308\u030A\u030B\u030C\u0327\u0328\u0329\u032A\u032B\u032C\u032D\u032E\u032F]", "");
+
+ // Remove special symbols (the last class member is a literal backslash)
+ text = Regex.Replace(text, @"[♥☆♡©\\]", "");
+
+ // Replace known expressions with spoken forms
+ var exprReplacements = new Dictionary
+ {
+ {"@", " at "},
+ {"e.g.,", "for example, "},
+ {"i.e.,", "that is, "},
+ };
+
+ foreach (var kvp in exprReplacements)
+ {
+ text = text.Replace(kvp.Key, kvp.Value);
+ }
+
+ // Fix spacing around punctuation: drop a single space directly before each mark
+ text = Regex.Replace(text, @" ,", ",");
+ text = Regex.Replace(text, @" \.", ".");
+ text = Regex.Replace(text, @" !", "!");
+ text = Regex.Replace(text, @" \?", "?");
+ text = Regex.Replace(text, @" ;", ";");
+ text = Regex.Replace(text, @" :", ":");
+ text = Regex.Replace(text, @" '", "'");
+
+ // Remove duplicate quotes (loops until no doubled quote remains)
+ while (text.Contains("\"\""))
+ {
+ text = text.Replace("\"\"", "\"");
+ }
+ while (text.Contains("''"))
+ {
+ text = text.Replace("''", "'");
+ }
+ while (text.Contains("``")) // NOTE(review): backticks were already mapped to ' above, so this loop looks unreachable - confirm
+ {
+ text = text.Replace("``", "`");
+ }
+
+ // Remove extra spaces: collapse whitespace runs to one space and trim both ends
+ text = Regex.Replace(text, @"\s+", " ").Trim();
+
+ // If text doesn't end with punctuation, quotes, or closing brackets, add a period
+ if (!Regex.IsMatch(text, @"[.!?;:,'\u0022\u201C\u201D\u2018\u2019)\]}…。」』】〉》›»]$"))
+ {
+ text += ".";
+ }
+
+ return text;
+ }
+
+
+ /// <summary>
+ /// Creates a tensor from VoiceData by copying the nested <c>Data</c> array into a tensor of shape <c>Dimensions</c>.
+ /// </summary>
+ /// <param name="styleData">The style data.</param>
+ /// <returns>The tensor holding the flattened style data.</returns>
+ private static Tensor CreateTensor(VoiceDataJson styleData)
+ {
+ var idx = 0;
+ var dims = styleData.Dimensions; // indexed at [0], [1] and [2] below, so at least three entries are required
+ var tensor = new Tensor(dims);
+ for (int b = 0; b < dims[0]; b++)
+ for (int d = 0; d < dims[1]; d++)
+ for (int t = 0; t < dims[2]; t++)
+ tensor.Memory.Span[idx++] = styleData.Data[b][d][t]; // written sequentially, last index varying fastest
+ return tensor;
+ }
+
+
+ /// <summary>
+ /// Loads the voice styles: one style per *.json file directly inside the directory, keyed by file name without extension.
+ /// </summary>
+ /// <param name="stylePath">The style path.</param>
+ private static Dictionary LoadVoiceStyles(string stylePath)
+ {
+ var voiceStyles = new Dictionary();
+ foreach (var styleFile in Directory.EnumerateFiles(stylePath, "*.json", SearchOption.TopDirectoryOnly))
+ {
+ try
+ {
+ using (var jsonReader = File.OpenRead(styleFile))
+ {
+ var voiceStyle = JsonSerializer.Deserialize(jsonReader);
+ var globalTensor = CreateTensor(voiceStyle.Global);
+ var dropoutTensor = CreateTensor(voiceStyle.Dropout);
+ var name = Path.GetFileNameWithoutExtension(styleFile);
+ voiceStyles.Add(name, new VoiceStyle(name, globalTensor, dropoutTensor));
+ }
+
+ }
+ catch (Exception ex)
+ {
+ System.Diagnostics.Trace.TraceWarning($"Skipping voice style file '{styleFile}': {ex.Message}"); // still best-effort: a bad file is reported and skipped, the caller fails only if no style loads
+ }
+ }
+ return voiceStyles;
+ }
+
+
+ /// <summary>
+ /// Loads the indexer: reads a JSON array and maps each array position to the value stored at that position.
+ /// </summary>
+ /// <param name="indexerPath">The indexer path.</param>
+ private static Dictionary LoadIndexer(string indexerPath)
+ {
+ var indexer = new Dictionary();
+ using (var jsonReader = File.OpenRead(indexerPath))
+ {
+ var indexerArray = JsonSerializer.Deserialize(jsonReader);
+ for (int i = 0; i < indexerArray.Length; i++)
+ {
+ indexer.Add(i, indexerArray[i]); // key = array position; GetTextIdsInternal looks keys up by char value
+ }
+ return indexer;
+ }
+ }
+
+
+ private record VoiceStyleJson // JSON shape of one voice style file
+ {
+ [JsonPropertyName("style_ttl")]
+ public VoiceDataJson Global { get; set; } // becomes VoiceStyle.Global (passed to the encoder and estimator)
+
+ [JsonPropertyName("style_dp")]
+ public VoiceDataJson Dropout { get; set; } // becomes VoiceStyle.Dropout (passed to the duration predictor); NOTE(review): "dp" may not mean dropout - confirm
+ }
+
+
+ private record VoiceDataJson // JSON shape of one style tensor: nested values plus their dimensions
+ {
+ [JsonPropertyName("data")]
+ public float[][][] Data { get; set; } // three-level nested values, read as Data[b][d][t] by CreateTensor
+
+ [JsonPropertyName("dims")]
+ public int[] Dimensions { get; set; } // tensor shape; CreateTensor reads the first three entries
+ }
+ }
+}
diff --git a/TensorStack.TextGeneration/Pipelines/Supertonic/VoiceStyle.cs b/TensorStack.TextGeneration/Pipelines/Supertonic/VoiceStyle.cs
new file mode 100644
index 0000000..1316444
--- /dev/null
+++ b/TensorStack.TextGeneration/Pipelines/Supertonic/VoiceStyle.cs
@@ -0,0 +1,18 @@
+using TensorStack.Common.Tensor;
+
+namespace TensorStack.TextGeneration.Pipelines.Supertonic
+{
+ public record VoiceStyle // a named voice style made of two tensors
+ {
+ public VoiceStyle(string name, Tensor global, Tensor dropout)
+ {
+ Name = name;
+ Global = global;
+ Dropout = dropout;
+ }
+
+ public string Name { get; init; } // style name; the processor uses the style file name without extension
+ public Tensor Global { get; init; } // passed to the text encoder and vector estimator by the pipeline
+ public Tensor Dropout { get; init; } // passed to the duration predictor by the pipeline
+ }
+}