com.rest.elevenlabs 3.2.1 (#54)

- make TextToSpeechRequest public
- fix text encoding bug
RageAgainstThePixel · Dec 15, 2023 · 3c410a7 · 3c410a7
1 parent 19515c4
commit 3c410a7
Show file tree

Hide file tree

Showing 5 changed files with 112 additions and 62 deletions.
diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Models/Model.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Models/Model.cs
@@ -1,5 +1,6 @@
 // Licensed under the MIT License. See LICENSE in the project root for license information.
 
+using System;
 using Newtonsoft.Json;
 using System.Collections.Generic;
 using UnityEngine.Scripting;
@@ -79,12 +80,40 @@ public Model(string id)
 
         [Preserve]
         [JsonIgnore]
-        public static Model MonoLingualV1 { get; } = new Model("eleven_monolingual_v1");
+        [Obsolete("Use EnglishV1")]
+        public static Model MonoLingualV1 => EnglishV1;
 
+        /// <summary>
+        /// Use our standard English language model to generate speech in a variety of voices, styles and moods.
+        /// </summary>
+        [Preserve]
+        [JsonIgnore]
+        public static Model EnglishV1 { get; } = new Model("eleven_monolingual_v1");
+
+        /// <summary>
+        /// Speech to speech model suitable for scenarios where you need maximum control over the content and prosody of your generations.
+        /// </summary>
+        [Preserve]
+        [JsonIgnore]
+        public static Model EnglishV2 { get; } = new Model("eleven_english_sts_v2");
+
+        /// <summary>
+        /// Cutting-edge turbo model is ideally suited for tasks demanding extremely low latency.
+        /// </summary>
+        [Preserve]
+        [JsonIgnore]
+        public static Model EnglishTurboV2 { get; } = new Model("eleven_turbo_v2");
+
+        /// <summary>
+        /// Generate lifelike speech in multiple languages and create content that resonates with a broader audience.
+        /// </summary>
         [Preserve]
         [JsonIgnore]
         public static Model MultiLingualV1 { get; } = new Model("eleven_multilingual_v1");
 
+        /// <summary>
+        /// State of the art multilingual speech synthesis model, able to generate life-like speech in 29 languages.
+        /// </summary>
         [Preserve]
         [JsonIgnore]
         public static Model MultiLingualV2 { get; } = new Model("eleven_multilingual_v2");

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs
@@ -7,6 +7,7 @@
 using System;
 using System.Collections.Generic;
 using System.IO;
+using System.Text;
 using System.Threading;
 using System.Threading.Tasks;
 using UnityEngine;
@@ -63,38 +64,43 @@ public sealed class TextToSpeechEndpoint : ElevenLabsBaseEndPoint
         /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
         /// <returns><see cref="VoiceClip"/>.</returns>
         public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, CancellationToken cancellationToken = default)
-        {
-            ValidateInputs(text, voice);
+            => await TextToSpeechAsync(new TextToSpeechRequest(voice, text, Encoding.UTF8, voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken), outputFormat, optimizeStreamingLatency, model), cancellationToken);
 
-            var defaultVoiceSettings = voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken);
-            var request = new TextToSpeechRequest(text, model, defaultVoiceSettings);
+        /// <summary>
+        /// Converts text into speech using a voice of your choice and returns audio.
+        /// </summary>
+        /// <param name="request"><see cref="TextToSpeechRequest"/>.</param>
+        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
+        /// <returns><see cref="VoiceClip"/>.</returns>
+        public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, CancellationToken cancellationToken = default)
+        {
             var payload = JsonConvert.SerializeObject(request, ElevenLabsClient.JsonSerializationOptions);
             var parameters = new Dictionary<string, string>
             {
-                { OutputFormatParameter, outputFormat.ToString().ToLower() }
+                { OutputFormatParameter, request.OutputFormat.ToString().ToLower() }
             };
 
-            if (optimizeStreamingLatency.HasValue)
+            if (request.OptimizeStreamingLatency.HasValue)
             {
-                parameters.Add(OptimizeStreamingLatencyParameter, optimizeStreamingLatency.Value.ToString());
+                parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString());
             }
 
-            var response = await Rest.PostAsync(GetUrl($"/{voice.Id}", parameters), payload, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
+            var response = await Rest.PostAsync(GetUrl($"/{request.Voice.Id}", parameters), payload, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
             response.Validate(EnableDebug);
 
             if (!response.Headers.TryGetValue(HistoryItemId, out var clipId))
             {
                 throw new ArgumentException("Failed to parse clip id!");
             }
 
-            var audioType = outputFormat.GetAudioType();
+            var audioType = request.OutputFormat.GetAudioType();
             var extension = audioType switch
             {
                 AudioType.MPEG => "mp3",
                 AudioType.OGGVORBIS => "ogg",
                 _ => throw new ArgumentOutOfRangeException($"Unsupported {nameof(AudioType)}: {audioType}")
             };
-            var downloadDirectory = await GetCacheDirectoryAsync(voice);
+            var downloadDirectory = await GetCacheDirectoryAsync(request.Voice);
             var cachedPath = $"{downloadDirectory}/{clipId}.{extension}";
 
             if (!File.Exists(cachedPath))
@@ -106,13 +112,13 @@ public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSe
                         break;
                     case AudioType.OGGVORBIS:
                         var pcmData = PCMEncoder.Decode(response.Data, PCMFormatSize.SixteenBit);
-                        var frequency = outputFormat switch
+                        var frequency = request.OutputFormat switch
                         {
                             OutputFormat.PCM_16000 => 16000,
                             OutputFormat.PCM_22050 => 22050,
                             OutputFormat.PCM_24000 => 24000,
                             OutputFormat.PCM_44100 => 44100,
-                            _ => throw new ArgumentOutOfRangeException(nameof(outputFormat), outputFormat, null)
+                            _ => throw new ArgumentOutOfRangeException(nameof(request.OutputFormat), request.OutputFormat, null)
                         };
                         var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, frequency, 1, cancellationToken: cancellationToken).ConfigureAwait(false);
                         await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken).ConfigureAwait(false);
@@ -124,7 +130,7 @@ public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSe
 
             await Awaiters.UnityMainThread;
             var audioClip = await Rest.DownloadAudioClipAsync($"file://{cachedPath}", audioType, debug: EnableDebug, cancellationToken: cancellationToken);
-            return new VoiceClip(clipId, text, voice, audioClip, cachedPath);
+            return new VoiceClip(clipId, request.Text, request.Voice, audioClip, cachedPath);
         }
 
         /// <summary>
@@ -167,36 +173,47 @@ public async Task<VoiceClip> TextToSpeechAsync(string text, Voice voice, VoiceSe
         /// </param>
         /// <returns>Downloaded clip path, and the loaded audio clip.</returns>
         public async Task<VoiceClip> StreamTextToSpeechAsync(string text, Voice voice, Action<AudioClip> partialClipCallback, VoiceSettings voiceSettings = null, Model model = null, OutputFormat outputFormat = OutputFormat.PCM_24000, int? optimizeStreamingLatency = null, CancellationToken cancellationToken = default)
-        {
-            ValidateInputs(text, voice);
+            => await StreamTextToSpeechAsync(new TextToSpeechRequest(voice, text, Encoding.UTF8, voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken), outputFormat, optimizeStreamingLatency, model), partialClipCallback, cancellationToken);
 
-            var frequency = outputFormat switch
+        /// <summary>
+        /// Converts text into speech using a voice of your choice and returns audio as an audio stream.
+        /// </summary>
+        /// <param name="request"><see cref="TextToSpeechRequest"/>.</param>
+        /// <param name="partialClipCallback">
+        /// Optional, Callback to enable streaming audio as it comes in.<br/>
+        /// Returns partial <see cref="VoiceClip"/>.
+        /// </param>
+        /// <param name="cancellationToken">
+        /// Optional, <see cref="CancellationToken"/>.
+        /// </param>
+        /// <returns>Downloaded clip path, and the loaded audio clip.</returns>
+        public async Task<VoiceClip> StreamTextToSpeechAsync(TextToSpeechRequest request, Action<AudioClip> partialClipCallback, CancellationToken cancellationToken = default)
+        {
+            var frequency = request.OutputFormat switch
             {
-                OutputFormat.MP3_44100_64 => throw new InvalidOperationException($"{nameof(outputFormat)} must be a PCM format for streaming!"),
-                OutputFormat.MP3_44100_96 => throw new InvalidOperationException($"{nameof(outputFormat)} must be a PCM format for streaming!"),
-                OutputFormat.MP3_44100_128 => throw new InvalidOperationException($"{nameof(outputFormat)} must be a PCM format for streaming!"),
-                OutputFormat.MP3_44100_192 => throw new InvalidOperationException($"{nameof(outputFormat)} must be a PCM format for streaming!"),
+                OutputFormat.MP3_44100_64 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"),
+                OutputFormat.MP3_44100_96 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"),
+                OutputFormat.MP3_44100_128 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"),
+                OutputFormat.MP3_44100_192 => throw new InvalidOperationException($"{nameof(request.OutputFormat)} must be a PCM format for streaming!"),
                 OutputFormat.PCM_16000 => 16000,
                 OutputFormat.PCM_22050 => 22050,
                 OutputFormat.PCM_24000 => 24000,
                 OutputFormat.PCM_44100 => 44100,
-                _ => throw new ArgumentOutOfRangeException(nameof(outputFormat), outputFormat, null)
+                _ => throw new ArgumentOutOfRangeException(nameof(request.OutputFormat), request.OutputFormat, null)
             };
-            var defaultVoiceSettings = voiceSettings ?? voice.Settings ?? await client.VoicesEndpoint.GetDefaultVoiceSettingsAsync(cancellationToken);
-            var request = new TextToSpeechRequest(text, model, defaultVoiceSettings);
             var payload = JsonConvert.SerializeObject(request, ElevenLabsClient.JsonSerializationOptions);
             var parameters = new Dictionary<string, string>
             {
-                { OutputFormatParameter, outputFormat.ToString().ToLower() }
+                { OutputFormatParameter, request.OutputFormat.ToString().ToLower() }
             };
 
-            if (optimizeStreamingLatency.HasValue)
+            if (request.OptimizeStreamingLatency.HasValue)
             {
-                parameters.Add(OptimizeStreamingLatencyParameter, optimizeStreamingLatency.Value.ToString());
+                parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString());
             }
 
             var part = 0;
-            var response = await Rest.PostAsync(GetUrl($"/{voice.Id}/stream", parameters), payload, StreamCallback, eventChunkSize: 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken).ConfigureAwait(true);
+            var response = await Rest.PostAsync(GetUrl($"/{request.Voice.Id}/stream", parameters), payload, StreamCallback, eventChunkSize: 8192, new RestParameters(client.DefaultRequestHeaders), cancellationToken).ConfigureAwait(true);
             response.Validate(EnableDebug);
 
             if (!response.Headers.TryGetValue(HistoryItemId, out var clipId))
@@ -207,12 +224,12 @@ public async Task<VoiceClip> StreamTextToSpeechAsync(string text, Voice voice, A
             var pcmData = PCMEncoder.Decode(response.Data, PCMFormatSize.SixteenBit);
             var fullClip = AudioClip.Create(clipId, pcmData.Length, 1, frequency, false);
             var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, frequency, 1, cancellationToken: cancellationToken).ConfigureAwait(false);
-            var downloadDirectory = await GetCacheDirectoryAsync(voice);
+            var downloadDirectory = await GetCacheDirectoryAsync(request.Voice);
             var cachedPath = $"{downloadDirectory}/{clipId}.ogg";
             await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken: cancellationToken).ConfigureAwait(false);
             await Awaiters.UnityMainThread;
             await Rest.DownloadAudioClipAsync($"file://{cachedPath}", AudioType.OGGVORBIS, debug: EnableDebug, cancellationToken: cancellationToken);
-            return new VoiceClip(clipId, text, voice, fullClip, cachedPath);
+            return new VoiceClip(clipId, request.Text, request.Voice, fullClip, cachedPath);
 
             void StreamCallback(Response partialResponse)
             {
@@ -242,25 +259,6 @@ void StreamCallback(Response partialResponse)
             }
         }
 
-        private static void ValidateInputs(string text, Voice voice)
-        {
-            if (string.IsNullOrWhiteSpace(text))
-            {
-                throw new ArgumentNullException(nameof(text));
-            }
-
-            if (text.Length > 5000)
-            {
-                throw new ArgumentOutOfRangeException(nameof(text), $"{nameof(text)} cannot exceed 5000 characters");
-            }
-
-            if (voice == null ||
-                string.IsNullOrWhiteSpace(voice.Id))
-            {
-                throw new ArgumentNullException(nameof(voice));
-            }
-        }
-
         private static async Task<string> GetCacheDirectoryAsync(Voice voice)
         {
             await Rest.ValidateCacheDirectoryAsync();

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechRequest.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechRequest.cs
@@ -10,27 +10,38 @@
 namespace ElevenLabs.TextToSpeech
 {
     [Preserve]
-    internal sealed class TextToSpeechRequest
+    public sealed class TextToSpeechRequest
     {
-        [JsonConstructor]
-        public TextToSpeechRequest(
-            [JsonProperty("text")] string text,
-            [JsonProperty("model_id")] Model model,
-            [JsonProperty("voice_settings")] VoiceSettings voiceSettings)
+        [Preserve]
+        public TextToSpeechRequest(Voice voice, string text, Encoding encoding = null, VoiceSettings voiceSettings = null, OutputFormat outputFormat = OutputFormat.MP3_44100_128, int? optimizeStreamingLatency = null, Model model = null)
         {
             if (string.IsNullOrWhiteSpace(text))
             {
                 throw new ArgumentNullException(nameof(text));
             }
 
-            if (!Encoding.GetEncoding(text).Equals(Encoding.UTF8))
+            if (text.Length > 5000)
+            {
+                throw new ArgumentOutOfRangeException(nameof(text), $"{nameof(text)} cannot exceed 5000 characters");
+            }
+
+            if (voice == null ||
+                string.IsNullOrWhiteSpace(voice.Id))
+            {
+                throw new ArgumentNullException(nameof(voice));
+            }
+
+            if (encoding?.Equals(Encoding.UTF8) == false)
             {
-                text = Encoding.UTF8.GetString(Encoding.Default.GetBytes(text));
+                text = Encoding.UTF8.GetString(encoding.GetBytes(text));
             }
 
             Text = text;
-            Model = model ?? Models.Model.MonoLingualV1;
-            VoiceSettings = voiceSettings ?? throw new ArgumentNullException(nameof(voiceSettings));
+            Model = model ?? Models.Model.MultiLingualV2;
+            Voice = voice;
+            VoiceSettings = voiceSettings ?? voice.Settings ?? throw new ArgumentNullException(nameof(voiceSettings));
+            OutputFormat = outputFormat;
+            OptimizeStreamingLatency = optimizeStreamingLatency;
         }
 
         [Preserve]
@@ -41,8 +52,20 @@ internal sealed class TextToSpeechRequest
         [JsonProperty("model_id")]
         public string Model { get; }
 
+        [Preserve]
+        [JsonIgnore]
+        public Voice Voice { get; }
+
         [Preserve]
         [JsonProperty("voice_settings")]
         public VoiceSettings VoiceSettings { get; internal set; }
+
+        [Preserve]
+        [JsonIgnore]
+        public OutputFormat OutputFormat { get; }
+
+        [Preserve]
+        [JsonIgnore]
+        public int? OptimizeStreamingLatency { get; }
     }
 }
diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs
@@ -1,5 +1,6 @@
 // Licensed under the MIT License. See LICENSE in the project root for license information.
 
+using ElevenLabs.Models;
 using ElevenLabs.Voices;
 using System;
 using System.Collections.Generic;
@@ -54,8 +55,7 @@ private async void Start()
                 var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync(message, voice, partialClip =>
                 {
                     streamClipQueue.Enqueue(partialClip);
-                }, cancellationToken: lifetimeCancellationTokenSource.Token);
-
+                }, model: Model.EnglishTurboV2, cancellationToken: lifetimeCancellationTokenSource.Token);
                 audioSource.clip = voiceClip.AudioClip;
                 Debug.Log($"Full clip: {voiceClip.Id}");
             }

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/package.json b/ElevenLabs/Packages/com.rest.elevenlabs/package.json
@@ -3,7 +3,7 @@
   "displayName": "ElevenLabs",
   "description": "A non-official Eleven Labs voice synthesis RESTful client.",
   "keywords": [],
-  "version": "3.2.0",
+  "version": "3.2.1",
   "unity": "2021.3",
   "documentationUrl": "https://github.com/RageAgainstThePixel/com.rest.elevenlabs#documentation",
   "changelogUrl": "https://github.com/RageAgainstThePixel/com.rest.elevenlabs/releases",