com.rest.elevenlabs 3.2.4 (#60)

- fixed streaming text to speech endpoint
- voice clip returning empty
- application freeze when encoding PCM to OGG
- updated TextToSpeechDemo
- added ElevenLabsClient constructor overload to pass in configuration
- updated com.utilities.rest -> 2.5.1
- updated com.utilities.encoder.ogg -> 3.1.3
RageAgainstThePixel · Jan 29, 2024 · 53fd541 · 53fd541
1 parent b4b35eb
commit 53fd541
Show file tree

Hide file tree

Showing 9 changed files with 103 additions and 75 deletions.
diff --git a/.github/workflows/upm-subtree-split.yml b/.github/workflows/upm-subtree-split.yml
@@ -13,4 +13,4 @@ jobs:
       with:
         fetch-depth: 0
 
-    - uses: RageAgainstThePixel/upm-subtree-split@v1
+    - uses: RageAgainstThePixel/upm-subtree-split@v1.1
diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Editor/ElevenLabsDashboard.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Editor/ElevenLabsDashboard.cs
@@ -41,19 +41,19 @@ private class VoiceGenerationArgs : ScriptableObject
 
                 public string voiceName;
 
-                public List<AudioClip> voiceSamples = new List<AudioClip>();
+                public List<AudioClip> voiceSamples = new();
 
-                public readonly Dictionary<string, string> labels = new Dictionary<string, string>();
+                public readonly Dictionary<string, string> labels = new();
 
                 public GeneratedVoicePreviewRequest CreateRequest()
-                    => new GeneratedVoicePreviewRequest(
+                    => new(
                         speechSynthesisTextInput,
                         generatedVoiceOptions.Genders[genderSelection],
                         generatedVoiceOptions.Accents[accentSelection],
                         generatedVoiceOptions.Ages[ageSelection]);
             }
 
-            private readonly Vector2 windowSize = new Vector2(WideColumnWidth * 4, WideColumnWidth * 3);
+            private readonly Vector2 windowSize = new(WideColumnWidth * 4, WideColumnWidth * 3);
 
             private static readonly string[] popupTabTitles = { "Voice Designer", "Voice Cloning" };
 
@@ -337,39 +337,39 @@ private async void AddVoice()
         private const string LabelControlField = "LabelControlField";
 
 
-        private static readonly GUIContent saveDirectoryContent = new GUIContent("Save Directory");
+        private static readonly GUIContent saveDirectoryContent = new("Save Directory");
 
-        private static readonly GUIContent guiTitleContent = new GUIContent($"{nameof(ElevenLabs)} Dashboard");
+        private static readonly GUIContent guiTitleContent = new($"{nameof(ElevenLabs)} Dashboard");
 
-        private static readonly GUIContent voiceContent = new GUIContent("Voice");
+        private static readonly GUIContent voiceContent = new("Voice");
 
-        private static readonly GUIContent modelContent = new GUIContent("Model");
+        private static readonly GUIContent modelContent = new("Model");
 
-        private static readonly GUIContent stabilityContent = new GUIContent("Stability");
+        private static readonly GUIContent stabilityContent = new("Stability");
 
-        private static readonly GUIContent moreVariableContent = new GUIContent("More Variable", "Increasing variability can make speech more expressive with output varying between re-generations. It can also lead to instabilities.");
+        private static readonly GUIContent moreVariableContent = new("More Variable", "Increasing variability can make speech more expressive with output varying between re-generations. It can also lead to instabilities.");
 
-        private static readonly GUIContent moreStableContent = new GUIContent("More Stable", "Increasing stability will make the voice more consistent between re-generations, but it can also make it sounds a bit monotone. On longer text fragments we recommend lowering this value.");
+        private static readonly GUIContent moreStableContent = new("More Stable", "Increasing stability will make the voice more consistent between re-generations, but it can also make it sounds a bit monotone. On longer text fragments we recommend lowering this value.");
 
-        private static readonly GUIContent clarityContent = new GUIContent("Clarity + Similarity Enhancement");
+        private static readonly GUIContent clarityContent = new("Clarity + Similarity Enhancement");
 
-        private static readonly GUIContent lowClarityContent = new GUIContent("Low", "Low values are recommended if background artifacts are present in generated speech.");
+        private static readonly GUIContent lowClarityContent = new("Low", "Low values are recommended if background artifacts are present in generated speech.");
 
-        private static readonly GUIContent highClarityContent = new GUIContent("High", "Recommended. High enhancement boosts overall voice clarity and target speaker similarity. Very high values can cause artifacts, so adjusting this setting to find the optimal value is encouraged.");
+        private static readonly GUIContent highClarityContent = new("High", "Recommended. High enhancement boosts overall voice clarity and target speaker similarity. Very high values can cause artifacts, so adjusting this setting to find the optimal value is encouraged.");
 
-        private static readonly GUIContent addNewSampleContent = new GUIContent("Add new Sample(s)");
+        private static readonly GUIContent addNewSampleContent = new("Add new Sample(s)");
 
-        private static readonly GUIContent downloadContent = new GUIContent("Download");
+        private static readonly GUIContent downloadContent = new("Download");
 
-        private static readonly GUIContent deleteContent = new GUIContent("Delete");
+        private static readonly GUIContent deleteContent = new("Delete");
 
-        private static readonly GUIContent refreshContent = new GUIContent("Refresh");
+        private static readonly GUIContent refreshContent = new("Refresh");
 
-        private static readonly GUIContent downloadingContent = new GUIContent("Download in progress...");
+        private static readonly GUIContent downloadingContent = new("Download in progress...");
 
-        private static readonly GUIContent keyContent = new GUIContent("Key");
+        private static readonly GUIContent keyContent = new("Key");
 
-        private static readonly GUIContent valueContent = new GUIContent("Value");
+        private static readonly GUIContent valueContent = new("Value");
 
         private static readonly string[] tabTitles = { "Speech Synthesis", "Voice Lab", "History" };
 
@@ -463,9 +463,9 @@ private static GUIStyle BoldCenteredHeaderStyle
 
         private static GeneratedVoiceOptions generatedVoiceOptions;
 
-        private static readonly ConcurrentDictionary<string, Dictionary<string, string>> voiceLabels = new ConcurrentDictionary<string, Dictionary<string, string>>();
+        private static readonly ConcurrentDictionary<string, Dictionary<string, string>> voiceLabels = new();
 
-        private static readonly ConcurrentDictionary<string, IEnumerable<AudioClip>> voiceSampleCache = new ConcurrentDictionary<string, IEnumerable<AudioClip>>();
+        private static readonly ConcurrentDictionary<string, IEnumerable<AudioClip>> voiceSampleCache = new();
 
         private static bool hasFetchedHistory;
 
@@ -477,11 +477,11 @@ private static GUIStyle BoldCenteredHeaderStyle
 
         private static bool[] historySelections;
 
-        private static readonly Stack<string> pageHistoryIds = new Stack<string>();
+        private static readonly Stack<string> pageHistoryIds = new();
 
-        private static readonly ConcurrentDictionary<string, GUIContent> historyItemLabelCache = new ConcurrentDictionary<string, GUIContent>();
+        private static readonly ConcurrentDictionary<string, GUIContent> historyItemLabelCache = new();
 
-        private static readonly ConcurrentDictionary<string, AudioClip> downloadedAudioClips = new ConcurrentDictionary<string, AudioClip>();
+        private static readonly ConcurrentDictionary<string, AudioClip> downloadedAudioClips = new();
 
         private static GUIContent audioPlayButtonContent;
 

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Authentication/ElevenLabsAuthentication.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Authentication/ElevenLabsAuthentication.cs
@@ -20,7 +20,7 @@ public sealed class ElevenLabsAuthentication : AbstractAuthentication<ElevenLabs
         /// Allows implicit casting from a string, so that a simple string API key can be provided in place of an instance of Authentication.
         /// </summary>
         /// <param name="apiKey">The API key.</param>
-        public static implicit operator ElevenLabsAuthentication(string apiKey) => new ElevenLabsAuthentication(apiKey);
+        public static implicit operator ElevenLabsAuthentication(string apiKey) => new(apiKey);
 
         /// <summary>
         /// Instantiates an empty Authentication object.

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/ElevenLabsClient.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/ElevenLabsClient.cs
@@ -15,6 +15,14 @@ namespace ElevenLabs
 {
     public sealed class ElevenLabsClient : BaseClient<ElevenLabsAuthentication, ElevenLabsSettings>
     {
+        /// <inheritdoc/>
+        public ElevenLabsClient(ElevenLabsConfiguration configuration)
+            : this(
+                configuration != null ? new ElevenLabsAuthentication(configuration) : null,
+                configuration != null ? new ElevenLabsSettings(configuration) : null)
+        {
+        }
+
         /// <summary>
         /// Creates a new client for the ElevenLabs API, handling auth and allowing for access to various API endpoints.
         /// </summary>
@@ -65,7 +73,7 @@ protected override void ValidateAuthentication()
         /// <summary>
         /// The <see cref="JsonSerializationOptions"/> to use when making calls to the API.
         /// </summary>
-        internal static JsonSerializerSettings JsonSerializationOptions { get; } = new JsonSerializerSettings
+        internal static JsonSerializerSettings JsonSerializationOptions { get; } = new()
         {
             DefaultValueHandling = DefaultValueHandling.Ignore
         };

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Models/Model.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Models/Model.cs
@@ -88,35 +88,35 @@ public Model(string id)
         /// </summary>
         [Preserve]
         [JsonIgnore]
-        public static Model EnglishV1 { get; } = new Model("eleven_monolingual_v1");
+        public static Model EnglishV1 { get; } = new("eleven_monolingual_v1");
 
         /// <summary>
         /// Speech to speech model suitable for scenarios where you need maximum control over the content and prosody of your generations.
         /// </summary>
         [Preserve]
         [JsonIgnore]
-        public static Model EnglishV2 { get; } = new Model("eleven_english_sts_v2");
+        public static Model EnglishV2 { get; } = new("eleven_english_sts_v2");
 
         /// <summary>
         /// Cutting-edge turbo model is ideally suited for tasks demanding extremely low latency.
         /// </summary>
         [Preserve]
         [JsonIgnore]
-        public static Model EnglishTurboV2 { get; } = new Model("eleven_turbo_v2");
+        public static Model EnglishTurboV2 { get; } = new("eleven_turbo_v2");
 
         /// <summary>
         /// Generate lifelike speech in multiple languages and create content that resonates with a broader audience.
         /// </summary>
         [Preserve]
         [JsonIgnore]
-        public static Model MultiLingualV1 { get; } = new Model("eleven_multilingual_v1");
+        public static Model MultiLingualV1 { get; } = new("eleven_multilingual_v1");
 
         /// <summary>
         /// State of the art multilingual speech synthesis model, able to generate life-like speech in 29 languages.
         /// </summary>
         [Preserve]
         [JsonIgnore]
-        public static Model MultiLingualV2 { get; } = new Model("eleven_multilingual_v2");
+        public static Model MultiLingualV2 { get; } = new("eleven_multilingual_v2");
 
         #endregion Predefined Models
     }

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/TextToSpeech/TextToSpeechEndpoint.cs
@@ -221,14 +221,12 @@ public async Task<VoiceClip> StreamTextToSpeechAsync(TextToSpeechRequest request
                 throw new ArgumentException("Failed to parse clip id!");
             }
 
-            var pcmData = PCMEncoder.Decode(response.Data, PCMFormatSize.SixteenBit);
-            var fullClip = AudioClip.Create(clipId, pcmData.Length, 1, frequency, false);
-            var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, frequency, 1, cancellationToken: cancellationToken).ConfigureAwait(false);
+            var pcmData = PCMEncoder.Decode(response.Data);
             var downloadDirectory = await GetCacheDirectoryAsync(request.Voice);
             var cachedPath = $"{downloadDirectory}/{clipId}.ogg";
-            await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken: cancellationToken).ConfigureAwait(false);
-            await Awaiters.UnityMainThread;
-            await Rest.DownloadAudioClipAsync($"file://{cachedPath}", AudioType.OGGVORBIS, parameters: new RestParameters(debug: EnableDebug), cancellationToken: cancellationToken);
+            var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, frequency, 1, cancellationToken: cancellationToken);
+            await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken: cancellationToken);
+            var fullClip = await Rest.DownloadAudioClipAsync($"file://{cachedPath}", AudioType.OGGVORBIS, parameters: new RestParameters(debug: EnableDebug), compressed: false, streamingAudio: true, cancellationToken: cancellationToken);
             return new VoiceClip(clipId, request.Text, request.Voice, fullClip, cachedPath);
 
             void StreamCallback(Response partialResponse)
@@ -240,7 +238,7 @@ void StreamCallback(Response partialResponse)
                         throw new ArgumentException("Failed to parse clip id!");
                     }
 
-                    var chunk = PCMEncoder.Decode(partialResponse.Data, PCMFormatSize.SixteenBit);
+                    var chunk = PCMEncoder.Decode(partialResponse.Data);
                     var audioClip = AudioClip.Create($"{clipId}_{++part}", chunk.Length, 1, frequency, false);
 
                     if (!audioClip.SetData(chunk, 0))

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/Voice.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/Voice.cs
@@ -103,39 +103,39 @@ public string Id
 
         [Preserve]
         [JsonIgnore]
-        public static Voice Adam { get; } = new Voice("pNInz6obpgDQGcFmaJgB", nameof(Adam));
+        public static Voice Adam { get; } = new("pNInz6obpgDQGcFmaJgB", nameof(Adam));
 
         [Preserve]
         [JsonIgnore]
-        public static Voice Antoni { get; } = new Voice("ErXwobaYiN019PkySvjV", nameof(Antoni));
+        public static Voice Antoni { get; } = new("ErXwobaYiN019PkySvjV", nameof(Antoni));
 
         [Preserve]
         [JsonIgnore]
-        public static Voice Arnold { get; } = new Voice("VR6AewLTigWG4xSOukaG", nameof(Arnold));
+        public static Voice Arnold { get; } = new("VR6AewLTigWG4xSOukaG", nameof(Arnold));
 
         [Preserve]
         [JsonIgnore]
-        public static Voice Bella { get; } = new Voice("EXAVITQu4vr4xnSDxMaL", nameof(Bella));
+        public static Voice Bella { get; } = new("EXAVITQu4vr4xnSDxMaL", nameof(Bella));
 
         [Preserve]
         [JsonIgnore]
-        public static Voice Domi { get; } = new Voice("AZnzlk1XvdvUeBnXmlld", nameof(Domi));
+        public static Voice Domi { get; } = new("AZnzlk1XvdvUeBnXmlld", nameof(Domi));
 
         [Preserve]
         [JsonIgnore]
-        public static Voice Elli { get; } = new Voice("MF3mGyEYCl7XYWbV9V6O", nameof(Elli));
+        public static Voice Elli { get; } = new("MF3mGyEYCl7XYWbV9V6O", nameof(Elli));
 
         [Preserve]
         [JsonIgnore]
-        public static Voice Josh { get; } = new Voice("TxGEqnHWrfWFTfGW9XjX", nameof(Josh));
+        public static Voice Josh { get; } = new("TxGEqnHWrfWFTfGW9XjX", nameof(Josh));
 
         [Preserve]
         [JsonIgnore]
-        public static Voice Rachel { get; } = new Voice("21m00Tcm4TlvDq8ikWAM", nameof(Rachel));
+        public static Voice Rachel { get; } = new("21m00Tcm4TlvDq8ikWAM", nameof(Rachel));
 
         [Preserve]
         [JsonIgnore]
-        public static Voice Sam { get; } = new Voice("yoZ06aMxZJJ28mfd3POQ", nameof(Sam));
+        public static Voice Sam { get; } = new("yoZ06aMxZJJ28mfd3POQ", nameof(Sam));
 
         #endregion Premade Voices
 

diff --git a/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs b/ElevenLabs/Packages/com.rest.elevenlabs/Samples~/TextToSpeech/TextToSpeechDemo.cs
@@ -6,13 +6,21 @@
 using System.Collections.Generic;
 using System.Linq;
 using System.Threading;
+using System.Threading.Tasks;
 using UnityEngine;
+using Utilities.Async;
 
 namespace ElevenLabs.Demo
 {
     [RequireComponent(typeof(AudioSource))]
     public class TextToSpeechDemo : MonoBehaviour
     {
+        [SerializeField]
+        private ElevenLabsConfiguration configuration;
+
+        [SerializeField]
+        private bool debug = true;
+
         [SerializeField]
         private Voice voice;
 
@@ -23,9 +31,7 @@ public class TextToSpeechDemo : MonoBehaviour
         [SerializeField]
         private AudioSource audioSource;
 
-        private readonly Queue<AudioClip> streamClipQueue = new Queue<AudioClip>();
-
-        private CancellationTokenSource lifetimeCancellationTokenSource;
+        private readonly Queue<AudioClip> streamClipQueue = new();
 
         private void OnValidate()
         {
@@ -38,48 +44,64 @@ private void OnValidate()
         private async void Start()
         {
             OnValidate();
-            lifetimeCancellationTokenSource = new CancellationTokenSource();
 
             try
             {
-                var api = new ElevenLabsClient();
+                var api = new ElevenLabsClient(configuration)
+                {
+                    EnableDebug = debug
+                };
 
                 if (voice == null)
                 {
-                    api.VoicesEndpoint.EnableDebug = true;
-                    voice = (await api.VoicesEndpoint.GetAllVoicesAsync(lifetimeCancellationTokenSource.Token)).FirstOrDefault();
+                    voice = (await api.VoicesEndpoint.GetAllVoicesAsync(destroyCancellationToken)).FirstOrDefault();
                 }
 
                 streamClipQueue.Clear();
-                api.TextToSpeechEndpoint.EnableDebug = true;
+                var streamQueueCts = CancellationTokenSource.CreateLinkedTokenSource(destroyCancellationToken);
+                PlayStreamQueue(streamQueueCts.Token);
                 var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync(message, voice, partialClip =>
                 {
                     streamClipQueue.Enqueue(partialClip);
-                }, model: Model.EnglishTurboV2, cancellationToken: lifetimeCancellationTokenSource.Token);
+                }, model: Model.EnglishTurboV2, cancellationToken: destroyCancellationToken);
                 audioSource.clip = voiceClip.AudioClip;
-                Debug.Log($"Full clip: {voiceClip.Id}");
+                await new WaitUntil(() => streamClipQueue.Count == 0 && !audioSource.isPlaying);
+                streamQueueCts.Cancel();
+
+                if (debug)
+                {
+                    Debug.Log($"Full clip: {voiceClip.Id}");
+                }
             }
             catch (Exception e)
             {
                 Debug.LogError(e);
             }
         }
 
-        private void Update()
+        private async void PlayStreamQueue(CancellationToken cancellationToken)
         {
-            if (!audioSource.isPlaying &&
-                streamClipQueue.TryDequeue(out var clip))
+            try
             {
-                Debug.Log($"Playing {clip.name}");
-                audioSource.PlayOneShot(clip);
-            }
-        }
+                await new WaitUntil(() => streamClipQueue.Count > 0);
+                var endOfFrame = new WaitForEndOfFrame();
 
-        private void OnDestroy()
-        {
-            lifetimeCancellationTokenSource?.Cancel();
-            lifetimeCancellationTokenSource?.Dispose();
-            lifetimeCancellationTokenSource = null;
+                do
+                {
+                    if (!audioSource.isPlaying &&
+                        streamClipQueue.TryDequeue(out var clip))
+                    {
+                        Debug.Log($"playing partial clip: {clip.name}");
+                        audioSource.PlayOneShot(clip);
+                    }
+
+                    await endOfFrame;
+                } while (!cancellationToken.IsCancellationRequested);
+            }
+            catch (Exception e)
+            {
+                Debug.LogError(e);
+            }
         }
     }
 }