Skip to content

Commit

Permalink
com.rest.elevenlabs 3.2.4 (#60)
Browse files Browse the repository at this point in the history
- fixed streaming text to speech endpoint
  - voice clip returning empty
  - application freeze when encoding pcm to ogg
- updated TextToSpeechDemo
- added client.ctor overload to pass in configuration
- updated com.utilities.rest -> 2.5.1
- updated com.utilities.encoder.ogg -> 3.1.3
  • Loading branch information
StephenHodgson committed Jan 29, 2024
1 parent b4b35eb commit 53fd541
Show file tree
Hide file tree
Showing 9 changed files with 103 additions and 75 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/upm-subtree-split.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ jobs:
with:
fetch-depth: 0

- uses: RageAgainstThePixel/upm-subtree-split@v1
- uses: RageAgainstThePixel/upm-subtree-split@v1.1
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,19 @@ private class VoiceGenerationArgs : ScriptableObject

public string voiceName;

public List<AudioClip> voiceSamples = new List<AudioClip>();
public List<AudioClip> voiceSamples = new();

public readonly Dictionary<string, string> labels = new Dictionary<string, string>();
public readonly Dictionary<string, string> labels = new();

public GeneratedVoicePreviewRequest CreateRequest()
=> new GeneratedVoicePreviewRequest(
=> new(
speechSynthesisTextInput,
generatedVoiceOptions.Genders[genderSelection],
generatedVoiceOptions.Accents[accentSelection],
generatedVoiceOptions.Ages[ageSelection]);
}

private readonly Vector2 windowSize = new Vector2(WideColumnWidth * 4, WideColumnWidth * 3);
private readonly Vector2 windowSize = new(WideColumnWidth * 4, WideColumnWidth * 3);

private static readonly string[] popupTabTitles = { "Voice Designer", "Voice Cloning" };

Expand Down Expand Up @@ -337,39 +337,39 @@ private async void AddVoice()
private const string LabelControlField = "LabelControlField";


private static readonly GUIContent saveDirectoryContent = new GUIContent("Save Directory");
private static readonly GUIContent saveDirectoryContent = new("Save Directory");

private static readonly GUIContent guiTitleContent = new GUIContent($"{nameof(ElevenLabs)} Dashboard");
private static readonly GUIContent guiTitleContent = new($"{nameof(ElevenLabs)} Dashboard");

private static readonly GUIContent voiceContent = new GUIContent("Voice");
private static readonly GUIContent voiceContent = new("Voice");

private static readonly GUIContent modelContent = new GUIContent("Model");
private static readonly GUIContent modelContent = new("Model");

private static readonly GUIContent stabilityContent = new GUIContent("Stability");
private static readonly GUIContent stabilityContent = new("Stability");

private static readonly GUIContent moreVariableContent = new GUIContent("More Variable", "Increasing variability can make speech more expressive with output varying between re-generations. It can also lead to instabilities.");
private static readonly GUIContent moreVariableContent = new("More Variable", "Increasing variability can make speech more expressive with output varying between re-generations. It can also lead to instabilities.");

private static readonly GUIContent moreStableContent = new GUIContent("More Stable", "Increasing stability will make the voice more consistent between re-generations, but it can also make it sounds a bit monotone. On longer text fragments we recommend lowering this value.");
private static readonly GUIContent moreStableContent = new("More Stable", "Increasing stability will make the voice more consistent between re-generations, but it can also make it sounds a bit monotone. On longer text fragments we recommend lowering this value.");

private static readonly GUIContent clarityContent = new GUIContent("Clarity + Similarity Enhancement");
private static readonly GUIContent clarityContent = new("Clarity + Similarity Enhancement");

private static readonly GUIContent lowClarityContent = new GUIContent("Low", "Low values are recommended if background artifacts are present in generated speech.");
private static readonly GUIContent lowClarityContent = new("Low", "Low values are recommended if background artifacts are present in generated speech.");

private static readonly GUIContent highClarityContent = new GUIContent("High", "Recommended. High enhancement boosts overall voice clarity and target speaker similarity. Very high values can cause artifacts, so adjusting this setting to find the optimal value is encouraged.");
private static readonly GUIContent highClarityContent = new("High", "Recommended. High enhancement boosts overall voice clarity and target speaker similarity. Very high values can cause artifacts, so adjusting this setting to find the optimal value is encouraged.");

private static readonly GUIContent addNewSampleContent = new GUIContent("Add new Sample(s)");
private static readonly GUIContent addNewSampleContent = new("Add new Sample(s)");

private static readonly GUIContent downloadContent = new GUIContent("Download");
private static readonly GUIContent downloadContent = new("Download");

private static readonly GUIContent deleteContent = new GUIContent("Delete");
private static readonly GUIContent deleteContent = new("Delete");

private static readonly GUIContent refreshContent = new GUIContent("Refresh");
private static readonly GUIContent refreshContent = new("Refresh");

private static readonly GUIContent downloadingContent = new GUIContent("Download in progress...");
private static readonly GUIContent downloadingContent = new("Download in progress...");

private static readonly GUIContent keyContent = new GUIContent("Key");
private static readonly GUIContent keyContent = new("Key");

private static readonly GUIContent valueContent = new GUIContent("Value");
private static readonly GUIContent valueContent = new("Value");

private static readonly string[] tabTitles = { "Speech Synthesis", "Voice Lab", "History" };

Expand Down Expand Up @@ -463,9 +463,9 @@ private static GUIStyle BoldCenteredHeaderStyle

private static GeneratedVoiceOptions generatedVoiceOptions;

private static readonly ConcurrentDictionary<string, Dictionary<string, string>> voiceLabels = new ConcurrentDictionary<string, Dictionary<string, string>>();
private static readonly ConcurrentDictionary<string, Dictionary<string, string>> voiceLabels = new();

private static readonly ConcurrentDictionary<string, IEnumerable<AudioClip>> voiceSampleCache = new ConcurrentDictionary<string, IEnumerable<AudioClip>>();
private static readonly ConcurrentDictionary<string, IEnumerable<AudioClip>> voiceSampleCache = new();

private static bool hasFetchedHistory;

Expand All @@ -477,11 +477,11 @@ private static GUIStyle BoldCenteredHeaderStyle

private static bool[] historySelections;

private static readonly Stack<string> pageHistoryIds = new Stack<string>();
private static readonly Stack<string> pageHistoryIds = new();

private static readonly ConcurrentDictionary<string, GUIContent> historyItemLabelCache = new ConcurrentDictionary<string, GUIContent>();
private static readonly ConcurrentDictionary<string, GUIContent> historyItemLabelCache = new();

private static readonly ConcurrentDictionary<string, AudioClip> downloadedAudioClips = new ConcurrentDictionary<string, AudioClip>();
private static readonly ConcurrentDictionary<string, AudioClip> downloadedAudioClips = new();

private static GUIContent audioPlayButtonContent;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ public sealed class ElevenLabsAuthentication : AbstractAuthentication<ElevenLabs
/// Allows implicit casting from a string, so that a simple string API key can be provided in place of an instance of Authentication.
/// </summary>
/// <param name="apiKey">The API key.</param>
public static implicit operator ElevenLabsAuthentication(string apiKey) => new ElevenLabsAuthentication(apiKey);
public static implicit operator ElevenLabsAuthentication(string apiKey) => new(apiKey);

/// <summary>
/// Instantiates an empty Authentication object.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@ namespace ElevenLabs
{
public sealed class ElevenLabsClient : BaseClient<ElevenLabsAuthentication, ElevenLabsSettings>
{
/// <summary>
/// Creates a new client for the ElevenLabs API from a single <see cref="ElevenLabsConfiguration"/>.
/// </summary>
/// <param name="configuration">
/// Configuration used to construct both the <see cref="ElevenLabsAuthentication"/> and the
/// <see cref="ElevenLabsSettings"/> passed to the chained constructor.
/// When <c>null</c>, <c>null</c> is forwarded for both arguments instead.
/// NOTE(review): the chained constructor presumably resolves default authentication/settings
/// when given <c>null</c> - confirm against its implementation.
/// </param>
public ElevenLabsClient(ElevenLabsConfiguration configuration)
: this(
configuration != null ? new ElevenLabsAuthentication(configuration) : null,
configuration != null ? new ElevenLabsSettings(configuration) : null)
{
}

/// <summary>
/// Creates a new client for the ElevenLabs API, handling auth and allowing for access to various API endpoints.
/// </summary>
Expand Down Expand Up @@ -65,7 +73,7 @@ protected override void ValidateAuthentication()
/// <summary>
/// The <see cref="JsonSerializationOptions"/> to use when making calls to the API.
/// </summary>
internal static JsonSerializerSettings JsonSerializationOptions { get; } = new JsonSerializerSettings
internal static JsonSerializerSettings JsonSerializationOptions { get; } = new()
{
DefaultValueHandling = DefaultValueHandling.Ignore
};
Expand Down
10 changes: 5 additions & 5 deletions ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Models/Model.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,35 +88,35 @@ public Model(string id)
/// </summary>
[Preserve]
[JsonIgnore]
public static Model EnglishV1 { get; } = new Model("eleven_monolingual_v1");
public static Model EnglishV1 { get; } = new("eleven_monolingual_v1");

/// <summary>
/// Speech to speech model suitable for scenarios where you need maximum control over the content and prosody of your generations.
/// </summary>
[Preserve]
[JsonIgnore]
public static Model EnglishV2 { get; } = new Model("eleven_english_sts_v2");
public static Model EnglishV2 { get; } = new("eleven_english_sts_v2");

/// <summary>
/// Cutting-edge turbo model is ideally suited for tasks demanding extremely low latency.
/// </summary>
[Preserve]
[JsonIgnore]
public static Model EnglishTurboV2 { get; } = new Model("eleven_turbo_v2");
public static Model EnglishTurboV2 { get; } = new("eleven_turbo_v2");

/// <summary>
/// Generate lifelike speech in multiple languages and create content that resonates with a broader audience.
/// </summary>
[Preserve]
[JsonIgnore]
public static Model MultiLingualV1 { get; } = new Model("eleven_multilingual_v1");
public static Model MultiLingualV1 { get; } = new("eleven_multilingual_v1");

/// <summary>
/// State of the art multilingual speech synthesis model, able to generate life-like speech in 29 languages.
/// </summary>
[Preserve]
[JsonIgnore]
public static Model MultiLingualV2 { get; } = new Model("eleven_multilingual_v2");
public static Model MultiLingualV2 { get; } = new("eleven_multilingual_v2");

#endregion Predefined Models
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,14 +221,12 @@ public async Task<VoiceClip> StreamTextToSpeechAsync(TextToSpeechRequest request
throw new ArgumentException("Failed to parse clip id!");
}

var pcmData = PCMEncoder.Decode(response.Data, PCMFormatSize.SixteenBit);
var fullClip = AudioClip.Create(clipId, pcmData.Length, 1, frequency, false);
var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, frequency, 1, cancellationToken: cancellationToken).ConfigureAwait(false);
var pcmData = PCMEncoder.Decode(response.Data);
var downloadDirectory = await GetCacheDirectoryAsync(request.Voice);
var cachedPath = $"{downloadDirectory}/{clipId}.ogg";
await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken: cancellationToken).ConfigureAwait(false);
await Awaiters.UnityMainThread;
await Rest.DownloadAudioClipAsync($"file://{cachedPath}", AudioType.OGGVORBIS, parameters: new RestParameters(debug: EnableDebug), cancellationToken: cancellationToken);
var oggBytes = await OggEncoder.ConvertToBytesAsync(pcmData, frequency, 1, cancellationToken: cancellationToken);
await File.WriteAllBytesAsync(cachedPath, oggBytes, cancellationToken: cancellationToken);
var fullClip = await Rest.DownloadAudioClipAsync($"file://{cachedPath}", AudioType.OGGVORBIS, parameters: new RestParameters(debug: EnableDebug), compressed: false, streamingAudio: true, cancellationToken: cancellationToken);
return new VoiceClip(clipId, request.Text, request.Voice, fullClip, cachedPath);

void StreamCallback(Response partialResponse)
Expand All @@ -240,7 +238,7 @@ void StreamCallback(Response partialResponse)
throw new ArgumentException("Failed to parse clip id!");
}

var chunk = PCMEncoder.Decode(partialResponse.Data, PCMFormatSize.SixteenBit);
var chunk = PCMEncoder.Decode(partialResponse.Data);
var audioClip = AudioClip.Create($"{clipId}_{++part}", chunk.Length, 1, frequency, false);

if (!audioClip.SetData(chunk, 0))
Expand Down
18 changes: 9 additions & 9 deletions ElevenLabs/Packages/com.rest.elevenlabs/Runtime/Voices/Voice.cs
Original file line number Diff line number Diff line change
Expand Up @@ -103,39 +103,39 @@ public string Id

[Preserve]
[JsonIgnore]
public static Voice Adam { get; } = new Voice("pNInz6obpgDQGcFmaJgB", nameof(Adam));
public static Voice Adam { get; } = new("pNInz6obpgDQGcFmaJgB", nameof(Adam));

[Preserve]
[JsonIgnore]
public static Voice Antoni { get; } = new Voice("ErXwobaYiN019PkySvjV", nameof(Antoni));
public static Voice Antoni { get; } = new("ErXwobaYiN019PkySvjV", nameof(Antoni));

[Preserve]
[JsonIgnore]
public static Voice Arnold { get; } = new Voice("VR6AewLTigWG4xSOukaG", nameof(Arnold));
public static Voice Arnold { get; } = new("VR6AewLTigWG4xSOukaG", nameof(Arnold));

[Preserve]
[JsonIgnore]
public static Voice Bella { get; } = new Voice("EXAVITQu4vr4xnSDxMaL", nameof(Bella));
public static Voice Bella { get; } = new("EXAVITQu4vr4xnSDxMaL", nameof(Bella));

[Preserve]
[JsonIgnore]
public static Voice Domi { get; } = new Voice("AZnzlk1XvdvUeBnXmlld", nameof(Domi));
public static Voice Domi { get; } = new("AZnzlk1XvdvUeBnXmlld", nameof(Domi));

[Preserve]
[JsonIgnore]
public static Voice Elli { get; } = new Voice("MF3mGyEYCl7XYWbV9V6O", nameof(Elli));
public static Voice Elli { get; } = new("MF3mGyEYCl7XYWbV9V6O", nameof(Elli));

[Preserve]
[JsonIgnore]
public static Voice Josh { get; } = new Voice("TxGEqnHWrfWFTfGW9XjX", nameof(Josh));
public static Voice Josh { get; } = new("TxGEqnHWrfWFTfGW9XjX", nameof(Josh));

[Preserve]
[JsonIgnore]
public static Voice Rachel { get; } = new Voice("21m00Tcm4TlvDq8ikWAM", nameof(Rachel));
public static Voice Rachel { get; } = new("21m00Tcm4TlvDq8ikWAM", nameof(Rachel));

[Preserve]
[JsonIgnore]
public static Voice Sam { get; } = new Voice("yoZ06aMxZJJ28mfd3POQ", nameof(Sam));
public static Voice Sam { get; } = new("yoZ06aMxZJJ28mfd3POQ", nameof(Sam));

#endregion Premade Voices

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,21 @@
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using UnityEngine;
using Utilities.Async;

namespace ElevenLabs.Demo
{
[RequireComponent(typeof(AudioSource))]
public class TextToSpeechDemo : MonoBehaviour
{
[SerializeField]
private ElevenLabsConfiguration configuration;

[SerializeField]
private bool debug = true;

[SerializeField]
private Voice voice;

Expand All @@ -23,9 +31,7 @@ public class TextToSpeechDemo : MonoBehaviour
[SerializeField]
private AudioSource audioSource;

private readonly Queue<AudioClip> streamClipQueue = new Queue<AudioClip>();

private CancellationTokenSource lifetimeCancellationTokenSource;
private readonly Queue<AudioClip> streamClipQueue = new();

private void OnValidate()
{
Expand All @@ -38,48 +44,64 @@ private void OnValidate()
private async void Start()
{
OnValidate();
lifetimeCancellationTokenSource = new CancellationTokenSource();

try
{
var api = new ElevenLabsClient();
var api = new ElevenLabsClient(configuration)
{
EnableDebug = debug
};

if (voice == null)
{
api.VoicesEndpoint.EnableDebug = true;
voice = (await api.VoicesEndpoint.GetAllVoicesAsync(lifetimeCancellationTokenSource.Token)).FirstOrDefault();
voice = (await api.VoicesEndpoint.GetAllVoicesAsync(destroyCancellationToken)).FirstOrDefault();
}

streamClipQueue.Clear();
api.TextToSpeechEndpoint.EnableDebug = true;
var streamQueueCts = CancellationTokenSource.CreateLinkedTokenSource(destroyCancellationToken);
PlayStreamQueue(streamQueueCts.Token);
var voiceClip = await api.TextToSpeechEndpoint.StreamTextToSpeechAsync(message, voice, partialClip =>
{
streamClipQueue.Enqueue(partialClip);
}, model: Model.EnglishTurboV2, cancellationToken: lifetimeCancellationTokenSource.Token);
}, model: Model.EnglishTurboV2, cancellationToken: destroyCancellationToken);
audioSource.clip = voiceClip.AudioClip;
Debug.Log($"Full clip: {voiceClip.Id}");
await new WaitUntil(() => streamClipQueue.Count == 0 && !audioSource.isPlaying);
streamQueueCts.Cancel();

if (debug)
{
Debug.Log($"Full clip: {voiceClip.Id}");
}
}
catch (Exception e)
{
Debug.LogError(e);
}
}

private void Update()
private async void PlayStreamQueue(CancellationToken cancellationToken)
{
if (!audioSource.isPlaying &&
streamClipQueue.TryDequeue(out var clip))
try
{
Debug.Log($"Playing {clip.name}");
audioSource.PlayOneShot(clip);
}
}
await new WaitUntil(() => streamClipQueue.Count > 0);
var endOfFrame = new WaitForEndOfFrame();

private void OnDestroy()
{
lifetimeCancellationTokenSource?.Cancel();
lifetimeCancellationTokenSource?.Dispose();
lifetimeCancellationTokenSource = null;
do
{
if (!audioSource.isPlaying &&
streamClipQueue.TryDequeue(out var clip))
{
Debug.Log($"playing partial clip: {clip.name}");
audioSource.PlayOneShot(clip);
}

await endOfFrame;
} while (!cancellationToken.IsCancellationRequested);
}
catch (Exception e)
{
Debug.LogError(e);
}
}
}
}

0 comments on commit 53fd541

Please sign in to comment.