In [30]:
import os

from dotenv import load_dotenv
from elevenlabs import ElevenLabs, play

# Read the API key from the environment (or a local .env file) instead of
# embedding it in the notebook, so the secret is never saved with the file.
load_dotenv()
elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
if not elevenlabs_api_key:
    raise ValueError("ELEVENLABS_API_KEY not set. Please set it in your environment or .env file.")

client = ElevenLabs(api_key=elevenlabs_api_key)

# Smoke test: synthesize one short phrase with a fixed voice and play it back.
audio = client.text_to_speech.convert(
    text="Dance my friends, dance!",
    voice_id='iCrDUkL56s3C8sCRl7wb',
    model_id="eleven_multilingual_v2",
    output_format="mp3_44100_128",
)

play(audio)


In [33]:
import requests
import os
from dotenv import load_dotenv

# Load environment variables from a .env file if available
load_dotenv()

# Retrieve your ElevenLabs API key from the environment
api_key = os.getenv("ELEVENLABS_API_KEY")
if not api_key:
    raise ValueError("ELEVENLABS_API_KEY not set. Please set it in your environment or .env file.")

# Define the endpoint URL
url = "https://api.elevenlabs.io/v1/voices"

# Set up the request headers with your API key
headers = {
    "xi-api-key": api_key
}

# Make the GET request to the endpoint; the timeout keeps the cell from
# hanging indefinitely if the service does not answer.
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on an unsuccessful response. Merely printing would leave
# `voice_ids` undefined (or stale from a previous run), and the generation
# cell further down would then break or silently use the wrong IDs.
if response.status_code != 200:
    raise RuntimeError(f"Error: {response.status_code}\n{response.text}")

# Parse the result and collect the ID of every voice that was returned
data = response.json()
voices = data.get("voices", [])
voice_ids = [voice.get("voice_id") for voice in voices]


In [40]:
import requests
import os
from dotenv import load_dotenv

def text_to_speech(voice_id, text, output_filename, timeout=60):
    """
    Convert text to speech using ElevenLabs API and save to file

    Args:
        voice_id (str): ElevenLabs voice ID to use
        text (str): Text to convert to speech
        output_filename (str): Filename to save the audio to
        timeout (float): Seconds to wait for the server before giving up
            (passed straight to ``requests.post``)

    Raises:
        ValueError: If ELEVENLABS_API_KEY is not set.
        RuntimeError: If the API answers with a status other than 200. No
            file is written in that case, so callers never read a missing
            or stale audio file by accident.
    """
    load_dotenv()

    # Get API key from environment
    api_key = os.getenv("ELEVENLABS_API_KEY")
    if not api_key:
        raise ValueError("ELEVENLABS_API_KEY not set.")

    # API endpoint
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"

    # Request payload
    payload = {
        "text": text,
        "model_id": "eleven_monolingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.75
        }
    }

    # Headers
    # NOTE(review): the Accept header asks for WAV, but nothing here verifies
    # that the service honours it -- confirm the saved bytes really are WAV,
    # since callers store them under a .wav name.
    headers = {
        "xi-api-key": api_key,
        "Content-Type": "application/json",
        "Accept": "audio/wav"
    }

    # Make the POST request. The context manager releases the streamed
    # connection even when an error is raised part-way through.
    with requests.post(
        url, headers=headers, json=payload, stream=True, timeout=timeout
    ) as response:
        if response.status_code != 200:
            raise RuntimeError(
                f"ElevenLabs text-to-speech failed with status "
                f"{response.status_code}: {response.text}"
            )

        # Stream the body to disk in chunks instead of holding it in memory.
        with open(output_filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)


In [None]:
# Load all sentences
from tqdm import tqdm
import os
from datasets import Dataset
import soundfile as sf

# Create speech_data directory if it doesn't exist
os.makedirs('speech_data', exist_ok=True)

# Read one sentence per line; the explicit encoding avoids platform-dependent
# decoding of non-ASCII text.
with open('sentences_clean.txt', 'r', encoding='utf-8') as f:
    sentences = [line.strip() for line in f]

# Every voice indexes one sentence from the front and one from the back of the
# list. Check up front that the list is long enough, so the run cannot die
# with an IndexError after API calls have already been made.
if len(sentences) < len(voice_ids):
    raise ValueError(
        f"Need at least {len(voice_ids)} sentences for {len(voice_ids)} voices, "
        f"but sentences_clean.txt only has {len(sentences)}."
    )

# Initialize lists to store data
voice_ids_list = []
long_sentences = []
long_sentence_audios = []
short_sentences = []
short_sentence_audios = []

# For each voice, generate speech for first+last or second+second-last sentences etc
for i, voice_id in tqdm(enumerate(voice_ids), total=len(voice_ids), desc="Generating speech"):
    # Voice i reads the i-th sentence from the start and the i-th from the end
    long_sentence = sentences[i]
    short_sentence = sentences[-(i+1)]

    # Generate speech for both sentences
    long_path = os.path.join('speech_data', f"voice{i}_long.wav")
    short_path = os.path.join('speech_data', f"voice{i}_short.wav")

    text_to_speech(voice_id, long_sentence, long_path)
    text_to_speech(voice_id, short_sentence, short_path)

    # Read the audio back as sample arrays (the sample rate is discarded here)
    long_audio, _ = sf.read(long_path)
    short_audio, _ = sf.read(short_path)

    # Store data; the 'voice_id' column holds a 1-based running number,
    # not the provider's voice ID string.
    voice_ids_list.append(i+1)
    long_sentences.append(long_sentence)
    long_sentence_audios.append(long_audio)
    short_sentences.append(short_sentence)
    short_sentence_audios.append(short_audio)

# Create dataset
dataset_dict = {
    'voice_id': voice_ids_list,
    'long_sentence': long_sentences,
    'long_sentence_audio': long_sentence_audios,
    'short_sentence': short_sentences, 
    'short_sentence_audio': short_sentence_audios
}

# Convert to HuggingFace Dataset and push to hub
dataset = Dataset.from_dict(dataset_dict)
dataset.push_to_hub("UjjD/zero_shot")


[1]


In [47]:
# Imported here so the cell also works on a fresh kernel; no earlier cell
# brings `load_dataset` into scope.
from datasets import load_dataset

# Download dataset
dataset = load_dataset("UjjD/zero_shot")['train']

# Add the sample rate column only when it is missing, so re-running this cell
# does not try to add the same column a second time.
# NOTE(review): 44100 is assumed rather than read from the audio files --
# confirm it matches what the generation step actually produced.
if "sr" not in dataset.column_names:
    dataset = dataset.add_column("sr", [44100] * len(dataset))

    # Push updated dataset back to hub
    dataset.push_to_hub("UjjD/zero_shot")


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.26s/ba]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:01<00:00,  1.10s/ba]
Uploading the dataset shards: 100%|██████████| 2/2 [00:17<00:00,  8.61s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/UjjD/zero_shot/commit/480b9db290565b0a8b40617925354fb852861a3b', commit_message='Upload dataset', commit_description='', oid='480b9db290565b0a8b40617925354fb852861a3b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/UjjD/zero_shot', endpoint='https://huggingface.co', repo_type='dataset', repo_id='UjjD/zero_shot'), pr_revision=None, pr_num=None)

In [48]:
# Imported here so the cell does not depend on names leaking from other cells
# (`load_dataset` is otherwise only imported further down the notebook).
import soundfile as sf
from datasets import load_dataset

# Download dataset
dataset = load_dataset("UjjD/zero_shot")['train']

# Get first row
first_row = dataset[0]

# Display the text
print("Long sentence:", first_row['long_sentence'])
print("Short sentence:", first_row['short_sentence'])

# Save the audio files, using the sample rate stored alongside each row
long_audio_path = "first_long.wav"
short_audio_path = "first_short.wav"

sf.write(long_audio_path, first_row['long_sentence_audio'], first_row['sr'])
sf.write(short_audio_path, first_row['short_sentence_audio'], first_row['sr'])

print(f"\nAudio files saved to {long_audio_path} and {short_audio_path}")


Generating train split: 100%|██████████| 50/50 [00:01<00:00, 44.38 examples/s]


Long sentence: Attending to the heterogeneity of experience I suggest works against the biases of both anthropocentric thinking and conscious perception especially vision which tend to exaggerate the distance between inside and outside such theories proposing that regions of our own bodiesparts of usare more deeply immersed in and immediately involved with the outside world in which we participate may enable us to become more receptive to our entanglements with unloved others and perhaps more inclined to intervene on their behalf.
Short sentence: Interestingly Massumi turns to Nietzsche to promote a concept of a subjectivity without a subject a doing without a doer.

Audio files saved to first_long.wav and first_short.wav


In [6]:
import os
import subprocess
from cartesia import Cartesia
from dotenv import load_dotenv

def text_to_speech_cartesia(voice_id, text, output_filename):
    """
    Synthesize text with a Cartesia voice and write the result to a file

    Args:
        voice_id: Cartesia voice ID to use
        text: Transcript to synthesize
        output_filename: Path the returned audio data is written to

    Raises:
        ValueError: If CARTESIA_API_KEY is not present in the environment
            (checked after loading a .env file).
    """
    load_dotenv()

    cartesia_key = os.environ.get("CARTESIA_API_KEY")
    if cartesia_key is None:
        raise ValueError("CARTESIA_API_KEY is not set")

    # Ask for a WAV container holding 32-bit float PCM at 44.1 kHz.
    wav_format = {
        "container": "wav",
        "encoding": "pcm_f32le",
        "sample_rate": 44100,
    }

    tts_client = Cartesia(api_key=cartesia_key)
    audio_data = tts_client.tts.bytes(
        model_id="sonic",
        transcript=text,
        voice_id=voice_id,
        output_format=wav_format,
    )

    # Write whatever the client returned to disk in one go.
    with open(output_filename, "wb") as out_file:
        out_file.write(audio_data)

In [7]:
# Single voice used to try out the Cartesia helper
example_voice_id = 'a3afd376-04f9-48e2-a966-132cdfdbc093'

# Only the first line of sentences_clean.txt is needed for this check
with open('sentences_clean.txt', 'r') as sentence_file:
    first_sentence = sentence_file.readline().strip()

# Synthesize that sentence with the example voice and store it as a WAV file
text_to_speech_cartesia(example_voice_id, first_sentence, 'bla_bla.wav')


In [9]:
from cartesia import Cartesia
import os

# Client authenticated with the key taken from the environment
client = Cartesia(
    api_key=os.environ.get("CARTESIA_API_KEY"),
)

# Ask the service for every available voice
voices = client.voices.list()

# Keep just the "id" field of each voice entry
voice_ids = [entry["id"] for entry in voices]


In [None]:
# Load all sentences
from tqdm import tqdm
import os
from datasets import Dataset, load_dataset, concatenate_datasets
import soundfile as sf

# Create speech_data directory if it doesn't exist
os.makedirs('speech_data', exist_ok=True)

# Read one sentence per line; the explicit encoding avoids platform-dependent
# decoding of non-ASCII text.
with open('sentences_clean.txt', 'r', encoding='utf-8') as f:
    sentences = [line.strip() for line in f]

# This pass starts at the 51st sentence / voice number 51, i.e. it skips the
# first 50 entries at each end of the sentence list.
index_offset = 50

# Both sentences[index_offset + i] and sentences[-(index_offset + 1 + i)] must
# exist for every voice. Check up front so the run cannot fail with an
# IndexError after API calls have already been made.
required_sentences = index_offset + len(voice_ids)
if len(sentences) < required_sentences:
    raise ValueError(
        f"Need at least {required_sentences} sentences for {len(voice_ids)} voices "
        f"starting at offset {index_offset}, but sentences_clean.txt only has "
        f"{len(sentences)}."
    )

# Initialize lists to store data
voice_ids_list = []
long_sentences = []
long_sentence_audios = []
short_sentences = []
short_sentence_audios = []
sample_rates = []

# Load existing dataset
existing_dataset = load_dataset("UjjD/zero_shot")["train"]

# Refuse to append rows whose voice numbers are already in the hub dataset;
# otherwise re-running this cell would silently push duplicates.
new_voice_numbers = range(index_offset + 1, index_offset + len(voice_ids) + 1)
already_present = set(existing_dataset["voice_id"]).intersection(new_voice_numbers)
if already_present:
    raise RuntimeError(
        f"Dataset already contains voice_id values {sorted(already_present)}; "
        "refusing to append duplicate rows."
    )

# For each voice, generate speech for 51st+51st-last or 52nd+52nd-last sentences etc
for i, voice_id in tqdm(enumerate(voice_ids), total=len(voice_ids), desc="Generating speech"):
    # Get the appropriate sentences for this voice (starting from 51st sentence)
    long_sentence = sentences[index_offset + i]
    short_sentence = sentences[-(index_offset + 1 + i)]

    # Generate speech for both sentences
    long_path = os.path.join('speech_data', f"voice{index_offset+i}_long.wav")
    short_path = os.path.join('speech_data', f"voice{index_offset+i}_short.wav")

    text_to_speech_cartesia(voice_id, long_sentence, long_path)
    text_to_speech_cartesia(voice_id, short_sentence, short_path)

    # Read audio files
    long_audio, sr_long = sf.read(long_path)
    short_audio, sr_short = sf.read(short_path)

    # Only one rate is stored per row, so both clips must actually share it.
    if sr_long != sr_short:
        raise ValueError(
            f"Sample rate mismatch for voice {index_offset + i + 1}: "
            f"{sr_long} (long) vs {sr_short} (short)"
        )

    # Store data
    voice_ids_list.append(index_offset + i + 1)  # Start voice IDs from 51
    long_sentences.append(long_sentence)
    long_sentence_audios.append(long_audio)
    short_sentences.append(short_sentence)
    short_sentence_audios.append(short_audio)
    sample_rates.append(sr_long)

# Create dataset
new_dataset_dict = {
    'voice_id': voice_ids_list,
    'long_sentence': long_sentences,
    'long_sentence_audio': long_sentence_audios,
    'short_sentence': short_sentences, 
    'short_sentence_audio': short_sentence_audios,
    'sr': sample_rates
}

# Convert to HuggingFace Dataset and concatenate with existing
new_dataset = Dataset.from_dict(new_dataset_dict)
combined_dataset = concatenate_datasets([existing_dataset, new_dataset])
combined_dataset.push_to_hub("UjjD/zero_shot")