In [1]:
import os
import io
import random
from google.cloud import texttospeech

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "wira-wicara-d40ed01294ac.json"

In [2]:
# Set up the client
client = texttospeech.TextToSpeechClient()

# Words to be converted to speech with slight variations
words = ["Mobil", "Kemarin", "Lemari"]
word_variations = {
    "Mobil": ["Mobil", "mobil.", "Mobil!", "Mobiil", "Mobi'l"],
    "Kemarin": ["Kemarin", "kemarin.", "Kemarin!", "Kemar'in", "Kemaarin"],
    "Lemari": ["Lemari", "lemari.", "Lemari!", "Lem'ari", "Leemari"]
}

# Voice parameters
voices = [
    {"language_code": "id-ID", "name": "id-ID-Standard-A", "ssml_gender": texttospeech.SsmlVoiceGender.FEMALE},
    {"language_code": "id-ID", "name": "id-ID-Standard-B", "ssml_gender": texttospeech.SsmlVoiceGender.MALE},
    {"language_code": "id-ID", "name": "id-ID-Standard-C", "ssml_gender": texttospeech.SsmlVoiceGender.MALE},
    {"language_code": "id-ID", "name": "id-ID-Standard-D", "ssml_gender": texttospeech.SsmlVoiceGender.FEMALE}
]

# Create directory to store audio files
base_output_dir = "tts_output_words"
os.makedirs(base_output_dir, exist_ok=True)

# Function to generate audio files
def generate_audio(word, voice, pitch, rate, index, augmentation):
    text_input = texttospeech.SynthesisInput(text=word)
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        pitch=pitch,
        speaking_rate=rate
    )

    response = client.synthesize_speech(
        input=text_input,
        voice=voice,
        audio_config=audio_config
    )

    # Create a directory for each word
    word_dir = os.path.join(base_output_dir, word.replace("'", ""))
    os.makedirs(word_dir, exist_ok=True)

    # Save the audio file in the respective word directory
    filename = f"{word_dir}/{voice['name']}_pitch{pitch}_rate{rate}_aug{augmentation}_{index}.wav"
    with open(filename, "wb") as out:
        out.write(response.audio_content)
    print(f"Generated: {filename}")

# Generate 200 files for each word (100 original + 100 augmented)
for word in words:
    for voice in voices:
        for i in range(50):
            # Select a random variation of the word
            word_variant = random.choice(word_variations[word])
            
            # Original pitch and rate
            generate_audio(word_variant, voice, 0.0, 1.0, i, 0)
            
            # Augmented pitch and rate variations
            pitch = -2.0 + (i * 0.08)
            rate = 0.8 + (i * 0.008)
            generate_audio(word_variant, voice, pitch, rate, i, 1)

print("Dataset generation complete.")


Generated: tts_output_words\Mobil!/id-ID-Standard-A_pitch0.0_rate1.0_aug0_0.wav
Generated: tts_output_words\Mobil!/id-ID-Standard-A_pitch-2.0_rate0.8_aug1_0.wav
Generated: tts_output_words\Mobil!/id-ID-Standard-A_pitch0.0_rate1.0_aug0_1.wav
Generated: tts_output_words\Mobil!/id-ID-Standard-A_pitch-1.92_rate0.808_aug1_1.wav
Generated: tts_output_words\Mobil!/id-ID-Standard-A_pitch0.0_rate1.0_aug0_2.wav
Generated: tts_output_words\Mobil!/id-ID-Standard-A_pitch-1.84_rate0.8160000000000001_aug1_2.wav
Generated: tts_output_words\Mobiil/id-ID-Standard-A_pitch0.0_rate1.0_aug0_3.wav
Generated: tts_output_words\Mobiil/id-ID-Standard-A_pitch-1.76_rate0.8240000000000001_aug1_3.wav
Generated: tts_output_words\Mobil/id-ID-Standard-A_pitch0.0_rate1.0_aug0_4.wav
Generated: tts_output_words\Mobil/id-ID-Standard-A_pitch-1.68_rate0.8320000000000001_aug1_4.wav
Generated: tts_output_words\Mobil/id-ID-Standard-A_pitch0.0_rate1.0_aug0_5.wav
Generated: tts_output_words\Mobil/id-ID-Standard-A_pitch-1.6_rate0.