In [4]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "wira-wicara-d40ed01294ac.json"

In [5]:
from google.cloud import texttospeech
import os

# Set up the client
client = texttospeech.TextToSpeechClient()

# Sentences to be converted to speech
sentences = [
    "ibu berbelanja ke pasar malam",
    "ular melingkar di pagar bundar bundar",
    "mobil balap itu cepat sekali"
]

# Voice parameters
voices = [
    {"language_code": "id-ID", "name": "id-ID-Standard-A", "ssml_gender": texttospeech.SsmlVoiceGender.FEMALE},
    {"language_code": "id-ID", "name": "id-ID-Standard-B", "ssml_gender": texttospeech.SsmlVoiceGender.MALE},
    {"language_code": "id-ID", "name": "id-ID-Standard-C", "ssml_gender": texttospeech.SsmlVoiceGender.MALE},
    {"language_code": "id-ID", "name": "id-ID-Standard-D", "ssml_gender": texttospeech.SsmlVoiceGender.FEMALE}
]

# Create directory to store audio files
base_output_dir = "tts_output"
os.makedirs(base_output_dir, exist_ok=True)

# Function to generate audio files
def generate_audio(sentence, voice, pitch, rate, index):
    text_input = texttospeech.SynthesisInput(text=sentence)
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        pitch=pitch,
        speaking_rate=rate
    )

    response = client.synthesize_speech(
        input=text_input,
        voice=voice,
        audio_config=audio_config
    )

    # Create a directory for each sentence
    sentence_dir = os.path.join(base_output_dir, sentence.replace(" ", "_"))
    os.makedirs(sentence_dir, exist_ok=True)

    # Save the audio file in the respective sentence directory
    filename = f"{sentence_dir}/{voice['name']}_pitch{pitch}_rate{rate}_{index}.wav"
    with open(filename, "wb") as out:
        out.write(response.audio_content)
    print(f"Generated: {filename}")

# Generate X files for each sentence with different pitch and speaking rate values
for sentence in sentences:
    for voice in voices:
        for i in range(75):
            pitch = -7 + (i * 0.1)
            rate = 0.70 + (i * 0.02)
            generate_audio(sentence, voice, pitch, rate, i)

print("Dataset generation complete.")

Generated: tts_output\ibu_berbelanja_ke_pasar_malam/id-ID-Standard-A_pitch-7.0_rate0.7_0.wav
Generated: tts_output\ibu_berbelanja_ke_pasar_malam/id-ID-Standard-A_pitch-6.9_rate0.72_1.wav
Generated: tts_output\ibu_berbelanja_ke_pasar_malam/id-ID-Standard-A_pitch-6.8_rate0.74_2.wav
Generated: tts_output\ibu_berbelanja_ke_pasar_malam/id-ID-Standard-A_pitch-6.7_rate0.76_3.wav
Generated: tts_output\ibu_berbelanja_ke_pasar_malam/id-ID-Standard-A_pitch-6.6_rate0.7799999999999999_4.wav
Generated: tts_output\ibu_berbelanja_ke_pasar_malam/id-ID-Standard-A_pitch-6.5_rate0.7999999999999999_5.wav
Generated: tts_output\ibu_berbelanja_ke_pasar_malam/id-ID-Standard-A_pitch-6.4_rate0.82_6.wav
Generated: tts_output\ibu_berbelanja_ke_pasar_malam/id-ID-Standard-A_pitch-6.3_rate0.84_7.wav
Generated: tts_output\ibu_berbelanja_ke_pasar_malam/id-ID-Standard-A_pitch-6.2_rate0.86_8.wav
Generated: tts_output\ibu_berbelanja_ke_pasar_malam/id-ID-Standard-A_pitch-6.1_rate0.8799999999999999_9.wav
Generated: tts_outp

In [6]:
from pydub import AudioSegment
from pydub.generators import WhiteNoise
import os
import random

In [7]:
def normalize_audio(audio, target_dBFS=-20.0, target_sample_rate=8000):
    change_in_dBFS = target_dBFS - audio.dBFS
    return audio.apply_gain(change_in_dBFS).set_frame_rate(target_sample_rate)

def add_white_noise(audio, noise_level_dB=-30):
    white_noise = WhiteNoise().to_audio_segment(duration=len(audio))
    white_noise = white_noise - abs(noise_level_dB)
    return audio.overlay(white_noise)

In [8]:
# Define input and output directories
base_output_dir = "tts_output"
normalized_output_dir = "normalized"
fused_output_dir = "fused"

os.makedirs(normalized_output_dir, exist_ok=True)
os.makedirs(fused_output_dir, exist_ok=True)

for sentence_folder in os.listdir(base_output_dir):
    sentence_folder_path = os.path.join(base_output_dir, sentence_folder)
    if os.path.isdir(sentence_folder_path):
        # Create corresponding folder in the fused output directory
        fused_sentence_folder_path = os.path.join(fused_output_dir, sentence_folder)
        os.makedirs(fused_sentence_folder_path, exist_ok=True)

        # Collect all audio files in the current sentence folder
        audio_files = [f for f in os.listdir(sentence_folder_path) if f.endswith(".wav")]

        # Normalize each audio file and save to 'normalized' folder
        for filename in audio_files:
            audio_path = os.path.join(sentence_folder_path, filename)
            original_audio = AudioSegment.from_file(audio_path)

            # Normalize audio
            normalized_audio = normalize_audio(original_audio)

            # Add white noise to normalized audio
            augmented_audio = add_white_noise(normalized_audio)

            # Save original audio to 'fused' folder
            fused_original_output_path = os.path.join(fused_sentence_folder_path, filename)
            original_audio.export(fused_original_output_path, format="wav")

            # Save normalized audio to 'fused' folder with '_norm' suffix
            normalized_filename = filename.replace(".wav", "_norm.wav")
            fused_normalized_output_path = os.path.join(fused_sentence_folder_path, normalized_filename)
            normalized_audio.export(fused_normalized_output_path, format="wav")

            # Save augmented audio to 'fused' folder with '_aug' suffix
            augmented_filename = filename.replace(".wav", "_aug.wav")
            fused_augmented_output_path = os.path.join(fused_sentence_folder_path, augmented_filename)
            augmented_audio.export(fused_augmented_output_path, format="wav")

print("Normalization, augmentation, and fusion complete.")

Normalization, augmentation, and fusion complete.
