In [1]:
!pip install google-cloud-texttospeech

Collecting google-cloud-texttospeech
  Downloading google_cloud_texttospeech-2.16.3-py2.py3-none-any.whl (151 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/152.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: google-cloud-texttospeech
Successfully installed google-cloud-texttospeech-2.16.3


In [2]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/drive/MyDrive/Bangkit/wira-wicara-d40ed01294ac.json"

In [4]:
from google.cloud import texttospeech
import os

# Set up the client
client = texttospeech.TextToSpeechClient()

# Sentences to be converted to speech
sentences = [
    "kucing menangis",
    "ibu melempar batu",
    "apel newton jatuh"
]

# Voice parameters
voices = [
    {"language_code": "id-ID", "name": "id-ID-Standard-A", "ssml_gender": texttospeech.SsmlVoiceGender.FEMALE},
    {"language_code": "id-ID", "name": "id-ID-Standard-B", "ssml_gender": texttospeech.SsmlVoiceGender.MALE},
    {"language_code": "id-ID", "name": "id-ID-Standard-C", "ssml_gender": texttospeech.SsmlVoiceGender.MALE},
    {"language_code": "id-ID", "name": "id-ID-Standard-D", "ssml_gender": texttospeech.SsmlVoiceGender.FEMALE}
]

# Create directory to store audio files
base_output_dir = "tts_output"
os.makedirs(base_output_dir, exist_ok=True)

# Function to generate audio files
def generate_audio(sentence, voice, pitch, rate, index):
    text_input = texttospeech.SynthesisInput(text=sentence)
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        sample_rate_hertz=8000,
        pitch=pitch,
        speaking_rate=rate
    )

    response = client.synthesize_speech(
        input=text_input,
        voice=voice,
        audio_config=audio_config
    )

    # Create a directory for each sentence
    sentence_dir = os.path.join(base_output_dir, sentence.replace(" ", "_"))
    os.makedirs(sentence_dir, exist_ok=True)

    # Save the audio file in the respective sentence directory
    filename = f"{sentence_dir}/{voice['name']}_pitch{pitch}_rate{rate}_{index}.wav"
    with open(filename, "wb") as out:
        out.write(response.audio_content)
    print(f"Generated: {filename}")

# Generate 50 files for each sentence with different pitch and speaking rate values
for sentence in sentences:
    for voice in voices:
        for i in range(4):
            pitch = -2.0 + (i * 0.3)
            rate = 0.8 + (i * 0.03)
            generate_audio(sentence, voice, pitch, rate, i)

print("Dataset generation complete.")

Generated: tts_output/kucing_menangis/id-ID-Standard-A_pitch-2.0_rate0.8_0.wav
Generated: tts_output/kucing_menangis/id-ID-Standard-A_pitch-1.7_rate0.8300000000000001_1.wav
Generated: tts_output/kucing_menangis/id-ID-Standard-A_pitch-1.4_rate0.8600000000000001_2.wav
Generated: tts_output/kucing_menangis/id-ID-Standard-A_pitch-1.1_rate0.89_3.wav
Generated: tts_output/kucing_menangis/id-ID-Standard-B_pitch-2.0_rate0.8_0.wav
Generated: tts_output/kucing_menangis/id-ID-Standard-B_pitch-1.7_rate0.8300000000000001_1.wav
Generated: tts_output/kucing_menangis/id-ID-Standard-B_pitch-1.4_rate0.8600000000000001_2.wav
Generated: tts_output/kucing_menangis/id-ID-Standard-B_pitch-1.1_rate0.89_3.wav
Generated: tts_output/kucing_menangis/id-ID-Standard-C_pitch-2.0_rate0.8_0.wav
Generated: tts_output/kucing_menangis/id-ID-Standard-C_pitch-1.7_rate0.8300000000000001_1.wav
Generated: tts_output/kucing_menangis/id-ID-Standard-C_pitch-1.4_rate0.8600000000000001_2.wav
Generated: tts_output/kucing_menangis/i

In [1]:
from pydub import AudioSegment
from pydub.generators import WhiteNoise
import os
import random

In [2]:
def normalize_audio(audio, target_dBFS=-20.0, target_sample_rate=8000):
    change_in_dBFS = target_dBFS - audio.dBFS
    return audio.apply_gain(change_in_dBFS).set_frame_rate(target_sample_rate)

def add_white_noise(audio, noise_level=0.005):
    noise = WhiteNoise().to_audio_segment(duration=len(audio))
    noise = noise - (noise.dBFS - audio.dBFS) + 2  # Adjust noise to desired level
    return audio.overlay(noise - noise_level)

In [3]:
# Define input and output directories
base_output_dir = "tts_output"
normalized_output_dir = "normalized"
normalized_augmented_output_dir = "normalized_augmented"
fused_output_dir = "fused"

os.makedirs(normalized_output_dir, exist_ok=True)
os.makedirs(normalized_augmented_output_dir, exist_ok=True)
os.makedirs(fused_output_dir, exist_ok=True)

for sentence_folder in os.listdir(base_output_dir):
    sentence_folder_path = os.path.join(base_output_dir, sentence_folder)
    if os.path.isdir(sentence_folder_path):
        # Create corresponding folders in the output directories
        normalized_sentence_folder_path = os.path.join(normalized_output_dir, sentence_folder)
        os.makedirs(normalized_sentence_folder_path, exist_ok=True)

        augmented_sentence_folder_path = os.path.join(normalized_augmented_output_dir, sentence_folder)
        os.makedirs(augmented_sentence_folder_path, exist_ok=True)

        fused_sentence_folder_path = os.path.join(fused_output_dir, sentence_folder)
        os.makedirs(fused_sentence_folder_path, exist_ok=True)

        # Collect all audio files in the current sentence folder
        audio_files = [f for f in os.listdir(sentence_folder_path) if f.endswith(".wav")]

        # Normalize each audio file and save to 'normalized' folder
        for filename in audio_files:
            audio_path = os.path.join(sentence_folder_path, filename)
            original_audio = AudioSegment.from_file(audio_path)

            # Normalize audio
            normalized_audio = normalize_audio(original_audio)

            # Save normalized audio to 'normalized' folder
            normalized_output_path = os.path.join(normalized_sentence_folder_path, filename)
            normalized_audio.export(normalized_output_path, format="wav")

print("Normalization complete.")

Normalization complete.


### ML Model Process for Level 2


#### Import Library

In [14]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, Flatten, Dense, Lambda, Dropout, MaxPooling1D
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import librosa
import pydub

#### Audio Data Processing Functions

In [1]:
# Function to load and preprocess audio files
def load_and_preprocess(file_path, target_length=16000):
    audio, _ = librosa.load(file_path, sr=16000, mono=True)

    # Ensure audio length is not greater than target_length
    if len(audio) > target_length:
        audio = audio[:target_length]
    else:
        # Pad audio to target_length if shorter
        pad_amount = target_length - len(audio)
        audio = np.pad(audio, (0, pad_amount), mode='constant')

    # Normalize audio
    audio = audio / np.max(np.abs(audio))

    # Reshape audio to include time steps dimension
    audio = np.expand_dims(audio, axis=-1)

    return audio

In [2]:
def convert_and_normalize(input_path, output_path, target_dBFS=-20.0, target_sample_rate=8000):
    # Load audio using pydub (support for various formats including mp3, wav etc)
    audio = pydub.AudioSegment.from_file(input_path)

    # Normalize audio to target dBFS and sample rate
    change_in_dBFS = target_dBFS - audio.dBFS
    normalized_audio = audio.apply_gain(change_in_dBFS)
    normalized_audio = normalized_audio.set_frame_rate(target_sample_rate)

    # Export normalized audio to WAV format
    normalized_audio.export(output_path, format="wav")

In [3]:
# Function to load augmented dataset
def load_data(base_dir):
    sentences = []
    file_paths = []

    for sentence in os.listdir(base_dir):
        sentence_dir = os.path.join(base_dir, sentence)
        for file in os.listdir(sentence_dir):
            if file.endswith(".wav"):
                file_paths.append(os.path.join(sentence_dir, file))
                sentences.append(sentence)

    return np.array(file_paths), np.array(sentences)

In [4]:
# Function to create pairs of audio samples with their labels
def create_pairs(files, sentences):
    pairs = []
    labels = []
    num_samples = len(files)

    for i in range(num_samples):
        for j in range(i+1, num_samples):
            if sentences[i] == sentences[j]:
                pairs.append((i, j))
                labels.append(1)
            else:
                pairs.append((i, j))
                labels.append(0)

    return np.array(pairs, dtype=np.int32), np.array(labels, dtype=np.int32)

#### Data Generator for Training

In [5]:
def data_generator(files, sentences, batch_size=32, target_length=16000):
    while True:
        indices = np.random.permutation(len(files))
        pairs, labels = create_pairs(files, sentences)
        batch_start = 0
        while batch_start < len(pairs):
            batch_end = min(batch_start + batch_size, len(pairs))
            batch_indices = indices[batch_start:batch_end]
            batch_pairs = pairs[batch_indices]
            batch_labels = labels[batch_indices]

            audio_1 = np.array([load_and_preprocess(files[i], target_length) for i in batch_pairs[:, 0]])
            audio_2 = np.array([load_and_preprocess(files[i], target_length) for i in batch_pairs[:, 1]])

            # Check if batch_pairs is empty
            if len(batch_pairs) == 0:
                break

            # Yield batches
            yield [audio_1, audio_2], batch_labels
            batch_start += batch_size


#### Define Siamese CNN Model

In [24]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Dropout, Flatten, Dense, Lambda, BatchNormalization, LeakyReLU
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

def create_siamese_model(input_shape):
    def cnn_network(input_shape):
        model = tf.keras.Sequential()

        # First Convolutional Layer
        model.add(Conv1D(64, 7, padding='same', input_shape=input_shape))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.1))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(0.3))

        # Second Convolutional Layer
        model.add(Conv1D(128, 5, padding='same'))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.1))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(0.3))

        # Third Convolutional Layer
        model.add(Conv1D(128, 5, padding='same'))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.1))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(0.3))

        # Third Convolutional Layer
        model.add(Conv1D(128, 3, padding='same'))
        model.add(BatchNormalization())
        model.add(LeakyReLU(alpha=0.1))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(0.3))

        # Flatten and Dense Layers
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.3))

        return model

    input_left = Input(shape=input_shape)
    input_right = Input(shape=input_shape)

    cnn = cnn_network(input_shape)

    encoded_left = cnn(input_left)
    encoded_right = cnn(input_right)

    # L1 distance layer between the two encoded outputs
    L1_distance = Lambda(lambda x: K.abs(x[0] - x[1]))
    L1_distance_out = L1_distance([encoded_left, encoded_right])

    # Prediction layer
    prediction = Dense(1, activation='sigmoid')(L1_distance_out)

    # Model instance
    siamese_model = Model(inputs=[input_left, input_right], outputs=prediction)

    return siamese_model

# Example usage
input_shape = (128, 1)  # Example input shape
siamese_model = create_siamese_model(input_shape)
siamese_model.summary()


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_9 (InputLayer)        [(None, 128, 1)]             0         []                            
                                                                                                  
 input_10 (InputLayer)       [(None, 128, 1)]             0         []                            
                                                                                                  
 sequential_4 (Sequential)   (None, 256)                  502912    ['input_9[0][0]',             
                                                                     'input_10[0][0]']            
                                                                                                  
 lambda_4 (Lambda)           (None, 256)                  0         ['sequential_4[0][0]',  

In [23]:
# Function to create the Siamese CNN model
def create_siamese_model(input_shape):
    def cnn_network(input_shape):
        model = tf.keras.Sequential()
        model.add(Conv1D(64, 5, activation='relu', input_shape=input_shape))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(0.2))
        model.add(Conv1D(128, 5, activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(0.2))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.2))
        return model

    input_left = Input(shape=input_shape)
    input_right = Input(shape=input_shape)

    cnn = cnn_network(input_shape)

    encoded_left = cnn(input_left)
    encoded_right = cnn(input_right)

    # L1 distance layer between the two encoded outputs
    L1_distance = Lambda(lambda x: K.abs(x[0] - x[1]))
    L1_distance_out = L1_distance([encoded_left, encoded_right])

    # Prediction layer
    prediction = Dense(1, activation='sigmoid')(L1_distance_out)

    # Model instance
    siamese_model = Model(inputs=[input_left, input_right], outputs=prediction)

    return siamese_model

#### Load and Split Data

In [25]:
# Directory containing augmented TTS output
output_dir = "/content/tts_output"

# Load augmented dataset
file_paths, sentences = load_data(output_dir)

# Split data into training and validation sets
train_files, val_files, train_sentences, val_sentences = train_test_split(file_paths, sentences, test_size=0.2, random_state=42)

#### Create Generators and Compile Model

In [26]:
# Create generators
train_gen = data_generator(train_files, train_sentences, batch_size=16)
val_gen = data_generator(val_files, val_sentences, batch_size=16)

In [27]:
# Create generators
train_gen = data_generator(train_files, train_sentences, batch_size=32)
val_gen = data_generator(val_files, val_sentences, batch_size=32)

# Define input shape
input_shape = (16000, 1)

# Create the model
siamese_model = create_siamese_model(input_shape)

# Compile the model
siamese_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])
siamese_model.summary()

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_11 (InputLayer)       [(None, 16000, 1)]           0         []                            
                                                                                                  
 input_12 (InputLayer)       [(None, 16000, 1)]           0         []                            
                                                                                                  
 sequential_5 (Sequential)   (None, 256)                  3300876   ['input_11[0][0]',            
                                                          8          'input_12[0][0]']            
                                                                                                  
 lambda_5 (Lambda)           (None, 256)                  0         ['sequential_5[0][0]',  

#### Train the Model

In [28]:
# Callbacks for training
checkpoint = ModelCheckpoint('siamese_model.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)

In [29]:
# Assuming you have variables `num_train_samples` and `num_val_samples` representing
# the total number of training and validation samples, and `batch_size` representing
# the size of each batch

steps_per_epoch = len(train_files) // 8
validation_steps = len(val_files) // 8

# Check if the calculated steps_per_epoch and validation_steps are greater than 0
if steps_per_epoch == 0 or validation_steps == 0:
    raise ValueError("Number of steps per epoch or validation steps is zero. Please check the dataset size and batch size.")

# Fit the model
history = siamese_model.fit(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_gen,
    validation_steps=validation_steps,
    epochs=20,
    callbacks=[checkpoint, early_stopping],
    verbose=1
)


Epoch 1/20
Epoch 1: val_loss improved from inf to 0.69230, saving model to siamese_model.h5
Epoch 2/20
Epoch 2: val_loss did not improve from 0.69230
Epoch 3/20
Epoch 3: val_loss improved from 0.69230 to 0.69203, saving model to siamese_model.h5
Epoch 4/20
Epoch 4: val_loss did not improve from 0.69203
Epoch 5/20
Epoch 5: val_loss improved from 0.69203 to 0.69110, saving model to siamese_model.h5
Epoch 6/20
Epoch 6: val_loss did not improve from 0.69110
Epoch 7/20
Epoch 7: val_loss did not improve from 0.69110
Epoch 8/20
Epoch 8: val_loss did not improve from 0.69110
Epoch 9/20
Epoch 9: val_loss did not improve from 0.69110
Epoch 10/20
Epoch 10: val_loss did not improve from 0.69110
Restoring model weights from the end of the best epoch: 5.
Epoch 10: early stopping


In [None]:
siamese_model.save('siamese_model.h5')

  saving_api.save_model(


#### Test and Predict

In [30]:
# Paths to the test audio files (m4a format)
#test_audio_file_1 = 'test/ular-clear-1.m4a'
#test_audio_file_2 = 'test/mobil-clear-1.m4a'

# Paths to the converted WAV files
#converted_audio_file_1 = 'converted_test_audio_1.wav'
#converted_audio_file_2 = 'converted_test_audio_2.wav'

# Convert the test audio files to WAV format with 8kHz sample rate
#convert_and_normalize(test_audio_file_1, converted_audio_file_1)
#convert_and_normalize(test_audio_file_2, converted_audio_file_2)

converted_audio_file_1 = '/content/drive/MyDrive/SUARA BANGKIT/IBU MELEMPAR BATU.wav'
converted_audio_file_2 = '/content/drive/MyDrive/SUARA BANGKIT/IBU MELEMPAR BARU 3.wav'

# Load and preprocess the test audio files
test_audio_1 = load_and_preprocess(converted_audio_file_1,target_length=16000)
test_audio_2 = load_and_preprocess(converted_audio_file_2,target_length=16000)

# Add batch dimension
test_audio_1 = np.expand_dims(test_audio_1, axis=0)
test_audio_2 = np.expand_dims(test_audio_2, axis=0)

In [31]:
# Load the trained Siamese model
#siamese_model = tf.keras.models.load_model('/content/siamese_model.h5', compile=False)

# Predict the similarity
similarity_score = siamese_model.predict([test_audio_1, test_audio_2])

# Define the min and max scores for normalization
min_score = 0.0
max_score = 0.5

# Normalize the similarity score to the range 0-100%
normalized_similarity_score = (similarity_score[0][0] - min_score) / (max_score - min_score) * 100

# Clip the value to ensure it stays within the 0-100% range
normalized_similarity_score = np.clip(normalized_similarity_score, 0, 100)

# Output the normalized similarity score
print(f'Similarity score: {normalized_similarity_score:.2f}%')


Similarity score: 99.03%


In [32]:
#Muat model Keras dari file .h5
siamese_model = tf.keras.models.load_model('siamese_model.h5')

# Buat konverter TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(siamese_model)

# Mengatur optimasi ke kuantisasi float16
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.float16]

# Konversi model
tflite_model = converter.convert()

# Simpan model TFLite ke file
with open('model_quant_float16.tflite', 'wb') as f:
    f.write(tflite_model)

print("Model berhasil dikonversi dan disimpan sebagai model_quant_float16.tflite")

Model berhasil dikonversi dan disimpan sebagai model_quant_float16.tflite
