In [2]:
pip install pyworld

Collecting pyworld
  Downloading pyworld-0.3.4.tar.gz (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.0/252.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pyworld
  Building wheel for pyworld (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyworld: filename=pyworld-0.3.4-cp310-cp310-linux_x86_64.whl size=862465 sha256=542ecbc2decec59381dd3a31126424e5a38faad8a7a4c3f1a13c9b113a6cb6d6
  Stored in directory: /root/.cache/pip/wheels/66/09/8a/a1d79b73d59756f66e9bfe55a199840efc7473adb76ddacdfd
Successfully built pyworld
Installing collected packages: pyworld
Successfully installed pyworld-0.3.4


In [3]:
import os
import librosa
import numpy as np
import pyworld as world
import soundfile as sf
import matplotlib.pyplot as plt
import librosa.display
import tensorflow as tf


def extract_features_for_comparison(audio_path):
    """Load an audio file and extract its waveform, a pitch track and RMS energy.

    Args:
        audio_path: Path of the audio file, either a Python string or a scalar
            string ``tf.Tensor`` (the form in which ``tf.py_function`` delivers it).

    Returns:
        Tuple ``(y, sr, f0, energy)``:
            y: Waveform returned by ``librosa.load`` at the file's native rate.
            sr: Sampling rate reported by ``librosa.load``.
            f0: 1-D array holding, for every frame in which ``librosa.piptrack``
                found a peak, the pitch (Hz) of the strongest peak.
            energy: First row of ``librosa.feature.rms(y=y)``.
    """
    if isinstance(audio_path, tf.Tensor):
        audio_path = audio_path.numpy().decode('utf-8')
    y, sr = librosa.load(audio_path, sr=None)

    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    # piptrack returns one candidate per (frequency bin, frame). Flattening all
    # positive entries would mix harmonics and other peaks into the "pitch"
    # statistics, so keep only the highest-magnitude candidate of each frame.
    strongest_bin = magnitudes.argmax(axis=0)
    f0 = pitches[strongest_bin, np.arange(pitches.shape[1])]
    f0 = f0[f0 > 0]  # frames without any detected peak carry pitch 0

    energy = librosa.feature.rms(y=y)[0]
    return y, sr, f0, energy

In [4]:
def compare_features(human_features, tts_features):
    """Compare pitch, duration and energy of a human and a TTS recording.

    Args:
        human_features: Tuple ``(y, sr, f0, energy)`` for the human recording,
            as produced by ``extract_features_for_comparison``. Entries may be
            NumPy values or ``tf.Tensor`` objects.
        tts_features: Same tuple for the TTS recording.

    Returns:
        Tuple of three ``np.float32`` scalars:
            pitch_diff: Mean voiced pitch of the human minus that of the TTS
                audio; ``0.0`` when either side has no voiced frame.
            duration_ratio: Human duration divided by TTS duration, both in
                seconds (sample count / sampling rate).
            energy_ratio: Mean human energy divided by mean TTS energy.
        Both ratios are kept strictly positive and finite so that callers can
        safely take their logarithm.
    """
    human_y, human_sr, human_f0, human_energy = human_features
    tts_y, tts_sr, tts_f0, tts_energy = tts_features

    def _to_numpy(value):
        return np.asarray(value.numpy() if isinstance(value, tf.Tensor) else value)

    def _mean_or_zero(values):
        return float(np.mean(values)) if values.size else 0.0

    # Floor applied to numerators and denominators of the ratios; it only kicks
    # in for empty or silent inputs, which would otherwise yield 0, inf or NaN.
    eps = 1e-8

    human_f0, tts_f0 = _to_numpy(human_f0), _to_numpy(tts_f0)
    human_energy, tts_energy = _to_numpy(human_energy), _to_numpy(tts_energy)

    human_voiced = human_f0[human_f0 > 0]
    tts_voiced = tts_f0[tts_f0 > 0]
    if human_voiced.size and tts_voiced.size:
        pitch_diff = float(np.mean(human_voiced)) - float(np.mean(tts_voiced))
    else:
        # np.mean of an empty selection is NaN; report "no difference" instead.
        pitch_diff = 0.0

    # Compare durations in seconds: the two files are loaded at their native
    # sampling rates, so raw sample counts are not directly comparable.
    human_duration = len(human_y) / float(_to_numpy(human_sr))
    tts_duration = len(tts_y) / float(_to_numpy(tts_sr))
    duration_ratio = max(human_duration, eps) / max(tts_duration, eps)

    energy_ratio = max(_mean_or_zero(human_energy), eps) / max(_mean_or_zero(tts_energy), eps)

    return np.float32(pitch_diff), np.float32(duration_ratio), np.float32(energy_ratio)


In [5]:

def extract_features(audio_path):
    """Load an audio file and run the WORLD analysis functions on it.

    Args:
        audio_path: Path of the audio file, either a Python string or a scalar
            string ``tf.Tensor``.

    Returns:
        Tuple ``(y, sr, f0, sp, ap)``:
            y: float64 waveform, zero-padded to at least 0.1 s.
            sr: Sampling rate reported by ``librosa.load``.
            f0: Output of ``world.stonemask`` (refined ``world.dio`` pitch), float32.
            sp: Output of ``world.cheaptrick``, float32.
            ap: Output of ``world.d4c``, float32.
    """
    if isinstance(audio_path, tf.Tensor):
        audio_path = audio_path.numpy().decode('utf-8')

    # Load audio with dtype='float64' to match PyWorld's expectation
    y, sr = librosa.load(audio_path, sr=None, dtype='float64')

    # Pad very short clips up to 0.1 s. np.pad only accepts integer pad widths,
    # so the minimum length is converted to a whole number of samples first.
    min_samples = int(np.ceil(sr * 0.1))
    if len(y) < min_samples:
        y = np.pad(y, (0, min_samples - len(y)), 'constant')

    # Use a 5 ms frame period, shortened for brief clips so that they still
    # span about 100 analysis frames.
    frame_period = min(5.0, 1000 * len(y) / sr / 100)
    _f0, t = world.dio(y, sr, frame_period=frame_period)
    f0 = world.stonemask(y, _f0, t, sr)
    sp = world.cheaptrick(y, f0, t, sr)
    ap = world.d4c(y, f0, t, sr)

    return y, sr, f0.astype(np.float32), sp.astype(np.float32), ap.astype(np.float32)

In [6]:
def extract_comparison_features(human_audio, tts_audio):
    """Build the 3-element model input describing how two recordings differ.

    Args:
        human_audio: Path (string or string tensor) of the human recording.
        tts_audio: Path (string or string tensor) of the TTS recording.

    Returns:
        float32 tensor of shape ``[3]``:
        ``[pitch_diff, log(duration_ratio), log(energy_ratio)]`` where the three
        quantities come from ``compare_features``.
    """
    feature_dtypes = [tf.float32, tf.int32, tf.float32, tf.float32]

    def process_audio(audio):
        # Returns the (y, sr, f0, energy) tensors for one file.
        return tf.py_function(extract_features_for_comparison, [audio], feature_dtypes)

    human_features = process_audio(human_audio)
    tts_features = process_audio(tts_audio)

    def compare(*flat_features):
        # tf.py_function hands every input tensor over as its own positional
        # argument, so regroup the eight values into the two 4-tuples that
        # compare_features expects.
        return compare_features(tuple(flat_features[:4]), tuple(flat_features[4:]))

    pitch_diff, duration_ratio, energy_ratio = tf.py_function(
        compare,
        [*human_features, *tts_features],
        [tf.float32, tf.float32, tf.float32]
    )

    features = tf.stack([pitch_diff, tf.math.log(duration_ratio), tf.math.log(energy_ratio)])
    # py_function outputs carry no static shape in graph mode; pin it so that
    # downstream Dense layers see a defined last dimension.
    return tf.ensure_shape(features, [3])

In [7]:
@tf.function
def apply_manipulations(tts_audio, parameters):
    """Re-synthesize a TTS file after applying predicted pitch/duration/energy changes.

    Args:
        tts_audio: Audio path forwarded to ``extract_features``.
        parameters: Indexable of three values: the pitch shift, the log of the
            duration factor and the log of the energy factor.

    Returns:
        Tuple ``(y_synthesized, sr)``: the float32 output of ``synthesize_speech``
        (a helper not defined in this part of the notebook) and the sampling
        rate returned by ``extract_features``.
    """
    _, sr, f0, sp, ap = extract_features(tts_audio)

    pitch_shift, log_duration, log_energy = parameters[0], parameters[1], parameters[2]

    # The duration and energy parameters are exponentiated before use, so the
    # factors handed to manipulate_features are always positive.
    modified = manipulate_features(
        f0, sp, ap,
        pitch_shift=pitch_shift,
        duration_factor=tf.exp(log_duration),
        energy_factor=tf.exp(log_energy)
    )
    f0_modified, sp_modified, ap_modified = modified

    synthesis_inputs = [f0_modified, sp_modified, ap_modified, sr]
    y_synthesized = tf.py_function(synthesize_speech, synthesis_inputs, tf.float32)
    return y_synthesized, sr


In [8]:

def calculate_similarity(human_audio, modified_tts_audio):
    """Score how closely a modified TTS waveform matches a human recording.

    The human file is loaded, resampled to the TTS sampling rate, both signals
    are truncated to the shorter length and compared through their MFCCs.

    Args:
        human_audio: Path (string or string tensor) of the human recording.
        modified_tts_audio: Pair ``(waveform, sample_rate)`` as returned by
            ``apply_manipulations``.

    Returns:
        Scalar float32 tensor: the negated mean squared MFCC difference. Higher
        (closer to 0) means more similar; negate it again to obtain a loss.
    """
    frame_length = 1024
    frame_step = 256
    num_mel_bins = 40
    num_mfccs = 13

    human_audio_path = human_audio.numpy().decode('utf-8') if isinstance(human_audio, tf.Tensor) else human_audio
    human_y, human_sr = librosa.load(human_audio_path, sr=None)

    modified_tts_y = tf.reshape(tf.cast(modified_tts_audio[0], tf.float32), [-1])
    modified_tts_sr = tf.cast(modified_tts_audio[1], tf.int32)

    def _resample_to_tts_rate(samples, orig_sr, target_sr):
        # Runs eagerly inside tf.py_function, so .numpy() is available even
        # when the surrounding code is being traced into a graph.
        samples = samples.numpy()
        orig_sr = int(orig_sr.numpy())
        target_sr = int(target_sr.numpy())
        if orig_sr != target_sr:
            samples = librosa.resample(y=samples, orig_sr=orig_sr, target_sr=target_sr)
        return samples.astype(np.float32)

    human_y = tf.py_function(_resample_to_tts_rate, [human_y, human_sr, modified_tts_sr], tf.float32)
    human_y = tf.reshape(human_y, [-1])

    # Truncate with tensor ops: len() is not available on symbolic tensors.
    min_len = tf.minimum(tf.shape(human_y)[0], tf.shape(modified_tts_y)[0])
    human_y = human_y[:min_len]
    modified_tts_y = modified_tts_y[:min_len]

    def _mfcc(samples):
        # tf.signal has no ready-made log-mel op; build it from the STFT
        # magnitude and a mel filter bank, as shown in the TensorFlow docs.
        stft = tf.signal.stft(samples, frame_length=frame_length, frame_step=frame_step,
                              fft_length=frame_length, pad_end=True)
        magnitude = tf.abs(stft)
        mel_weights = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=num_mel_bins,
            num_spectrogram_bins=frame_length // 2 + 1,
            sample_rate=tf.cast(modified_tts_sr, tf.float32),
            lower_edge_hertz=125.0,
            upper_edge_hertz=3800.0,
        )
        mel = tf.matmul(magnitude, mel_weights)
        log_mel = tf.math.log(mel + 1e-6)  # offset avoids log(0) on silent frames
        return tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :num_mfccs]

    human_mfcc = _mfcc(human_y)
    modified_tts_mfcc = _mfcc(modified_tts_y)

    mfcc_distance = tf.reduce_mean(tf.square(human_mfcc - modified_tts_mfcc))

    return -mfcc_distance  # Negated distance: larger values mean higher similarity

In [9]:

# Small fully connected network: takes a 3-element feature vector and emits
# 3 values (used elsewhere as pitch shift, log duration factor, log energy factor).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(64, activation='relu', input_shape=(3,)))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(3))

# Optimizer shared by every training step.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

In [10]:
def modify_pitch_preserve_contour(f0, shift):
    """Shift a pitch track by ``shift`` in the log-frequency domain.

    Adding ``shift`` to ``log(f0)`` is the same as multiplying ``f0`` by
    ``exp(shift)``, which keeps the relative shape of the contour intact.

    Args:
        f0: Pitch values; entries ``<= 0`` are treated as unvoiced.
        shift: Offset added to the log pitch (scalar or broadcastable).

    Returns:
        float32 tensor shaped like ``f0`` with voiced entries scaled by
        ``exp(shift)`` and unvoiced entries set to 0.
    """
    f0 = tf.cast(f0, dtype=tf.float32)
    shift = tf.cast(shift, dtype=tf.float32)

    # Normalising log-f0 by its mean/std and immediately undoing it cancels out
    # algebraically, but produces NaN when the std is 0 (constant pitch) or no
    # frame is voiced. Applying the multiplicative shift directly avoids that.
    shifted = f0 * tf.math.exp(shift)
    return tf.where(f0 > 0, shifted, tf.zeros_like(f0))

In [11]:
@tf.function
def modify_duration(feature, duration_factor):
    """Resample a feature along its first (time) axis by linear interpolation.

    Args:
        feature: Tensor-like whose first axis indexes frames. Any trailing axes
            (e.g. frequency bins) are kept and interpolated independently.
        duration_factor: Positive scalar; the output has
            ``int(num_frames / duration_factor)`` frames (at least 1 for
            non-empty input).

    Returns:
        float32 tensor with the new number of frames and the same trailing
        shape as ``feature``.
    """
    feature = tf.convert_to_tensor(feature, dtype=tf.float32)
    duration_factor = tf.cast(duration_factor, dtype=tf.float32)

    if feature.shape.rank == 0:
        # Treat a scalar as a single-frame sequence.
        feature = tf.reshape(feature, [1])

    original_length = tf.shape(feature)[0]
    original_length_f = tf.cast(original_length, tf.float32)

    new_length = tf.cast(original_length_f / duration_factor, tf.int32)
    # Keep at least one frame for non-empty input; empty input stays empty.
    new_length = tf.where(original_length > 0, tf.maximum(new_length, 1), 0)

    # Fractional source position of every output frame, spread evenly over
    # [0, original_length - 1]. The max() guards the single-frame case, which
    # would otherwise divide by zero.
    last_index = tf.maximum(original_length - 1, 0)
    steps = tf.maximum(tf.cast(new_length, tf.float32) - 1.0, 1.0)
    positions = tf.cast(tf.range(new_length), tf.float32) * tf.cast(last_index, tf.float32) / steps

    lower = tf.clip_by_value(tf.cast(tf.floor(positions), tf.int32), 0, last_index)
    upper = tf.minimum(lower + 1, last_index)
    weight = positions - tf.cast(lower, tf.float32)

    # Reshape the weights to [new_length, 1, ..., 1] so they broadcast over any
    # trailing feature axes.
    trailing_ones = tf.ones([tf.rank(feature) - 1], dtype=tf.int32)
    weight = tf.reshape(weight, tf.concat([tf.expand_dims(new_length, 0), trailing_ones], axis=0))

    # Pure-TensorFlow interpolation: stays float32 and needs no NumPy callback.
    return tf.gather(feature, lower) * (1.0 - weight) + tf.gather(feature, upper) * weight


In [12]:
def modify_energy(sp, energy_factor):
    """Return ``sp`` multiplied by ``energy_factor``, both cast to float32."""
    sp_f32 = tf.cast(sp, dtype=tf.float32)
    factor_f32 = tf.cast(energy_factor, dtype=tf.float32)
    scaled = sp_f32 * factor_f32
    return scaled

In [13]:
@tf.function
def manipulate_features(f0, sp, ap, pitch_shift=0.0, duration_factor=1.0, energy_factor=1.0):
    """Apply pitch, energy and duration changes to a set of analysis features.

    Args:
        f0: Pitch track; shifted by ``pitch_shift`` and then time-scaled.
        sp: Feature scaled by ``energy_factor`` and then time-scaled.
        ap: Feature that is only time-scaled.
        pitch_shift: Shift forwarded to ``modify_pitch_preserve_contour``.
        duration_factor: Factor forwarded to ``modify_duration``.
        energy_factor: Factor forwarded to ``modify_energy``.

    Returns:
        Tuple ``(f0_modified, sp_modified, ap_modified)``.
    """
    f0, sp, ap = [tf.convert_to_tensor(item, dtype=tf.float32) for item in (f0, sp, ap)]
    pitch_shift, duration_factor, energy_factor = [
        tf.cast(item, dtype=tf.float32)
        for item in (pitch_shift, duration_factor, energy_factor)
    ]

    # Value-domain edits (pitch, energy) run first, then every stream is
    # time-scaled with the same factor.
    f0_modified = modify_duration(
        modify_pitch_preserve_contour(f0, pitch_shift), duration_factor
    )
    sp_modified = modify_duration(modify_energy(sp, energy_factor), duration_factor)
    ap_modified = modify_duration(ap, duration_factor)
    return f0_modified, sp_modified, ap_modified

In [14]:
@tf.function
def train_step(human_audio, tts_audio):
    """Run one optimisation step on a (human, TTS) pair of audio files.

    Args:
        human_audio: Path of the human recording.
        tts_audio: Path of the matching TTS recording.

    Returns:
        The loss for this pair: the negated value of ``calculate_similarity``,
        i.e. a quantity that shrinks as the two signals become more alike.
    """
    with tf.GradientTape() as tape:
        features = extract_comparison_features(human_audio, tts_audio)
        parameters = model(features[tf.newaxis, ...])
        modified_tts = apply_manipulations(tts_audio, parameters[0])
        similarity = calculate_similarity(human_audio, modified_tts)
        # calculate_similarity returns a score where larger means more similar
        # (a negated distance). The optimizer minimises, so flip the sign;
        # minimising the raw score would push the signals apart.
        loss = -similarity

    # NOTE(review): the path from `parameters` to `loss` goes through Python
    # callbacks (tf.py_function in apply_manipulations). tape.gradient returns
    # None for variables without a differentiable path - confirm that gradients
    # actually reach the model.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

def train_model(human_folder, tts_folder, num_epochs=10):
    """Train on every ``.wav`` file that exists under the same name in both folders.

    Args:
        human_folder: Directory with the human recordings.
        tts_folder: Directory with the TTS recordings.
        num_epochs: Number of passes over the matched pairs.

    Prints the average loss after each epoch. If no matching pair is found a
    message is printed and nothing is trained.
    """
    audio_pairs = []
    # Sorted so the training order does not depend on the file system's
    # arbitrary directory listing order.
    for filename in sorted(os.listdir(human_folder)):
        if not filename.endswith(".wav"):
            continue
        human_file = os.path.join(human_folder, filename)
        tts_file = os.path.join(tts_folder, filename)
        if os.path.exists(tts_file):
            audio_pairs.append((human_file, tts_file))

    if not audio_pairs:
        # Without this guard the average below would divide by zero.
        print(f"No matching .wav pairs found in {human_folder} and {tts_folder}; skipping training.")
        return

    for epoch in range(num_epochs):
        total_loss = 0.0
        for human_audio, tts_audio in audio_pairs:
            # float() turns the scalar tensor into a plain number so the
            # printed average is readable.
            total_loss += float(train_step(human_audio, tts_audio))
        print(f"Epoch {epoch + 1}, Average Loss: {total_loss / len(audio_pairs)}")

In [15]:
def process_file_with_ml(human_input_file, tts_input_file, output_prefix):
    """Predict manipulation parameters for one file pair and write the result.

    The model output for the (human, TTS) comparison features is applied to the
    TTS file's analysis features; the re-synthesized audio is written to
    ``{output_prefix}_synthesized.wav`` and comparison plots are produced via
    ``plot_and_save_features`` (a helper not defined in this part of the notebook).

    Args:
        human_input_file: Path of the human recording.
        tts_input_file: Path of the TTS recording to modify.
        output_prefix: Path prefix used for every output file.
    """
    human_path = tf.constant(human_input_file)
    tts_path = tf.constant(tts_input_file)
    comparison = extract_comparison_features(human_path, tts_path)
    parameters = model(comparison[tf.newaxis, ...])

    y, sr, f0, sp, ap = extract_features(tts_input_file)

    # Columns 1 and 2 of the model output are log-factors, hence the exp().
    modified = manipulate_features(
        f0, sp, ap,
        pitch_shift=parameters[0, 0],
        duration_factor=tf.exp(parameters[0, 1]),
        energy_factor=tf.exp(parameters[0, 2])
    )
    f0_modified, sp_modified, ap_modified = modified

    y_synthesized = synthesize_speech(
        f0_modified.numpy(), sp_modified.numpy(), ap_modified.numpy(), sr
    )
    sf.write(f'{output_prefix}_synthesized.wav', y_synthesized, sr)

    plot_and_save_features(f0, f0_modified.numpy(), sp, sp_modified.numpy(), output_prefix)

    for message in (
        f"Processed file saved as {output_prefix}_synthesized.wav",
        f"Feature comparisons saved as {output_prefix}_f0_comparison.png and {output_prefix}_spectral_envelope_comparison.png",
        f"Original duration: {len(y)/sr:.2f}s, Modified duration: {len(y_synthesized)/sr:.2f}s",
    ):
        print(message)

In [16]:
def process_all_files_with_ml(human_folder, tts_folder, output_folder):
    """Process every ``.wav`` in ``human_folder`` that has a same-named TTS file.

    Args:
        human_folder: Directory with the human recordings.
        tts_folder: Directory searched for a TTS file of the same name.
        output_folder: Directory for the results; created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    wav_names = [name for name in os.listdir(human_folder) if name.endswith(".wav")]
    for filename in wav_names:
        human_file = os.path.join(human_folder, filename)
        tts_file = os.path.join(tts_folder, filename)

        if not os.path.exists(tts_file):
            print(f"TTS file not found for {filename}")
            continue

        # Outputs are named after the input file without its extension.
        stem, _ = os.path.splitext(filename)
        output_prefix = os.path.join(output_folder, stem)
        print(f"Processing: {filename}")
        process_file_with_ml(human_file, tts_file, output_prefix)

In [None]:
def main():
    """Train on and then process the Italian files, followed by the German files.

    Both output folders are created up front. The same global model is used for
    both languages, so the German training continues from the Italian weights.
    """
    # (display name, human recordings, TTS recordings, output folder)
    datasets = [
        (
            "Italian",
            '/content/drive/MyDrive/data (1)/data/wav/ITA/train',
            '/content/drive/MyDrive/Audio_Files/ITA_Train_TTS_Audios',
            '/content/drive/MyDrive/Processed_Audios/ITA',
        ),
        (
            "German",
            '/content/drive/MyDrive/data (1)/data/wav/GER/train',
            '/content/drive/MyDrive/Audio_Files/GER_Train_TTS_Audios',
            '/content/drive/MyDrive/Processed_Audios/GER',
        ),
    ]

    for _, _, _, output_folder in datasets:
        os.makedirs(output_folder, exist_ok=True)

    for language, human_folder, tts_folder, output_folder in datasets:
        print(f"Training model on {language} files...")
        train_model(human_folder, tts_folder)

        print(f"Processing {language} files...")
        process_all_files_with_ml(human_folder, tts_folder, output_folder)

# Run the full train-and-process pipeline only when this code is executed
# directly (not when it is imported as a module).
if __name__ == "__main__":
    main()
