In [7]:
!pip install gtts
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
import soundfile as sf
import os
import zipfile
import shutil
import tensorflow as tf
import librosa
import numpy as np
from google.colab import drive
from gtts import gTTS
from pydub import AudioSegment

# Mount Google Drive
drive.mount('/content/drive')

# Set the path to the VoxCeleb-1 dataset ZIP file in Google Drive
zip_file_path = '/content/drive/MyDrive/Colab Notebooks/VoxCeleb-1 dataset.zip'

# Extract the dataset ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('voxceleb-1-dataset')

# Preprocess audio files
audio_files = []
for root, _, files in os.walk('voxceleb-1-dataset'):
    for file in files:
        if file.endswith('.wav'):
            audio_file = os.path.join(root, file)
            audio, sr = librosa.load(audio_file, sr=None)
            audio = librosa.amplitude_to_db(audio)
            audio = audio / np.max(audio)
            target_length = 8000
            if len(audio) < target_length:
                padding = np.zeros(target_length - len(audio))
                audio = np.concatenate((audio, padding))
            elif len(audio) > target_length:
                audio = audio[:target_length]
            audio = audio.reshape((1, 8000))
            audio_files.append(audio)
print(f"Number of audio files: {len(audio_files)}")

# Train the model
audio_data = np.concatenate(audio_files, axis=0)  # Concatenate audio files into a single array
target_data = np.zeros_like(audio_data)  # Create a dummy target array
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(8000,)),
    tf.keras.layers.Reshape((8000, 1)),  # Add Reshape layer for compatibility with LSTM
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(1, activation='linear')
])
model.compile(loss='mse', optimizer='adam')
model.fit(audio_data, target_data, epochs=5)

# Clean up the extracted files
shutil.rmtree('voxceleb-1-dataset')

# Generate speech from text
text = 'This is a test of the voice cloning model.'
tts = gTTS(text=text, lang='en')
tts.save('/content/generated_speech.mp3')

speech, _ = librosa.load('/content/generated_speech.mp3', sr=8000)
speech = librosa.amplitude_to_db(speech)
speech = speech / np.max(speech)
if len(speech) < target_length:
    padding = np.zeros(target_length - len(speech))
    speech = np.concatenate((speech, padding))
elif len(speech) > target_length:
    speech = speech[:target_length]
speech = speech.reshape((1, 8000))
generated_audio = model.predict(speech)
generated_audio = np.power(generated_audio, 10)
generated_audio = librosa.db_to_amplitude(generated_audio)
generated_audio = np.nan_to_num(generated_audio, nan=0.0, posinf=0.0, neginf=0.0)
generated_audio = librosa.resample(generated_audio, orig_sr=8000, target_sr=16000)

# Save the generated speech audio
generated_audio = np.int16(generated_audio * (32767 / np.max(np.abs(generated_audio))))
audio_segment = AudioSegment(generated_audio.tobytes(), frame_rate=16000, sample_width=2, channels=1)
audio_segment.export('/content/generated_speech_output.mp3', format='mp3')
sf.write('/content/generated_speech.wav', generated_audio, 16000, format='WAV')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  audio = audio / np.max(audio)
  audio = audio / np.max(audio)


Number of audio files: 4857
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  generated_audio = np.int16(generated_audio * (32767 / np.max(np.abs(generated_audio))))
  generated_audio = np.int16(generated_audio * (32767 / np.max(np.abs(generated_audio))))
