# **1. Clean up the input audio file**

In [2]:
!pip install ipywidgets
!pip install noisereduce
!pip install librosa
!pip install soundfile

from google.colab import files
import librosa
import soundfile as sf
import noisereduce as nr
import numpy as np
import os
import ipywidgets as widgets
from IPython.display import display

def create_next_folder(base_dir="/content"):
    """Create the lowest-numbered folder that does not exist yet under ``base_dir``.

    Candidate names are ``1``, ``2``, ``3``, ... The first one that can be
    created is claimed. Creation itself is used as the existence test, so
    there is no gap between "check" and "create" in which the same number
    could be picked twice.

    Args:
        base_dir (str): Directory in which the numbered folder is created.
            Defaults to "/content", the location the rest of the notebook uses.

    Returns:
        tuple[int, str]: The folder number and the path of the new folder.
    """
    folder_num = 1
    while True:
        folder_path = f"{base_dir}/{folder_num}"
        try:
            os.makedirs(folder_path)
        except FileExistsError:
            # Something (folder or file) already occupies this number; try the next one
            folder_num += 1
            continue
        return folder_num, folder_path

def clean_audio(input_path, output_path, apply_noise_reduction=True):
    """Load an audio file and write it to ``output_path``, optionally denoised.

    The input is always decoded with ``librosa.load(..., sr=None)`` and
    re-written with ``soundfile``; even the "copy" branch goes through this
    decode/write round trip rather than duplicating the file byte-for-byte.

    Args:
        input_path (str): Path of the audio file to read.
        output_path (str): Path the resulting audio is written to.
        apply_noise_reduction (bool): When truthy, run ``nr.reduce_noise``
            (stationary mode, ``prop_decrease=1.0``) before writing.

    Returns:
        str: A short status message describing which branch ran.
    """
    samples, rate = librosa.load(input_path, sr=None)

    # Pass-through branch: write the decoded samples unchanged
    if not apply_noise_reduction:
        sf.write(output_path, samples, rate)
        return "File copied without noise reduction."

    # Denoise branch
    denoised = nr.reduce_noise(y=samples, sr=rate, stationary=True, prop_decrease=1.0)
    sf.write(output_path, denoised, rate)
    return "Noise reduction completed!"

# Create a new numbered folder
folder_num, folder_path = create_next_folder()

# Upload file
print(f"Uploading file to folder {folder_num}...")
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was cancelled). Remove the folder
    # created above so it is not left behind empty, then stop this cell with
    # a clear message instead of an IndexError on the lookup below.
    os.rmdir(folder_path)
    raise RuntimeError("No file was uploaded. Re-run this cell and choose an audio file.")
input_filename = next(iter(uploaded))  # Name of the (first) uploaded file

# Get file extension
file_extension = os.path.splitext(input_filename)[1]

# Define file paths
original_file_path = f"/content/{input_filename}"
renamed_file_path = f"{folder_path}/{folder_num}{file_extension}"
cleaned_file_path = f"{folder_path}/{folder_num}_cleaned{file_extension}"

# Move and rename the uploaded file
os.rename(original_file_path, renamed_file_path)
print(f"File saved as {renamed_file_path}")

# Checkbox choosing whether the processing step applies noise reduction
# (unchecked by default); it is rendered immediately.
noise_reduction_checkbox = widgets.Checkbox(
    description='Apply Noise Reduction',
    value=False,
    disabled=False,
)
display(noise_reduction_checkbox)

# Button that starts processing of the uploaded file
process_button = widgets.Button(
    description='Process Audio',
    tooltip='Click to process the audio file',
    icon='check',
    button_style='',  # one of 'success', 'info', 'warning', 'danger' or ''
    disabled=False,
)

def on_process_button_clicked(b):
    """Process the uploaded audio according to the checkbox, then offer a download.

    Reads the module-level ``noise_reduction_checkbox``, ``renamed_file_path``
    and ``cleaned_file_path``; prints the status returned by ``clean_audio``.

    Args:
        b: Argument supplied by the button's click callback; not used.
    """
    denoise = noise_reduction_checkbox.value
    status = clean_audio(renamed_file_path, cleaned_file_path, denoise)

    # Only the wording of the saved-file notice depends on the chosen mode
    label = "Cleaned file" if denoise else "File"
    print(f"{status} {label} saved as {cleaned_file_path}")

    # Hand the processed file to the browser
    files.download(cleaned_file_path)

# Register the click handler first, then render the button (the checkbox was
# already displayed when it was created).
process_button.on_click(on_process_button_clicked)
display(process_button)

Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Downloading noisereduce-3.0.3-py3-none-any.whl (22 kB)
Installing collected packages: noisereduce
Successfully installed noisereduce-3.0.3
Uploading file to folder 1...


Saving 1.wav to 1.wav
File saved as /content/1/1.wav


Checkbox(value=False, description='Apply Noise Reduction')

Button(description='Process Audio', icon='check', style=ButtonStyle(), tooltip='Click to process the audio fil…

File copied without noise reduction. File saved as /content/1/1_cleaned.wav


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download option for the cleaned file.
# Relies on `files` and `cleaned_file_path` from the cleanup cell; the file at
# that path is only written once the "Process Audio" button has been clicked.
files.download(cleaned_file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **2. Use the Whisper model for speech-to-text (STT)**

In [3]:
!pip install openai-whisper
print("done")

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m460.8/800.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127

In [4]:
import os
import torch
import gc
import warnings
from google.colab import files

# Suppress warnings to reduce noise
# NOTE: no category is given, so this is not limited to a specific kind of warning.
warnings.filterwarnings('ignore')

# Force CPU usage to avoid CUDA issues
# An empty CUDA_VISIBLE_DEVICES is meant to hide every GPU from this process.
# NOTE(review): torch is already imported above — presumably this still takes
# effect because CUDA has not been initialised yet; confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# `device` is only printed in this cell; it is not passed to whisper.load_model().
device = "cpu"
print("Forcing CPU usage to avoid CUDA-related crashes")

# First check if we need to install whisper
try:
    import whisper
except ImportError:
    print("Installing whisper...")
    !pip install -q git+https://github.com/openai/whisper.git
    import whisper

# Check for ipywidgets
try:
    import ipywidgets as widgets
    from IPython.display import display
except ImportError:
    print("Installing ipywidgets...")
    !pip install -q ipywidgets
    import ipywidgets as widgets
    from IPython.display import display

# Force garbage collection
gc.collect()

print(f"Using device: {device}")

def get_folder_list(base_dir="/content"):
    """Get the numbered folders inside ``base_dir``, sorted numerically.

    Args:
        base_dir (str): Directory to scan. Defaults to "/content".

    Returns:
        list[str]: Names of sub-directories consisting only of ASCII digits,
        ordered by their integer value (so "10" comes after "2").
    """
    folders = [
        name
        for name in os.listdir(base_dir)
        # isascii() is required in addition to isdigit(): characters such as
        # "²" pass str.isdigit() but make int() raise ValueError in the sort.
        if name.isascii() and name.isdigit() and os.path.isdir(os.path.join(base_dir, name))
    ]
    return sorted(folders, key=int)

def transcribe_folder(folder_number):
    """Transcribe the cleaned audio of one numbered folder with Whisper.

    Looks in ``/content/<folder_number>`` for the first entry whose name starts
    with ``<folder_number>_cleaned``, transcribes it with a freshly loaded
    "base" model, prints the text and saves it to
    ``<folder_number>_transcription.txt`` in the same folder.

    Args:
        folder_number (str): Name of the numbered folder to process.

    Returns:
        str: A status message (success, missing audio file, or the error text
        of any exception raised while loading, transcribing or saving).
    """
    folder_path = f"/content/{folder_number}"

    # First directory entry carrying the "<n>_cleaned" prefix, or None
    wanted_prefix = f"{folder_number}_cleaned"
    cleaned_file = next(
        (entry for entry in os.listdir(folder_path) if entry.startswith(wanted_prefix)),
        None,
    )
    if not cleaned_file:
        return f"No cleaned audio file found in folder {folder_number}"

    audio_path = f"{folder_path}/{cleaned_file}"
    output_path = f"{folder_path}/{folder_number}_transcription.txt"

    try:
        # Free whatever can be freed before the model is brought in
        gc.collect()

        # The model is loaded per call rather than shared between calls
        print("Loading Whisper base model (this may take a moment)...")
        model = whisper.load_model("base")

        print(f"Transcribing audio file: {audio_path}")

        # Options forwarded to model.transcribe()
        transcribe_options = dict(
            fp16=False,
            language='en',
            verbose=False,
            temperature=0,
            beam_size=1,
        )
        result = model.transcribe(audio_path, **transcribe_options)

        transcribed_text = result["text"]

        print("\nTranscribed Text:")
        print(transcribed_text)

        # Persist the text next to the audio
        with open(output_path, "w", encoding="utf-8") as out_file:
            out_file.write(transcribed_text)
        print(f"\nTranscription saved to {output_path}")

        # Drop the model reference and collect before returning
        del model
        gc.collect()

        return f"Transcription completed for folder {folder_number}"

    except Exception as exc:
        # Any failure is reported as a message instead of being raised
        gc.collect()
        return "An error occurred: " + str(exc)

# Simple function to run transcription
def run_transcription(folder_number=None):
    """Transcribe one numbered folder, prompting for it when none is given.

    Args:
        folder_number (str | None): Folder to transcribe. When None, the
            available folders are listed and the user is asked via ``input()``.

    Returns:
        None. Progress and the result message are printed.
    """
    available_folders = get_folder_list()
    if not available_folders:
        print("No numbered folders found. Please run the noise reduction code first.")
        return

    if folder_number is None:
        # Show the choices, one per line, then ask
        print("Available folders:", *(f"- {name}" for name in available_folders), sep="\n")
        folder_number = input("Enter folder number to transcribe: ")

    if folder_number not in available_folders:
        print(f"Folder {folder_number} not found.")
        return

    print(f"Transcribing audio from folder {folder_number}...")
    outcome = transcribe_folder(folder_number)
    print(outcome)

# Discover the numbered folders created by the cleanup step
available_folders = get_folder_list()

if available_folders:
    # Text-based menu: list the folders, then ask which one to process
    print("Available folders for transcription:")
    for folder in available_folders:
        print(f"- {folder}")

    folder_number = input("Enter folder number to transcribe (or 'all' for all folders): ")

    if folder_number.lower() == 'all':
        # Process every folder in numeric order, collecting garbage in between
        for folder in available_folders:
            print(f"\nProcessing folder {folder}...")
            result = transcribe_folder(folder)
            print(result)
            gc.collect()
    elif folder_number in available_folders:
        run_transcription(folder_number)
    else:
        print(f"Folder {folder_number} not found.")
else:
    print("No numbered folders found. Please run the noise reduction code first.")

# Final cleanup (runs regardless of the branch taken above)
gc.collect()
print("Transcription process completed.")


Forcing CPU usage to avoid CUDA-related crashes
Using device: cpu
Available folders for transcription:
- 1
Enter folder number to transcribe (or 'all' for all folders): 1
Transcribing audio from folder 1...
Loading Whisper base model (this may take a moment)...


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 149MiB/s]


Transcribing audio file: /content/1/1_cleaned.wav


100%|██████████| 566/566 [00:03<00:00, 172.68frames/s]



Transcribed Text:
 Please call Stella, ask her to bring these things with her from the store. Six spoons of fresh snow peas.

Transcription saved to /content/1/1_transcription.txt
Transcription completed for folder 1
Transcription process completed.


In [None]:
!pip uninstall -y whisper
!pip uninstall -y torch torchvision torchaudio

[0mFound existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124


In [None]:
# Offer download option
# `output_path` exists only as a local variable inside transcribe_folder(), so
# the transcription path is rebuilt here from the folder choice made in the
# transcription cell (`folder_number`, which may also be 'all').
if folder_number.lower() == 'all':
    folders_to_download = get_folder_list()
else:
    folders_to_download = [folder_number]

for folder in folders_to_download:
    transcription_path = f"/content/{folder}/{folder}_transcription.txt"
    if os.path.exists(transcription_path):
        files.download(transcription_path)
    else:
        print(f"No transcription file found in folder {folder}")

# **3. Voice cloning and TTS**

In [5]:
!pip install f5-tts
print("done")

Collecting f5-tts
  Downloading f5_tts-1.0.8-py3-none-any.whl.metadata (10 kB)
Collecting bitsandbytes>0.37.0 (from f5-tts)
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting cached_path (from f5-tts)
  Downloading cached_path-1.7.1-py3-none-any.whl.metadata (19 kB)
Collecting datasets (from f5-tts)
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting ema_pytorch>=0.5.2 (from f5-tts)
  Downloading ema_pytorch-0.7.7-py3-none-any.whl.metadata (689 bytes)
Collecting gradio>=3.45.2 (from f5-tts)
  Downloading gradio-5.23.0-py3-none-any.whl.metadata (16 kB)
Collecting hydra-core>=1.3.0 (from f5-tts)
  Downloading hydra_core-1.3.2-py3-none-any.whl.metadata (5.5 kB)
Collecting numpy<=1.26.4 (from f5-tts)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[

done


In [1]:
import os
import torch
import subprocess
from IPython.display import Audio, display

# Choose the compute environment: a 4-thread CPU setup when CUDA is not
# available, otherwise the first GPU (index 0).
if not torch.cuda.is_available():
    print("No GPU available, using CPU")
    # CPU threading configuration
    os.environ["OMP_NUM_THREADS"] = "4"
    torch.set_num_threads(4)
else:
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use the first GPU
    torch.cuda.set_device(0)

def clone_voice(folder_number, gen_text=None):
    """
    Clone voice using audio and text from the specified folder

    The reference audio is the first entry in ``/content/<folder_number>``
    whose name starts with ``<folder_number>_cleaned``; the reference text is
    read from ``<folder_number>_transcription.txt`` in the same folder. The
    ``f5-tts_infer-cli`` command is then run as a subprocess and, on success,
    the generated audio is displayed.

    Args:
        folder_number (str): The folder number containing the audio and transcription
        gen_text (str, optional): Text to synthesize with the cloned voice.
            When omitted (None), the user is prompted via ``input()`` after
            the reference files have been validated.

    Returns:
        str: Status message
    """
    folder_path = f"/content/{folder_number}"

    # Find the cleaned audio file in the folder (first matching entry)
    cleaned_prefix = f"{folder_number}_cleaned"
    cleaned_file = next(
        (entry for entry in os.listdir(folder_path) if entry.startswith(cleaned_prefix)),
        None,
    )
    if not cleaned_file:
        return f"No cleaned audio file found in folder {folder_number}"

    ref_audio_path = f"{folder_path}/{cleaned_file}"

    # Find the transcription file
    transcription_file = f"{folder_path}/{folder_number}_transcription.txt"
    if not os.path.exists(transcription_file):
        return f"No transcription file found in folder {folder_number}"

    # Read the reference text from the transcription file
    with open(transcription_file, "r", encoding="utf-8") as f:
        ref_text = f.read().strip()

    if not ref_text:
        return f"Transcription file is empty in folder {folder_number}"

    # Ask for the text only when the caller did not supply it
    if gen_text is None:
        gen_text = input("Enter the text you want to generate with the cloned voice: ")

    # Do not launch the CLI with nothing to say
    if not gen_text.strip():
        return "No text was provided to generate with the cloned voice"

    # Set output path
    output_path = f"{folder_path}/{folder_number}_cloned_voice.wav"

    # Run on the GPU when torch reports CUDA, otherwise on the CPU
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    cmd = [
        "f5-tts_infer-cli",
        "--model", "F5TTS_v1_Base",
        "--ref_audio", ref_audio_path,
        "--ref_text", ref_text,
        "--gen_text", gen_text,
        "--output_file", output_path,
        "--device", device_name,
    ]

    try:
        print("Running voice cloning process...")
        # Arguments are passed as a list (no shell), so the free-form texts
        # need no quoting.
        process = subprocess.run(cmd, capture_output=True, text=True)

        if process.returncode != 0:
            print("Error:", process.stderr)
            return f"Voice cloning failed for folder {folder_number}"

        print(f"Generated audio saved to {output_path}")
        display(Audio(output_path))
        return f"Voice cloning completed for folder {folder_number}"
    except Exception as e:
        # e.g. the CLI executable could not be started
        return f"An error occurred during voice cloning: {str(e)}"

def get_folder_list(base_dir="/content"):
    """Get the numbered folders inside ``base_dir``, sorted numerically.

    Args:
        base_dir (str): Directory to scan. Defaults to "/content".

    Returns:
        list[str]: Names of sub-directories consisting only of ASCII digits,
        ordered by their integer value (so "10" comes after "2").
    """
    folders = [
        name
        for name in os.listdir(base_dir)
        # isascii() is required in addition to isdigit(): characters such as
        # "²" pass str.isdigit() but make int() raise ValueError in the sort.
        if name.isascii() and name.isdigit() and os.path.isdir(os.path.join(base_dir, name))
    ]
    return sorted(folders, key=int)

# Main execution
def run_voice_cloning():
    """Text-based entry point: pick a numbered folder and clone its voice.

    Lists the available numbered folders, asks for one via ``input()`` and
    prints the status message returned by ``clone_voice``.

    Returns:
        None.
    """
    available_folders = get_folder_list()
    if not available_folders:
        print("No numbered folders found. Please run the noise reduction code first.")
        return

    # Show the choices, one per line
    print("Available folders:", *(f"- {name}" for name in available_folders), sep="\n")

    folder_number = input("Enter folder number for voice cloning: ")

    if folder_number not in available_folders:
        print(f"Folder {folder_number} not found.")
        return

    print(f"Processing voice cloning for folder {folder_number}...")
    outcome = clone_voice(folder_number)
    print(outcome)

# Widget-based UI when ipywidgets can be imported; otherwise fall back to the
# prompt-driven run_voice_cloning().
try:
    import ipywidgets as widgets
    from IPython.display import display

    # Numbered folders that can serve as a voice reference
    available_folders = get_folder_list()

    if available_folders:
        # --- Controls ---------------------------------------------------
        folder_dropdown = widgets.Dropdown(
            description='Select folder:',
            options=available_folders,
            style={'description_width': 'initial'},
        )

        gen_text_input = widgets.Textarea(
            description='Generation text:',
            placeholder='Enter text to generate with cloned voice',
            value='I am excited to explore new opportunities in the field of machine learning and natural language processing.',
            layout={'width': '90%', 'height': '100px'},
            style={'description_width': 'initial'},
        )

        clone_button = widgets.Button(
            description='Clone Voice',
            button_style='primary',
        )

        # Everything printed or displayed by the click handler lands here
        output = widgets.Output()

        def on_button_click(b):
            """Run F5-TTS for the selected folder and show the result in ``output``.

            Args:
                b: Argument supplied by the button's click callback; not used.
            """
            with output:
                output.clear_output()
                selected_folder = folder_dropdown.value
                folder_path = f"/content/{selected_folder}"

                # Reference audio: first entry with the "<n>_cleaned" prefix
                cleaned_prefix = f"{selected_folder}_cleaned"
                cleaned_file = next(
                    (entry for entry in os.listdir(folder_path) if entry.startswith(cleaned_prefix)),
                    None,
                )
                if not cleaned_file:
                    print(f"No cleaned audio file found in folder {selected_folder}")
                    return

                ref_audio_path = f"{folder_path}/{cleaned_file}"

                # Reference text: the transcription saved for this folder
                transcription_file = f"{folder_path}/{selected_folder}_transcription.txt"
                if not os.path.exists(transcription_file):
                    print(f"No transcription file found in folder {selected_folder}")
                    return

                with open(transcription_file, "r", encoding="utf-8") as transcript:
                    ref_text = transcript.read().strip()

                if not ref_text:
                    print(f"Transcription file is empty in folder {selected_folder}")
                    return

                # Text to synthesize comes straight from the text area
                gen_text = gen_text_input.value

                output_path = f"{folder_path}/{selected_folder}_cloned_voice.wav"

                # GPU when torch reports CUDA, CPU otherwise
                device_name = "cuda" if torch.cuda.is_available() else "cpu"
                cmd = [
                    "f5-tts_infer-cli",
                    "--model", "F5TTS_v1_Base",
                    "--ref_audio", ref_audio_path,
                    "--ref_text", ref_text,
                    "--gen_text", gen_text,
                    "--output_file", output_path,
                    "--device", device_name,
                ]

                print("Running voice cloning process...")
                process = subprocess.run(cmd, capture_output=True, text=True)

                if process.returncode != 0:
                    print("Error:", process.stderr)
                else:
                    print(f"Generated audio saved to {output_path}")
                    display(Audio(output_path))

        clone_button.on_click(on_button_click)

        # Stack the controls and the output area vertically
        display(widgets.VBox([folder_dropdown, gen_text_input, clone_button, output]))
    else:
        print("No numbered folders found. Please run the noise reduction code first.")

except ImportError:
    # Fallback to non-widget version
    run_voice_cloning()


GPU available: Tesla T4


VBox(children=(Dropdown(description='Select folder:', options=('1',), style=DescriptionStyle(description_width…