In [1]:
# Install the notebook's dependencies. `%pip` (rather than `!pip`) installs into
# the environment of the running kernel instead of whatever `pip` is on PATH.
%pip install torch torchaudio transformers librosa numpy soundfile gradio TTS noisereduce
# Re-pin numpy / pandas / networkx after the unpinned install above.
# NOTE(review): presumably these pins are what the TTS package needs -- confirm.
# If numpy was already imported in this session, restart the runtime afterwards
# so the force-reinstalled version is the one that gets loaded.
%pip install numpy==1.26.4 pandas==1.5.3 networkx==2.8.8 --force-reinstall

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pandas==1.5.3
  Using cached pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting networkx==2.8.8
  Using cached networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Collecting python-dateutil>=2.8.1 (from pandas==1.5.3)
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas==1.5.3)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.1->pandas==1.5.3)
  Using cached six-1.17.0-py2.py3-none-any.whl.metadata (1.7 kB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Using cached pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.0 MB)
Using cached networkx-2.8.8-py3-none-any.whl (2.0 MB)
Using cached python_dateutil-

In [2]:
# Retry install of the two audio packages; both are already listed in the first
# install cell, so this cell is a no-op when that one succeeded.
# `%pip` targets the running kernel's environment.
%pip install TTS noisereduce



In [3]:
# Same package set as the first install cell (pip treats `tts` and `TTS` as the
# same project name); kept so the cell can be re-run after a runtime restart.
# `%pip` targets the running kernel's environment.
%pip install torch torchaudio transformers librosa numpy soundfile gradio tts noisereduce



In [4]:
import torch
import torchaudio
import librosa
import numpy as np
import soundfile as sf
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from TTS.api import TTS  # Correct import for Coqui TTS
from noisereduce import reduce_noise

# Select the compute device: CUDA when torch reports it as available, else CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Speech recognition: Whisper processor (feature extraction / decoding) + model
asr_model_name = "openai/whisper-base"
print(f"Loading Whisper model: {asr_model_name}")
asr_processor = WhisperProcessor.from_pretrained(asr_model_name)
asr_model = WhisperForConditionalGeneration.from_pretrained(asr_model_name)
asr_model = asr_model.to(device)
print(f"Model loaded on {device}.")

# Text-based emotion classifier; the pipeline takes a device index (0 = first GPU, -1 = CPU)
emotion_model_name = "bhadresh-savani/distilbert-base-uncased-emotion"
emotion_recognition = pipeline(
    "text-classification",
    model=emotion_model_name,
    device=0 if device == "cuda" else -1,
)

# Speech synthesis: Coqui TTS with the your_tts model, used for voice cloning
tts_model = TTS(
    model_name="tts_models/multilingual/multi-dataset/your_tts",
    progress_bar=True,
    gpu=(device == "cuda"),
)
print(f"TTS model loaded: {tts_model.model_name}")

# Emotion-to-audio parameter mapping (refined values for testing).
# Each row is (tone, pitch_shift, speed, amplitude_factor); the table is expanded
# into the per-tone parameter dicts that voice_cloning() looks up by lowercase tone name.
emotion_params = {
    tone: {"pitch_shift": pitch, "speed": tempo, "amplitude_factor": gain}
    for tone, pitch, tempo, gain in (
        ("joy", 1.5, 1.1, 1.3),       # moderate rise, slight speedup, moderately louder
        ("sadness", -1.5, 0.9, 0.8),  # moderate drop, slight slowdown, softer
        ("anger", 0.8, 1.15, 1.6),    # slight rise, moderate speedup, louder
        ("fear", -0.8, 1.2, 1.1),     # slight drop, faster, moderately louder
        ("neutral", 0.0, 1.0, 1.0),   # no change
    )
}

def voice_cloning(audio_file: str, text_input: str, desired_tone: str):
    """Synthesize ``text_input`` in the voice of ``audio_file`` with a chosen tone.

    Steps:
      1. Load the reference audio, convert it to 16 kHz mono and transcribe it
         with the Whisper model.
      2. Classify the emotion of that transcription with the text classifier.
      3. Synthesize ``text_input`` with the TTS model, using ``audio_file`` as
         the speaker reference.
      4. Apply the pitch / speed / amplitude adjustments registered for
         ``desired_tone`` in ``emotion_params`` (unknown or missing tones fall
         back to "neutral"), then run noise reduction.
      5. Write the result to ``cloned_voice.wav``.

    Args:
        audio_file: Path to the reference audio sample.
        text_input: Text to synthesize.
        desired_tone: Tone name; matched case-insensitively against the keys
            of ``emotion_params``.

    Returns:
        A 5-tuple ``(transcription, detected_emotion, text_input, desired_tone,
        output_path)``. On any failure the tuple is
        ``("Error: <message>", "N/A", text_input, desired_tone, None)``;
        exceptions are reported, not raised.
    """
    try:
        print(f"Received file: {audio_file}")
        print(f"Text input: {text_input}")
        print(f"Desired tone: {desired_tone}")

        # Validate inputs; whitespace-only text counts as missing.
        if not audio_file or not text_input or not text_input.strip():
            raise ValueError("Both audio file and text input are required.")

        # Load and preprocess the reference audio for ASR (16 kHz mono float32).
        waveform, asr_sr = torchaudio.load(audio_file)
        if asr_sr != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=asr_sr, new_freq=16000)
            waveform = resampler(waveform)
            asr_sr = 16000
        if waveform.shape[0] > 1:  # Convert stereo to mono
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        # Drop only the channel axis so a very short clip stays one-dimensional.
        audio = waveform.squeeze(0).numpy().astype(np.float32)

        # Step 1: Transcribe the input audio (for emotion detection)
        inputs = asr_processor(audio, sampling_rate=asr_sr, return_tensors="pt", return_attention_mask=True)
        input_features = inputs.input_features.to(device)
        attention_mask = inputs.attention_mask.to(device) if inputs.get("attention_mask") is not None else None
        print(f"Input features shape: {input_features.shape}")
        print(f"Attention mask shape: {attention_mask.shape if attention_mask is not None else 'None'}")
        with torch.no_grad():
            print("Attempting transcription...")
            predicted_ids = asr_model.generate(input_features, attention_mask=attention_mask, task="transcribe")
            print("Transcription generation completed.")
        transcription = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        print(f"Transcription: {transcription}")

        # Step 2: Detect the tone (emotion) of the input audio based on transcription
        input_emotion = emotion_recognition(transcription)[0]['label']
        print(f"Detected input tone: {input_emotion}")

        # Step 3: Synthesize the user-provided text using voice cloning.
        # A missing tone (e.g. a cleared dropdown) falls back to "neutral",
        # the same as an unknown tone name.
        params = emotion_params.get((desired_tone or "neutral").lower(), emotion_params["neutral"])
        # `speaker_embedding` is not a parameter of TTS.tts(); it was silently
        # ignored at best and a TypeError on releases without **kwargs, so it
        # is not passed. The reference voice is supplied through `speaker_wav`.
        tts_audio = tts_model.tts(
            text=text_input,
            speaker_wav=audio_file,  # Clone the voice from the input audio
            language="en"  # Required for multilingual model
        )
        tts_audio = np.asarray(tts_audio, dtype=np.float32).squeeze()

        # The synthesized audio has its own sample rate, independent of the
        # 16 kHz used for ASR. Use the synthesizer's rate when it exposes one;
        # otherwise keep the previous assumption that both rates match.
        synthesizer = getattr(tts_model, "synthesizer", None)
        tts_sr = getattr(synthesizer, "output_sample_rate", None) or asr_sr

        # Step 4: Apply emotional adjustments
        print(f"Applying emotional adjustments: pitch={params['pitch_shift']}, speed={params['speed']}, amplitude={params['amplitude_factor']}")

        # Pitch shift (in semitones)
        if params["pitch_shift"] != 0.0:
            tts_audio = librosa.effects.pitch_shift(tts_audio, sr=tts_sr, n_steps=params["pitch_shift"])

        # Speed adjustment. librosa's `rate` > 1 speeds the signal up and
        # `rate` < 1 slows it down, so `speed` is passed straight through
        # (passing 1 / speed inverted the intended effect).
        if params["speed"] != 1.0:
            tts_audio = librosa.effects.time_stretch(tts_audio, rate=params["speed"])

        # Amplitude adjustment, then hard-limit to [-1, 1] so boosted samples
        # stay inside the valid float range for the WAV writer.
        tts_audio = tts_audio * params["amplitude_factor"]
        tts_audio = np.clip(tts_audio, -1.0, 1.0)

        # Step 5: Enhance audio quality with noise reduction
        tts_audio = reduce_noise(tts_audio, sr=tts_sr)

        # Save the output at the rate the audio was actually processed at.
        output_filename = "cloned_voice.wav"
        sf.write(output_filename, tts_audio, samplerate=tts_sr, format="WAV")
        print(f"Output saved as: {output_filename}")

        return transcription, input_emotion, text_input, desired_tone, output_filename

    except Exception as e:
        # Report the failure through the UI outputs instead of raising, so the
        # interface shows the message in the transcription box.
        print(f"Error: {str(e)}")
        return f"Error: {str(e)}", "N/A", text_input, desired_tone, None

# Gradio UI: three inputs (reference audio, text, tone) feed voice_cloning(),
# whose five return values map one-to-one onto the five output components.
demo = gr.Interface(
    title="AI Voice Cloning with Tone Customization",
    description="Upload an audio sample and enter text to synthesize a cloned voice with a selected tone. The system will also detect the tone of the input audio.",
    fn=voice_cloning,
    inputs=[
        # Passed to the callback as a path on disk
        gr.Audio(label="Upload Audio Sample", type="filepath"),
        gr.Textbox(label="Enter Text to Synthesize"),
        # Tone names are lowercased by the callback before lookup
        gr.Dropdown(
            label="Select Desired Tone",
            choices=["Joy", "Sadness", "Anger", "Fear", "Neutral"],
            value="Neutral",
        ),
    ],
    outputs=[
        gr.Textbox(label="Transcription of Input Audio"),
        gr.Textbox(label="Detected Tone of Input Audio"),
        gr.Textbox(label="Input Text"),
        gr.Textbox(label="Selected Tone"),
        gr.Audio(label="Cloned Voice Output"),
    ],
)

# Start the app with a public share link; debug=True keeps the cell attached to the server
demo.launch(debug=True, share=True)

Using device: cuda
Loading Whisper model: openai/whisper-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

Model loaded on cuda.


config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--your_tts


 99%|█████████▊| 419M/425M [00:05<00:00, 91.9MiB/s]

 > Model's license - CC BY-NC-ND 4.0
 > Check https://creativecommons.org/licenses/by-nc-nd/4.0/ for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | 

100%|██████████| 425M/425M [00:19<00:00, 91.9MiB/s]

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://c8fa401b40409d4db3.gradio.live


