In [1]:
!pip install --upgrade pip setuptools wheel




In [4]:
!pip install yt-dlp




In [None]:
!pip install --upgrade pip setuptools wheel
!pip install --upgrade "numba>=0.56" "llvmlite>=0.39"
!pip install git+https://github.com/deezer/spleeter


In [6]:
# ----- Upgrade pip and install dependencies -----
!pip install --upgrade pip setuptools wheel
!pip install -q yt-dlp  ffmpeg-python torch librosa soundfile
!apt-get -qq install ffmpeg

import os
import subprocess
import shutil
import torch
import torch.nn.functional as F
import librosa
import numpy as np
import soundfile as sf

# ----- Helper Functions -----
def download_youtube_audio(youtube_url, output_path="downloaded_song.mp3"):
    """
    Download the audio of a YouTube video using yt-dlp.
    Due to post-processing, the final file may have a double extension.
    This function checks for that and returns the correct file path.
    """
    print("[INFO] Downloading YouTube audio using yt-dlp...")
    from yt_dlp import YoutubeDL
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': output_path,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
    # Check for double extension: if output_path doesn't exist, try output_path+".mp3"
    final_path = output_path
    if not os.path.exists(final_path):
        alt_path = output_path + ".mp3"
        if os.path.exists(alt_path):
            final_path = alt_path
    print(f"[INFO] Download completed: {final_path}")
    return final_path

def separate_vocals(audio_file, output_dir="separated"):
    """
    Use Spleeter to separate vocals and instrumental.
    If the expected files (vocals.wav, accompaniment.wav) are not found,
    try a fallback method based on the full basename.
    """
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    print("[INFO] Separating vocals and instrumental with Spleeter...")

    # Use "python3" so that the correct interpreter (with spleeter installed) is used.
    command = f"python3 -m spleeter separate -p spleeter:2stems -o {output_dir} {audio_file}"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print("Spleeter stdout:")
        print(result.stdout)
        print("Spleeter stderr:")
        print(result.stderr)
        raise RuntimeError("Spleeter command failed with return code " + str(result.returncode))

    # Use the base name from os.path.splitext to determine the expected folder.
    expected_folder = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_file))[0])

    if not os.path.isdir(expected_folder):
        subdirs = [d for d in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, d))]
        print("Folders in output directory:", subdirs)
        raise RuntimeError("Expected output folder not found in Spleeter output.")

    vocals_path = os.path.join(expected_folder, "vocals.wav")
    instrumental_path = os.path.join(expected_folder, "accompaniment.wav")
    if not os.path.isfile(vocals_path):
        print("Files in expected folder:", os.listdir(expected_folder))
        raise RuntimeError("Vocals file not found. Spleeter separation failed.")
    if not os.path.isfile(instrumental_path):
        print("Files in expected folder:", os.listdir(expected_folder))
        raise RuntimeError("Instrumental file not found. Spleeter separation failed.")

    print(f"[INFO] Separation complete.\n  Vocals: {vocals_path}\n  Instrumental: {instrumental_path}")
    return vocals_path, instrumental_path

def load_autovc_model(autovc_checkpoint="/content/autovc.ckpt", device="cpu"):
    """
    Load a pre-trained AutoVC model from a checkpoint file.
    Adjust this function to match your AutoVC model's loading code.
    """
    if not os.path.exists(autovc_checkpoint):
        raise FileNotFoundError(f"AutoVC checkpoint not found: {autovc_checkpoint}")
    print(f"[INFO] Loading AutoVC model from {autovc_checkpoint} ...")
    checkpoint = torch.load(autovc_checkpoint, map_location=device)
    # Placeholder: adjust this to your model’s actual architecture and loading method.
    model = checkpoint
    print("[INFO] AutoVC model loaded (placeholder).")
    return model

def extract_speaker_embedding(user_voice_path, device="cpu"):
    """
    Extract speaker embedding from your voice sample.
    This is a naive placeholder; for production, use a proper speaker encoder.
    """
    print("[INFO] Extracting speaker embedding from your voice sample...")
    audio, sr = librosa.load(user_voice_path, sr=22050)
    mel = librosa.feature.melspectrogram(y=audio, sr=sr)
    # Naively compute the mean across time as a "fake" embedding.
    speaker_embedding = np.mean(mel, axis=1)
    speaker_embedding_tensor = torch.tensor(speaker_embedding, dtype=torch.float32).unsqueeze(0).to(device)
    print("[INFO] Speaker embedding extraction complete (placeholder).")
    return speaker_embedding_tensor

def wav_to_mel(audio, sr=22050, n_fft=1024, hop_length=256, n_mels=80):
    """
    Convert a waveform to a log-mel spectrogram.
    """
    spectrogram = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length)
    magnitudes, _ = librosa.magphase(spectrogram)
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel_spectrogram = np.dot(mel_basis, magnitudes)
    mel_spectrogram = np.maximum(1e-5, mel_spectrogram)
    log_mel = np.log10(mel_spectrogram)
    return log_mel

def griffin_lim(magnitudes, n_iter=60, n_fft=1024, hop_length=256):
    """
    Griffin-Lim algorithm to invert a spectrogram to waveform.
    """
    angles = np.exp(2j * np.pi * np.random.rand(*magnitudes.shape))
    for i in range(n_iter):
        complex_spec = magnitudes * angles
        signal = librosa.istft(complex_spec, hop_length=hop_length)
        reconstruction = librosa.stft(signal, n_fft=n_fft, hop_length=hop_length)
        angles = np.exp(1j * np.angle(reconstruction))
    return signal

def mel_to_audio(mel_spectrogram, sr=22050, n_fft=1024, hop_length=256):
    """
    Convert a log-mel spectrogram to audio using naive inversion and Griffin-Lim.
    """
    mel_spectrogram = np.power(10.0, mel_spectrogram)  # Invert the log scale.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=mel_spectrogram.shape[0])
    inv_mel_basis = np.linalg.pinv(mel_basis)
    linear_magnitude = np.dot(inv_mel_basis, mel_spectrogram)
    audio = griffin_lim(linear_magnitude, n_iter=60, n_fft=n_fft, hop_length=hop_length)
    return audio

def autovc_inference(model, source_vocals_path, speaker_embedding, device="cpu", output_path="converted_vocals.wav"):
    """
    Run AutoVC inference on the source vocals.
    Adjust the forward pass to match your AutoVC implementation.
    """
    print("[INFO] Running AutoVC inference on the source vocals...")
    source_audio, sr = librosa.load(source_vocals_path, sr=22050)
    source_mel = wav_to_mel(source_audio, sr=sr)
    source_mel_tensor = torch.tensor(source_mel, dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        if isinstance(model, dict) and 'convert' in model:
            converted_mel_tensor = model['convert'](source_mel_tensor, speaker_embedding)
        else:
            # Fallback: simply pass through the source mel spectrogram.
            converted_mel_tensor = source_mel_tensor.clone()
    converted_mel = converted_mel_tensor.squeeze().cpu().numpy()
    converted_audio = mel_to_audio(converted_mel, sr=sr)
    sf.write(output_path, converted_audio, sr)
    print(f"[INFO] AutoVC inference complete. Converted vocals saved to: {output_path}")
    return output_path

def merge_audio_files(vocals_path, instrumental_path, output_path="final_cover.mp3"):
    """
    Merge the converted vocals with the instrumental track using FFmpeg.
    """
    print("[INFO] Merging converted vocals with instrumental...")
    cmd = (
        f"ffmpeg -y -i {instrumental_path} -i {vocals_path} "
        f"-filter_complex '[0:a][1:a]amix=inputs=2:duration=longest' {output_path}"
    )
    subprocess.call(cmd, shell=True)
    print(f"[INFO] Final cover saved to: {output_path}")
    return output_path

# ----- Main Pipeline -----
def main():
    # Prompt for user inputs.
    youtube_url = input("Enter the YouTube URL for the song: ").strip()
    user_voice_path = input("Enter the path to your own voice sample (e.g., /content/my_voice.wav): ").strip()
    autovc_ckpt_path = "/content/autovc.ckpt"  # Ensure this file is in your Colab environment

    # Step 1: Download the song.
    downloaded_song = download_youtube_audio(youtube_url, "downloaded_song.mp3")

    # Step 2: Separate vocals and instrumental.
    vocals_path, instrumental_path = separate_vocals(downloaded_song)

    # Step 3a: Load the AutoVC model.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    autovc_model = load_autovc_model(autovc_ckpt_path, device=device)

    # Step 3b: Extract your speaker embedding.
    speaker_emb = extract_speaker_embedding(user_voice_path, device=device)

    # Step 3c: Run AutoVC inference on the source vocals.
    converted_vocals_path = autovc_inference(
        model=autovc_model,
        source_vocals_path=vocals_path,
        speaker_embedding=speaker_emb,
        device=device,
        output_path="converted_vocals.wav"
    )

    # Step 4: Merge the converted vocals with the instrumental.
    final_output = merge_audio_files(converted_vocals_path, instrumental_path, "final_cover.mp3")
    print(f"[INFO] Your AI-generated cover is ready: {final_output}")

if __name__ == "__main__":
    main()


Enter the YouTube URL for the song: https://www.youtube.com/watch?v=zRtPUIumXcY
Enter the path to your own voice sample (e.g., /content/my_voice.wav): /content/naat-wav.wav
[INFO] Downloading YouTube audio using yt-dlp...
[youtube] Extracting URL: https://www.youtube.com/watch?v=zRtPUIumXcY
[youtube] zRtPUIumXcY: Downloading webpage
[youtube] zRtPUIumXcY: Downloading tv client config
[youtube] zRtPUIumXcY: Downloading player f6e09c70
[youtube] zRtPUIumXcY: Downloading tv player API JSON
[youtube] zRtPUIumXcY: Downloading ios player API JSON
[youtube] zRtPUIumXcY: Downloading m3u8 information
[info] zRtPUIumXcY: Downloading 1 format(s): 251
[download] Destination: downloaded_song.mp3
[download] 100% of    2.72MiB in 00:00:00 at 9.73MiB/s   
[ExtractAudio] Destination: downloaded_song.mp3.mp3
Deleting original file downloaded_song.mp3 (pass -k to keep)
[INFO] Download completed: downloaded_song.mp3.mp3
[INFO] Separating vocals and instrumental with Spleeter...
[INFO] Separation complete.

  checkpoint = torch.load(autovc_checkpoint, map_location=device)


[INFO] AutoVC model loaded (placeholder).
[INFO] Extracting speaker embedding from your voice sample...
[INFO] Speaker embedding extraction complete (placeholder).
[INFO] Running AutoVC inference on the source vocals...
[INFO] AutoVC inference complete. Converted vocals saved to: converted_vocals.wav
[INFO] Merging converted vocals with instrumental...
[INFO] Final cover saved to: final_cover.mp3
[INFO] Your AI-generated cover is ready: final_cover.mp3
